diff --git a/README.md b/README.md
index 44c8a4cb..91f9f67e 100644
--- a/README.md
+++ b/README.md
@@ -12,10 +12,16 @@
 The canonical git server for this project is [https://git.marginalia.nu](https://git.marginalia.nu).
 It is fine to mirror it on other hosts, but if you have issues or questions
 git.marginalia.nu is where you want to go.
 
-As it stands now, the project is still being set up and is a bit of a mess as
-it wasn't developed with the intention of going open source, a lot of tests
-and so on make assumptions about the directory structure, much configuration
-is hard coded and so on. Please stand by. A lot of the mess is fairly superficial.
+## Important note about wmsa.local
+
+This project has a [sister repository called wmsa.local](https://git.marginalia.nu/marginalia/wmsa.local)
+that contains scripts and configuration files for running and developing the code.
+
+Without it, development is very unpleasant.
+
+While developing the code, you will want an environment variable WMSA_HOME pointing to
+the directory in which wmsa.local is checked out; otherwise the code will not run and
+several tests will fail.
 
 ## Documentation
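A minimal illustration of the failure mode the README warns about -- this class is invented, not part of the repository; it merely shows how code depending on WMSA_HOME can fail fast with a clear message instead of dying later in scattered tests:

    // Illustrative sketch only, not actual wmsa code.
    import java.nio.file.Path;

    public class WmsaHome {
        public static Path get() {
            String home = System.getenv("WMSA_HOME");
            if (home == null || home.isBlank()) {
                throw new IllegalStateException(
                        "WMSA_HOME is not set; point it at your wmsa.local checkout");
            }
            return Path.of(home);
        }
    }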
diff --git a/build.gradle b/build.gradle
index c13b58da..ffe47e69 100644
--- a/build.gradle
+++ b/build.gradle
@@ -56,19 +56,7 @@ test {
     forkEvery = 1
     maxHeapSize = "8G"
     useJUnitPlatform {
-        excludeTags "db"
         excludeTags "nobuild"
     }
 }
-
-task dbTest(type: Test) {
-    maxParallelForks = 1
-    forkEvery = 1
-    maxHeapSize = "8G"
-
-    useJUnitPlatform {
-        includeTags "db"
-    }
-}
-
-
diff --git a/gradle/wrapper/gradle-wrapper.properties b/gradle/wrapper/gradle-wrapper.properties
index 41dfb879..8049c684 100644
--- a/gradle/wrapper/gradle-wrapper.properties
+++ b/gradle/wrapper/gradle-wrapper.properties
@@ -1,5 +1,5 @@
 distributionBase=GRADLE_USER_HOME
 distributionPath=wrapper/dists
-distributionUrl=https\://services.gradle.org/distributions/gradle-7.4-bin.zip
+distributionUrl=https\://services.gradle.org/distributions/gradle-7.5-bin.zip
 zipStoreBase=GRADLE_USER_HOME
 zipStorePath=wrapper/dists
diff --git a/gradlew.bat b/gradlew.bat
index 107acd32..ac1b06f9 100644
--- a/gradlew.bat
+++ b/gradlew.bat
@@ -1,89 +1,89 @@
-@rem
-@rem Copyright 2015 the original author or authors.
-@rem
-@rem Licensed under the Apache License, Version 2.0 (the "License");
-@rem you may not use this file except in compliance with the License.
-@rem You may obtain a copy of the License at
-@rem
-@rem      https://www.apache.org/licenses/LICENSE-2.0
-@rem
-@rem Unless required by applicable law or agreed to in writing, software
-@rem distributed under the License is distributed on an "AS IS" BASIS,
-@rem WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-@rem See the License for the specific language governing permissions and
-@rem limitations under the License.
-@rem
-
-@if "%DEBUG%" == "" @echo off
-@rem ##########################################################################
-@rem
-@rem  Gradle startup script for Windows
-@rem
-@rem ##########################################################################
-
-@rem Set local scope for the variables with windows NT shell
-if "%OS%"=="Windows_NT" setlocal
-
-set DIRNAME=%~dp0
-if "%DIRNAME%" == "" set DIRNAME=.
-set APP_BASE_NAME=%~n0
-set APP_HOME=%DIRNAME%
-
-@rem Resolve any "." and ".." in APP_HOME to make it shorter.
-for %%i in ("%APP_HOME%") do set APP_HOME=%%~fi
-
-@rem Add default JVM options here. You can also use JAVA_OPTS and GRADLE_OPTS to pass JVM options to this script.
-set DEFAULT_JVM_OPTS="-Xmx64m" "-Xms64m"
-
-@rem Find java.exe
-if defined JAVA_HOME goto findJavaFromJavaHome
-
-set JAVA_EXE=java.exe
-%JAVA_EXE% -version >NUL 2>&1
-if "%ERRORLEVEL%" == "0" goto execute
-
-echo.
-echo ERROR: JAVA_HOME is not set and no 'java' command could be found in your PATH.
-echo.
-echo Please set the JAVA_HOME variable in your environment to match the
-echo location of your Java installation.
-
-goto fail
-
-:findJavaFromJavaHome
-set JAVA_HOME=%JAVA_HOME:"=%
-set JAVA_EXE=%JAVA_HOME%/bin/java.exe
-
-if exist "%JAVA_EXE%" goto execute
-
-echo.
-echo ERROR: JAVA_HOME is set to an invalid directory: %JAVA_HOME%
-echo.
-echo Please set the JAVA_HOME variable in your environment to match the
-echo location of your Java installation.
-
-goto fail
-
-:execute
-@rem Setup the command line
-
-set CLASSPATH=%APP_HOME%\gradle\wrapper\gradle-wrapper.jar
-
-
-@rem Execute Gradle
-"%JAVA_EXE%" %DEFAULT_JVM_OPTS% %JAVA_OPTS% %GRADLE_OPTS% "-Dorg.gradle.appname=%APP_BASE_NAME%" -classpath "%CLASSPATH%" org.gradle.wrapper.GradleWrapperMain %*
-
-:end
-@rem End local scope for the variables with windows NT shell
-if "%ERRORLEVEL%"=="0" goto mainEnd
-
-:fail
-rem Set variable GRADLE_EXIT_CONSOLE if you need the _script_ return code instead of
-rem the _cmd.exe /c_ return code!
-if not "" == "%GRADLE_EXIT_CONSOLE%" exit 1
-exit /b 1
-
-:mainEnd
-if "%OS%"=="Windows_NT" endlocal
-
-:omega
+@rem
+@rem Copyright 2015 the original author or authors.
+@rem
+@rem Licensed under the Apache License, Version 2.0 (the "License");
+@rem you may not use this file except in compliance with the License.
+@rem You may obtain a copy of the License at
+@rem
+@rem      https://www.apache.org/licenses/LICENSE-2.0
+@rem
+@rem Unless required by applicable law or agreed to in writing, software
+@rem distributed under the License is distributed on an "AS IS" BASIS,
+@rem WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+@rem See the License for the specific language governing permissions and
+@rem limitations under the License.
+@rem
+
+@if "%DEBUG%" == "" @echo off
+@rem ##########################################################################
+@rem
+@rem  Gradle startup script for Windows
+@rem
+@rem ##########################################################################
+
+@rem Set local scope for the variables with windows NT shell
+if "%OS%"=="Windows_NT" setlocal
+
+set DIRNAME=%~dp0
+if "%DIRNAME%" == "" set DIRNAME=.
+set APP_BASE_NAME=%~n0
+set APP_HOME=%DIRNAME%
+
+@rem Resolve any "." and ".." in APP_HOME to make it shorter.
+for %%i in ("%APP_HOME%") do set APP_HOME=%%~fi
+
+@rem Add default JVM options here. You can also use JAVA_OPTS and GRADLE_OPTS to pass JVM options to this script.
+set DEFAULT_JVM_OPTS="-Xmx64m" "-Xms64m"
+
+@rem Find java.exe
+if defined JAVA_HOME goto findJavaFromJavaHome
+
+set JAVA_EXE=java.exe
+%JAVA_EXE% -version >NUL 2>&1
+if "%ERRORLEVEL%" == "0" goto execute
+
+echo.
+echo ERROR: JAVA_HOME is not set and no 'java' command could be found in your PATH.
+echo.
+echo Please set the JAVA_HOME variable in your environment to match the
+echo location of your Java installation.
+
+goto fail
+
+:findJavaFromJavaHome
+set JAVA_HOME=%JAVA_HOME:"=%
+set JAVA_EXE=%JAVA_HOME%/bin/java.exe
+
+if exist "%JAVA_EXE%" goto execute
+
+echo.
+echo ERROR: JAVA_HOME is set to an invalid directory: %JAVA_HOME%
+echo.
+echo Please set the JAVA_HOME variable in your environment to match the
+echo location of your Java installation.
+
+goto fail
+
+:execute
+@rem Setup the command line
+
+set CLASSPATH=%APP_HOME%\gradle\wrapper\gradle-wrapper.jar
+
+
+@rem Execute Gradle
+"%JAVA_EXE%" %DEFAULT_JVM_OPTS% %JAVA_OPTS% %GRADLE_OPTS% "-Dorg.gradle.appname=%APP_BASE_NAME%" -classpath "%CLASSPATH%" org.gradle.wrapper.GradleWrapperMain %*
+
+:end
+@rem End local scope for the variables with windows NT shell
+if "%ERRORLEVEL%"=="0" goto mainEnd
+
+:fail
+rem Set variable GRADLE_EXIT_CONSOLE if you need the _script_ return code instead of
+rem the _cmd.exe /c_ return code!
+if not "" == "%GRADLE_EXIT_CONSOLE%" exit 1
+exit /b 1
+
+:mainEnd
+if "%OS%"=="Windows_NT" endlocal
+
+:omega
diff --git a/marginalia_nu/build.gradle b/marginalia_nu/build.gradle
index 7e0884db..e1c96de0 100644
--- a/marginalia_nu/build.gradle
+++ b/marginalia_nu/build.gradle
@@ -4,6 +4,8 @@ plugins {
 
     id "me.champeau.jmh" version "0.6.6"
     id "de.undercouch.download" version "5.1.0"
+
+    id 'jvm-test-suite'
 }
 
 repositories {
@@ -63,22 +65,20 @@ dependencies {
     implementation 'org.projectlombok:lombok:1.18.24'
     annotationProcessor 'org.projectlombok:lombok:1.18.24'
 
-    implementation 'com.github.jknack:handlebars:4.3.0'
+    implementation 'com.github.jknack:handlebars:4.3.1'
     implementation 'com.github.jknack:handlebars-markdown:4.2.1'
 
     implementation group: 'com.google.code.gson', name: 'gson', version: '2.9.0'
-    implementation 'io.reactivex.rxjava3:rxjava:3.1.4'
+    implementation 'io.reactivex.rxjava3:rxjava:3.1.5'
     implementation "com.sparkjava:spark-core:2.9.3"
     implementation 'com.opencsv:opencsv:5.6'
 
-    implementation group: 'org.apache.logging.log4j', name: 'log4j-api', version: '2.17.2'
-    implementation group: 'org.apache.logging.log4j', name: 'log4j-core', version: '2.17.2'
-    implementation group: 'org.apache.logging.log4j', name: 'log4j-slf4j-impl', version: '2.17.2'
     implementation group: 'org.apache.logging.log4j', name: 'log4j-api', version: '2.17.2'
     implementation group: 'org.apache.logging.log4j', name: 'log4j-core', version: '2.17.2'
     implementation group: 'org.apache.logging.log4j', name: 'log4j-slf4j-impl', version: '2.17.2'
     implementation 'org.slf4j:slf4j-api:1.7.36'
+    testImplementation 'org.slf4j:slf4j-jdk14:2.0.3'
 
     implementation 'com.google.guava:guava:31.1-jre'
     implementation 'com.google.inject:guice:5.1.0'
@@ -89,19 +89,19 @@ dependencies {
 
     implementation group: 'com.h2database', name: 'h2', version: '2.1.210'
 
-    implementation 'org.jsoup:jsoup:1.14.3'
+    implementation 'org.jsoup:jsoup:1.15.3'
     implementation group: 'com.github.crawler-commons', name: 'crawler-commons', version: '1.2'
 
-    implementation 'org.mariadb.jdbc:mariadb-java-client:3.0.4'
+    implementation 'org.mariadb.jdbc:mariadb-java-client:3.0.6'
     implementation group: 'net.sf.trove4j', name: 'trove4j', version: '3.0.3'
     implementation 'com.zaxxer:HikariCP:5.0.1'
 
     implementation 'org.apache.opennlp:opennlp-tools:1.9.4'
-    implementation 'io.prometheus:simpleclient:0.15.0'
-    implementation 'io.prometheus:simpleclient_servlet:0.15.0'
-    implementation 'io.prometheus:simpleclient_httpserver:0.15.0'
-    implementation 'io.prometheus:simpleclient_hotspot:0.15.0'
+    implementation 'io.prometheus:simpleclient:0.16.0'
+    implementation 'io.prometheus:simpleclient_servlet:0.16.0'
+    implementation 'io.prometheus:simpleclient_httpserver:0.16.0'
+    implementation 'io.prometheus:simpleclient_hotspot:0.16.0'
     implementation 'com.fasterxml.jackson.core:jackson-databind:2.13.3'
 
     implementation group: 'org.yaml', name: 'snakeyaml', version: '1.30'
@@ -114,7 +114,7 @@ dependencies {
     implementation 'org.imgscalr:imgscalr-lib:4.2'
     implementation 'org.jclarion:image4j:0.7'
 
-    implementation 'commons-net:commons-net:3.6'
+    implementation 'commons-net:commons-net:3.8.0'
     implementation 'org.eclipse.jgit:org.eclipse.jgit:5.12.0.202106070339-r'
     implementation 'org.eclipse.jgit:org.eclipse.jgit.ssh.jsch:5.12.0.202106070339-r'
     implementation 'com.jcraft:jsch:0.1.55'
@@ -123,12 +123,14 @@ dependencies {
     implementation 'edu.stanford.nlp:stanford-corenlp:4.4.0'
 
     implementation group: 'it.unimi.dsi', name: 'fastutil', version: '8.5.8'
-    implementation 'org.roaringbitmap:RoaringBitmap:0.9.27'
+    implementation 'org.roaringbitmap:RoaringBitmap:0.9.32'
+
     implementation group: 'mysql', name: 'mysql-connector-java', version: '8.0.29'
 
     implementation 'com.github.Marcono1234:gson-record-type-adapter-factory:0.2.0'
 
     testImplementation 'org.junit.jupiter:junit-jupiter-api:5.8.2'
+    testImplementation 'org.mockito:mockito-junit-jupiter:4.5.1'
     testRuntimeOnly 'org.junit.jupiter:junit-jupiter-engine'
     testCompileOnly 'org.projectlombok:lombok:1.18.24'
     testImplementation 'org.projectlombok:lombok:1.18.24'
@@ -136,23 +138,23 @@ dependencies {
 
     testImplementation group: 'org.mockito', name: 'mockito-core', version: '4.5.1'
 
-    testImplementation platform('org.testcontainers:testcontainers-bom:1.17.2')
-    testImplementation 'org.testcontainers:mariadb:1.17.2'
-    testImplementation "org.testcontainers:junit-jupiter:1.17.2"
+    testImplementation platform('org.testcontainers:testcontainers-bom:1.17.4')
+    testImplementation 'org.testcontainers:mariadb:1.17.4'
+    testImplementation 'org.testcontainers:junit-jupiter:1.17.4'
 
-    e2eTestImplementation 'org.junit.jupiter:junit-jupiter-api:5.8.2'
+    e2eTestImplementation 'org.junit.jupiter:junit-jupiter-api:5.9.0'
     e2eTestRuntimeOnly 'org.junit.jupiter:junit-jupiter-engine'
     e2eTestImplementation 'org.projectlombok:lombok:1.18.24'
     e2eTestAnnotationProcessor 'org.projectlombok:lombok:1.18.24'
-    e2eTestImplementation 'org.testcontainers:nginx:1.17.3'
+    e2eTestImplementation 'org.testcontainers:nginx:1.17.4'
     e2eTestImplementation "org.testcontainers:junit-jupiter:1.17.2"
-    e2eTestImplementation 'org.testcontainers:selenium:1.17.3'
-    e2eTestImplementation 'org.seleniumhq.selenium:selenium-remote-driver:4.2.1'
-    e2eTestImplementation 'org.seleniumhq.selenium:selenium-chrome-driver:4.2.1'
+    e2eTestImplementation 'org.testcontainers:selenium:1.17.4'
+    e2eTestImplementation 'org.seleniumhq.selenium:selenium-remote-driver:4.5.3'
+    e2eTestImplementation 'org.seleniumhq.selenium:selenium-chrome-driver:4.5.3'
 
-    implementation 'org.seleniumhq.selenium:selenium-chrome-driver:4.1.4'
-    implementation 'org.seleniumhq.selenium:selenium-java:4.3.0'
+    implementation 'org.seleniumhq.selenium:selenium-chrome-driver:4.5.3'
+    implementation 'org.seleniumhq.selenium:selenium-java:4.5.3'
     implementation 'org.sejda.imageio:webp-imageio:0.1.6'
 
     jmh 'org.openjdk.jmh:jmh-core:1.35'
@@ -167,23 +169,17 @@ configurations {
 }
 
-
 test {
-    maxParallelForks = 16
-    forkEvery = 1
+    maxParallelForks = Runtime.runtime.availableProcessors().intdiv(2) ?: 1
     maxHeapSize = "8G"
-    useJUnitPlatform {
-        excludeTags "db"
-    }
+    useJUnitPlatform()
 }
 
-task dbTest(type: Test) {
-    maxParallelForks = 1
-    forkEvery = 1
+task fastTests(type: Test) {
+    maxParallelForks = Runtime.runtime.availableProcessors().intdiv(2) ?: 1
     maxHeapSize = "8G"
 
-    useJUnitPlatform {
-        includeTags "db"
+    useJUnitPlatform {
+        excludeTags "slow"
     }
 }
 
@@ -243,9 +239,9 @@ task IP2LocationFile(type: Copy) {
     into outputDir
 }
 
-task downloadTermFreqData(type: Copy) {
-    // TODO: Need hosting for this file
-    from '/var/lib/wmsa/model/tfreq-new-algo3.bin'
-    into 'data/models/'
+task downloadTermFreqData(type: Download) {
+    src 'https://downloads.marginalia.nu/model/tfreq-new-algo3.bin'
+    dest file('data/models/tfreq-new-algo3.bin')
+    overwrite false
 }
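The new fastTests task selects tests by JUnit 5 tag rather than by a separate dbTest task. A test opts out of the fast suite by tagging itself; a minimal sketch of what such a test could look like (the test class is invented, only the "slow" tag name comes from the build script):

    import org.junit.jupiter.api.Tag;
    import org.junit.jupiter.api.Test;

    class CrawlRoundTripTest {
        @Tag("slow")   // skipped by fastTests, which runs useJUnitPlatform { excludeTags "slow" }
        @Test
        void fullCrawlRoundTrip() {
            // long-running test body
        }
    }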
diff --git a/marginalia_nu/src/e2e/resources/init.sh b/marginalia_nu/src/e2e/resources/init.sh
index 784b6bd4..2b3bfa9d 100644
--- a/marginalia_nu/src/e2e/resources/init.sh
+++ b/marginalia_nu/src/e2e/resources/init.sh
@@ -70,4 +70,4 @@ dating dating
 EOF
 
 echo "*** Starting $1"
-WMSA_HOME=${HOME} java -server -Xmx2G -Dsmall-ram=TRUE -Dservice-host=0.0.0.0 -jar /WMSA.jar start $1
\ No newline at end of file
+WMSA_HOME=${HOME} java -server -ea -Xmx2G -Dsmall-ram=TRUE -Dservice-host=0.0.0.0 -jar /WMSA.jar start $1
\ No newline at end of file
diff --git a/marginalia_nu/src/main/java/nu/marginalia/util/AndCardIntSet.java b/marginalia_nu/src/main/java/nu/marginalia/util/AndCardIntSet.java
index ccc3f1c6..08caa671 100644
--- a/marginalia_nu/src/main/java/nu/marginalia/util/AndCardIntSet.java
+++ b/marginalia_nu/src/main/java/nu/marginalia/util/AndCardIntSet.java
@@ -27,6 +27,7 @@ public class AndCardIntSet {
 
     public static AndCardIntSet of(RoaringBitmap bmap) {
         TIntArrayList lst = new TIntArrayList(bmap.getCardinality());
+        lst.addAll(bmap.toArray());
 
         return new AndCardIntSet(lst);
 
@@ -37,7 +38,7 @@ public class AndCardIntSet {
         backingList = list;
         hash = 0;
 
-        if (list.size() < 128) {
+        if (list.size() < 32) {
             for (int v : list.toArray()) {
                 int bit = hasher.hashInt(v).asInt() % 64;
                 hash |= (1L << bit);
@@ -56,7 +57,7 @@ public class AndCardIntSet {
             return false;
         }
 
-        if (backingList.size() < 128) {
+        if (backingList.size() < 32) {
             int bit = hasher.hashInt(val).asInt() % 64;
             hash |= (1L << bit);
         }
@@ -81,10 +82,10 @@ public class AndCardIntSet {
         if (!testHash(a,b)) {
             return 0;
         }
-
-        if (a.getCardinality() + b.getCardinality() < 10) {
-            return andLinearSmall(a, b);
-        }
+//
+//        if (a.getCardinality() + b.getCardinality() < 10) {
+//            return andLinearSmall(a, b);
+//        }
 
         return andLinear(a,b);
     }
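The hash field these hunks tune acts as a one-word Bloom filter: each element sets one of 64 bits, and testHash can then prove two sets disjoint when their fingerprints share no bits, which appears to be why andCardinality may return 0 without scanning. A self-contained sketch of the idea (names are invented; the real class uses a Guava hasher and only maintains the fingerprint below the size threshold):

    // One-long Bloom filter: a false "maybe" is possible, a false "no" is not.
    class TinyBloomFingerprint {
        private long bits;

        void add(int value) {
            bits |= 1L << (value & 63);    // map the element to one of 64 bits
        }

        static boolean mayIntersect(TinyBloomFingerprint a, TinyBloomFingerprint b) {
            return (a.bits & b.bits) != 0; // no shared bits => provably disjoint
        }
    }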
diff --git a/marginalia_nu/src/main/java/nu/marginalia/util/ByteFolder.java b/marginalia_nu/src/main/java/nu/marginalia/util/ByteFolder.java
deleted file mode 100644
index 0406e06c..00000000
--- a/marginalia_nu/src/main/java/nu/marginalia/util/ByteFolder.java
+++ /dev/null
@@ -1,80 +0,0 @@
-package nu.marginalia.util;
-
-public class ByteFolder {
-
-    public byte[] foldBytes(int p, int q) {
-
-        int pw = bitWidth(p);
-        int qw = bitWidth(q);
-        int qpw = qw + pw;
-
-        long qp = Integer.toUnsignedLong(q) << pw | Integer.toUnsignedLong(p);
-
-        int qpwBytes = ((qpw - 1) / Byte.SIZE) + 1;
-
-        byte[] bytes = new byte[qpwBytes + 1];
-        bytes[0] = (byte) pw;
-        for (int i = 1; i < bytes.length; i++) {
-            bytes[i] = (byte) (qp >>> (qpwBytes - i) * Byte.SIZE & 0xff);
-        }
-
-        return bytes;
-    }
-
-    // Function such that (decodeBytes o foldBytes) = identity
-    public static int[] decodeBytes(byte[] data) {
-        int[] dest = new int[2];
-        decodeBytes(data, data.length, dest);
-        return dest;
-    }
-
-    public static void decodeBytes(byte[] data, int length, int[] dest) {
-        long val = 0;
-
-        for (int i = 1; i < length; i++) {
-            val = (val << 8) | ((0xFF)&data[i]);
-        }
-
-        dest[1] = (int)(val >>> data[0]);
-        dest[0] = (int)(val & ~(dest[1]<<data[0]));
-    }
-
-    private static int bitWidth(int q) {
-        return 32 - Integer.numberOfLeadingZeros(q);
-    }
-
-    public static String byteBits(byte[] b) {
-        StringBuilder s = new StringBuilder();
-        for (int j = 0; j < b.length; j++) {
-            for (int i = 7; i >= 0; i--) {
-                s.append((b[j] & (1L << i)) > 0 ? 1 : 0);
-            }
-        }
-        return s.toString();
-    }
-    public static String intBits(int v) {
-        StringBuilder s = new StringBuilder();
-        for (int i = 32; i >=0; i--) {
-            s.append((v & (1L << i)) > 0 ? 1 : 0);
-        }
-        return s.toString();
-    }
-    public static String longBits(long v) {
-        StringBuilder s = new StringBuilder();
-        for (int i = 64; i >=0; i--) {
-            s.append((v & (1L << i)) > 0 ? 1 : 0);
-        }
-        return s.toString();
-    }
-
-
-}
diff --git a/marginalia_nu/src/main/java/nu/marginalia/util/RandomWriteFunnel.java b/marginalia_nu/src/main/java/nu/marginalia/util/RandomWriteFunnel.java
index 92190c49..a52f9e63 100644
--- a/marginalia_nu/src/main/java/nu/marginalia/util/RandomWriteFunnel.java
+++ b/marginalia_nu/src/main/java/nu/marginalia/util/RandomWriteFunnel.java
@@ -6,37 +6,32 @@ import org.slf4j.LoggerFactory;
 
 import java.io.File;
 import java.io.IOException;
-import java.io.RandomAccessFile;
 import java.nio.ByteBuffer;
+import java.nio.channels.ByteChannel;
 import java.nio.channels.FileChannel;
 import java.nio.file.Files;
 import java.nio.file.Path;
+import java.nio.file.StandardOpenOption;
+import java.util.ArrayList;
 
-/** For managing random writes on SSDs
- *
- * See https://en.wikipedia.org/wiki/Write_amplification
+/** For managing random writes on SSDs.
+ * SSDs do not deal well with small random writes
+ * (see https://en.wikipedia.org/wiki/Write_amplification),
+ * so it is beneficial to first pigeonhole the writes into bins
+ * covering the same general region, and write each bin out sequentially.
  *
  */
 public class RandomWriteFunnel implements AutoCloseable {
     private static final Logger logger = LoggerFactory.getLogger(RandomWriteFunnel.class);
-    private final DataBin[] bins;
-
+    private final ArrayList<DataBin> bins;
+    private final Path tempDir;
     private final int binSize;
 
-    public RandomWriteFunnel(Path tempDir, long size, int binSize) throws IOException {
+    public RandomWriteFunnel(Path tempDir, int binSize) throws IOException {
         this.binSize = binSize;
+        this.tempDir = tempDir;
 
-        if (size > 0) {
-            int binCount = (int) (size / binSize + ((size % binSize) != 0L ? 1 : 0));
-            bins = new DataBin[binCount];
-            for (int i = 0; i < binCount; i++) {
-                bins[i] = new DataBin(tempDir, (int) Math.min((size - (long)binSize * i), binSize));
-            }
-        }
-        else {
-            bins = new DataBin[0];
-        }
+        bins = new ArrayList<>();
     }
 
     @SneakyThrows
@@ -44,10 +39,21 @@ public class RandomWriteFunnel implements AutoCloseable {
         int bin = (int)(address / binSize);
         int offset = (int)(address%binSize);
 
-        bins[bin].put(offset, data);
+        if (bin >= bins.size()) {
+            grow(bin);
+        }
+
+        bins.get(bin).put(offset, data);
     }
 
-    public void write(FileChannel o) throws IOException {
+    @SneakyThrows
+    private void grow(int bin) {
+        while (bins.size() <= bin) {
+            bins.add(new DataBin(tempDir, binSize));
+        }
+    }
+
+    public void write(ByteChannel o) throws IOException {
         ByteBuffer buffer = ByteBuffer.allocateDirect(binSize*8);
 
         for (var bin : bins) {
@@ -67,7 +73,7 @@ public class RandomWriteFunnel implements AutoCloseable {
         }
     }
 
-    static class DataBin implements AutoCloseable {
+    static class DataBin {
         private final ByteBuffer buffer;
         private final int size;
         private final FileChannel channel;
@@ -77,7 +83,7 @@ public class RandomWriteFunnel implements AutoCloseable {
             buffer = ByteBuffer.allocateDirect(360_000);
             this.size = size;
             file = Files.createTempFile(tempDir, "scatter-writer", ".dat").toFile();
-            channel = new RandomAccessFile(file, "rw").getChannel();
+            channel = (FileChannel) Files.newByteChannel(file.toPath(), StandardOpenOption.WRITE, StandardOpenOption.READ);
         }
 
         void put(int address, long data) throws IOException {
@@ -133,7 +139,6 @@ public class RandomWriteFunnel implements AutoCloseable {
             }
         }
 
-        @Override
         public void close() throws IOException {
            channel.close();
            file.delete();
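With this refactoring the funnel no longer needs the total size up front; it grows its bin list on demand. A sketch of the intended write pattern, assuming a put(long address, long value) method as suggested by the hunk above (paths and sizes are illustrative):

    Path tmpDir = Files.createTempDirectory("funnel");

    try (var funnel = new RandomWriteFunnel(tmpDir, 1_000_000);
         var out = (FileChannel) Files.newByteChannel(tmpDir.resolve("index.dat"),
                 StandardOpenOption.CREATE, StandardOpenOption.WRITE)) {

        funnel.put(999_999_999L, 1L);   // random addresses land in per-bin temp files
        funnel.put(3L, 2L);

        funnel.write(out);              // each bin is then flushed out sequentially
    }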
diff --git a/marginalia_nu/src/main/java/nu/marginalia/util/StringPool.java b/marginalia_nu/src/main/java/nu/marginalia/util/StringPool.java
new file mode 100644
index 00000000..5911a497
--- /dev/null
+++ b/marginalia_nu/src/main/java/nu/marginalia/util/StringPool.java
@@ -0,0 +1,28 @@
+package nu.marginalia.util;
+
+import java.util.HashMap;
+
+public class StringPool {
+    private final HashMap<String, String> words;
+
+    public StringPool() {
+        this.words = new HashMap<>(1000);
+    }
+
+    public StringPool(int capacity) {
+        words = new HashMap<>(capacity);
+    }
+
+    public String internalize(String str) {
+        final String ret = words.putIfAbsent(str, str);
+
+        if (null == ret)
+            return str;
+
+        return ret;
+    }
+
+    public void flush() {
+        words.clear();
+    }
+}
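StringPool is a small manual alternative to String.intern() with an explicitly bounded lifetime. Typical use (values illustrative):

    StringPool pool = new StringPool(10_000);

    String a = pool.internalize(new String("example.com"));
    String b = pool.internalize(new String("example.com"));
    assert a == b;    // duplicate strings collapse to one shared instance

    pool.flush();     // release the pooled references once the batch is done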
diff --git a/marginalia_nu/src/main/java/nu/marginalia/util/TransformList.java b/marginalia_nu/src/main/java/nu/marginalia/util/TransformList.java
new file mode 100644
index 00000000..f8aac39e
--- /dev/null
+++ b/marginalia_nu/src/main/java/nu/marginalia/util/TransformList.java
@@ -0,0 +1,111 @@
+package nu.marginalia.util;
+
+import java.util.List;
+import java.util.function.BiConsumer;
+import java.util.function.Consumer;
+import java.util.function.Predicate;
+
+public class TransformList<T> {
+    private final List<T> backingList;
+
+    public TransformList(List<T> backingList) {
+        this.backingList = backingList;
+    }
+
+    public void transformEach(Consumer<Entity> consumer) {
+        for (var iter = backingList.listIterator(); iter.hasNext(); ) {
+            var entity = new Entity(iter.next());
+            consumer.accept(entity);
+            if (entity.action == Action.REPLACE) {
+                iter.set(entity.value);
+            }
+            else if (entity.action == Action.REMOVE) {
+                iter.remove();
+            }
+        }
+    }
+
+    public void transformEachPair(BiConsumer<Entity, Entity> consumer) {
+        for (var iter = backingList.listIterator(); iter.hasNext(); ) {
+            var firstEntity = new Entity(iter.next());
+            if (!iter.hasNext()) break;
+            var secondEntry = new Entity(backingList.get(iter.nextIndex()));
+
+            consumer.accept(firstEntity, secondEntry);
+            if (firstEntity.action == Action.REPLACE) {
+                iter.set(firstEntity.value);
+
+                if (secondEntry.action == Action.REPLACE) {
+                    backingList.set(iter.nextIndex(), secondEntry.value);
+                }
+                else if (secondEntry.action == Action.REMOVE) {
+                    iter.next();
+                    iter.remove();
+                }
+            }
+            else if (firstEntity.action == Action.REMOVE) {
+                if (secondEntry.action == Action.REPLACE) {
+                    backingList.set(iter.nextIndex(), secondEntry.value);
+                }
+
+                iter.remove();
+
+                if (secondEntry.action == Action.REMOVE) {
+                    iter.next();
+                    iter.remove();
+                }
+            }
+
+        }
+    }
+
+    public void scan(Predicate<T> start, Predicate<T> end, Consumer<TransformList<T>> inbetween) {
+        for (int i = 0; i < backingList.size(); i++) {
+            if (start.test(backingList.get(i))) {
+                for (int j = i + 1; j < backingList.size(); j++) {
+                    if (end.test(backingList.get(j))) {
+                        inbetween.accept(new TransformList<>(backingList.subList(i, j+1)));
+                        break;
+                    }
+                }
+            }
+        }
+    }
+
+    public void scanAndTransform(Predicate<T> start, Predicate<T> end, Consumer<Entity> inbetweenConsumer) {
+        scan(start, end, range -> range.transformEach(inbetweenConsumer));
+    }
+
+    public int size() {
+        return backingList.size();
+    }
+
+    public List<T> getBackingList() {
+        return backingList;
+    }
+
+
+    public class Entity {
+        public T value;
+        private Action action;
+
+        Entity(T value) {
+            this.value = value;
+        }
+
+        public void replace(T newValue) {
+            action = Action.REPLACE;
+            value = newValue;
+        }
+
+        public void remove() {
+            action = Action.REMOVE;
+        }
+    }
+
+    enum Action {
+        NO_OP,
+        REPLACE,
+        REMOVE
+    }
+}
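TransformList applies replace/remove decisions to the backing list while it is being walked, and scan/scanAndTransform restrict that to delimited ranges (delimiters included, since the sublist runs through j+1). A usage sketch with invented String data:

    List<String> tokens = new ArrayList<>(List.of("<b>", "x", "</b>", "y"));
    TransformList<String> list = new TransformList<>(tokens);

    // Remove elements in place during a single pass
    list.transformEach(e -> {
        if (e.value.equals("y")) e.remove();
    });

    // Uppercase everything from "<b>" through "</b>", inclusive
    list.scanAndTransform("<b>"::equals, "</b>"::equals,
            e -> e.replace(e.value.toUpperCase()));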
diff --git a/marginalia_nu/src/main/java/nu/marginalia/util/array/IntArray.java b/marginalia_nu/src/main/java/nu/marginalia/util/array/IntArray.java
new file mode 100644
index 00000000..f86d536f
--- /dev/null
+++ b/marginalia_nu/src/main/java/nu/marginalia/util/array/IntArray.java
@@ -0,0 +1,64 @@
+package nu.marginalia.util.array;
+
+import com.upserve.uppend.blobs.NativeIO;
+import nu.marginalia.util.array.algo.IntArrayBase;
+import nu.marginalia.util.array.algo.IntArraySearch;
+import nu.marginalia.util.array.algo.IntArraySort;
+import nu.marginalia.util.array.algo.IntArrayTransformations;
+import nu.marginalia.util.array.delegate.ShiftedIntArray;
+import nu.marginalia.util.array.page.IntArrayPage;
+import nu.marginalia.util.array.page.PagingIntArray;
+import nu.marginalia.util.array.scheme.ArrayPartitioningScheme;
+
+import java.io.IOException;
+import java.nio.file.Files;
+import java.nio.file.Path;
+
+public interface IntArray extends IntArrayBase, IntArrayTransformations, IntArraySearch, IntArraySort {
+    int WORD_SIZE = 4;
+
+    ArrayPartitioningScheme DEFAULT_PARTITIONING_SCHEME
+            = ArrayPartitioningScheme.forPartitionSize(Integer.getInteger("wmsa.page-size",1<<30) / WORD_SIZE);
+
+    int MAX_CONTINUOUS_SIZE = Integer.MAX_VALUE/WORD_SIZE - 16;
+
+    static IntArray allocate(long size) {
+        if (size < MAX_CONTINUOUS_SIZE) {
+            return IntArrayPage.onHeap((int) size);
+        }
+
+        return PagingIntArray.newOnHeap(DEFAULT_PARTITIONING_SCHEME, size);
+    }
+
+    static IntArray mmapRead(Path path) throws IOException {
+        long sizeBytes = Files.size(path);
+
+        if (sizeBytes < MAX_CONTINUOUS_SIZE) {
+            return IntArrayPage.fromMmapReadOnly(path, 0, (int) sizeBytes / 4);
+        }
+
+        return PagingIntArray.mapFileReadOnly(DEFAULT_PARTITIONING_SCHEME, path);
+    }
+
+    static IntArray mmapForWriting(Path path) throws IOException {
+        return PagingIntArray.mapFileReadWrite(DEFAULT_PARTITIONING_SCHEME, path);
+    }
+
+    static IntArray mmapForWriting(Path path, long size) throws IOException {
+        return PagingIntArray.mapFileReadWrite(DEFAULT_PARTITIONING_SCHEME, path, size);
+    }
+
+    default ShiftedIntArray shifted(long offset) {
+        return new ShiftedIntArray(offset, this);
+    }
+    default ShiftedIntArray range(long start, long end) {
+        return new ShiftedIntArray(start, end, this);
+    }
+
+    void force();
+
+
+    void advice(NativeIO.Advice advice) throws IOException;
+    void advice(NativeIO.Advice advice, long start, long end) throws IOException;
+
+}
diff --git a/marginalia_nu/src/main/java/nu/marginalia/util/array/LongArray.java b/marginalia_nu/src/main/java/nu/marginalia/util/array/LongArray.java
new file mode 100644
index 00000000..82543f4a
--- /dev/null
+++ b/marginalia_nu/src/main/java/nu/marginalia/util/array/LongArray.java
@@ -0,0 +1,63 @@
+package nu.marginalia.util.array;
+
+import com.upserve.uppend.blobs.NativeIO;
+import nu.marginalia.util.array.algo.LongArrayBase;
+import nu.marginalia.util.array.algo.LongArraySearch;
+import nu.marginalia.util.array.algo.LongArraySort;
+import nu.marginalia.util.array.algo.LongArrayTransformations;
+import nu.marginalia.util.array.delegate.ShiftedLongArray;
+import nu.marginalia.util.array.page.LongArrayPage;
+import nu.marginalia.util.array.page.PagingLongArray;
+import nu.marginalia.util.array.scheme.ArrayPartitioningScheme;
+
+import java.io.IOException;
+import java.nio.file.Files;
+import java.nio.file.Path;
+
+
+public interface LongArray extends LongArrayBase, LongArrayTransformations, LongArraySearch, LongArraySort {
+    int WORD_SIZE = 8;
+
+    ArrayPartitioningScheme DEFAULT_PARTITIONING_SCHEME
+            = ArrayPartitioningScheme.forPartitionSize(Integer.getInteger("wmsa.page-size",1<<30) / WORD_SIZE);
+
+    int MAX_CONTINUOUS_SIZE = Integer.MAX_VALUE/WORD_SIZE - 8;
+
+    static LongArray allocate(long size) {
+        if (size < MAX_CONTINUOUS_SIZE) {
+            return LongArrayPage.onHeap((int) size);
+        }
+
+        return PagingLongArray.newOnHeap(DEFAULT_PARTITIONING_SCHEME, size);
+    }
+
+    static LongArray mmapRead(Path path) throws IOException {
+        long sizeBytes = Files.size(path);
+
+        if (sizeBytes < MAX_CONTINUOUS_SIZE) {
+            return LongArrayPage.fromMmapReadOnly(path, 0, (int) sizeBytes / 8);
+        }
+
+        return PagingLongArray.mapFileReadOnly(DEFAULT_PARTITIONING_SCHEME, path);
+    }
+
+    static LongArray mmapForWriting(Path path) throws IOException {
+        return PagingLongArray.mapFileReadWrite(DEFAULT_PARTITIONING_SCHEME, path);
+    }
+
+    static LongArray mmapForWriting(Path path, long size) throws IOException {
+        return PagingLongArray.mapFileReadWrite(DEFAULT_PARTITIONING_SCHEME, path, size);
+    }
+
+    default ShiftedLongArray shifted(long offset) {
+        return new ShiftedLongArray(offset, this);
+    }
+    default ShiftedLongArray range(long start, long end) {
+        return new ShiftedLongArray(start, end, this);
+    }
+
+    void force();
+
+    void advice(NativeIO.Advice advice) throws IOException;
+    void advice(NativeIO.Advice advice, long start, long end) throws IOException;
+}
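IntArray and LongArray share the same facade: a single on-heap or memory-mapped page below MAX_CONTINUOUS_SIZE, a transparently partitioned paging implementation above it, chosen inside the static factories. Callers see long indexing either way; a usage sketch (sizes and path are illustrative):

    // Too large for one page, so allocate() silently returns a paged implementation
    LongArray huge = LongArray.allocate(1L << 30);
    huge.set(999_999_999L, 42L);
    long v = huge.get(999_999_999L);

    // File-backed variant; force() flushes the mapped pages to disk
    LongArray onDisk = LongArray.mmapForWriting(Path.of("array.dat"), 1L << 20);
    onDisk.fill(0, 1L << 20, 0L);
    onDisk.force();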
diff --git a/marginalia_nu/src/main/java/nu/marginalia/util/array/algo/BulkTransferArray.java b/marginalia_nu/src/main/java/nu/marginalia/util/array/algo/BulkTransferArray.java
new file mode 100644
index 00000000..d01d3716
--- /dev/null
+++ b/marginalia_nu/src/main/java/nu/marginalia/util/array/algo/BulkTransferArray.java
@@ -0,0 +1,6 @@
+package nu.marginalia.util.array.algo;
+
+public interface BulkTransferArray<BufferType> {
+
+    void set(long start, long end, BufferType buffer, int bufferStart);
+}
diff --git a/marginalia_nu/src/main/java/nu/marginalia/util/array/algo/IntArrayBase.java b/marginalia_nu/src/main/java/nu/marginalia/util/array/algo/IntArrayBase.java
new file mode 100644
index 00000000..bf5249a6
--- /dev/null
+++ b/marginalia_nu/src/main/java/nu/marginalia/util/array/algo/IntArrayBase.java
@@ -0,0 +1,69 @@
+package nu.marginalia.util.array.algo;
+
+import java.io.IOException;
+import java.nio.IntBuffer;
+import java.nio.channels.FileChannel;
+import java.nio.file.Path;
+
+public interface IntArrayBase extends BulkTransferArray<IntBuffer> {
+    int get(long pos);
+
+    void set(long pos, int value);
+
+    long size();
+
+    default void fill(long start, long end, int val) {
+        for (long v = start; v < end; v++) {
+            set(v, val);
+        }
+    }
+
+    default void increment(long pos) {
+        set(pos, get(pos) + 1);
+    }
+
+    default void swap(long pos1, long pos2) {
+        int tmp = get(pos1);
+        set(pos1, get(pos2));
+        set(pos2, tmp);
+    }
+
+    default void swapn(int n, long pos1, long pos2) {
+        for (int i = 0; i < n; i++) {
+            int tmp = get(pos1+i);
+            set(pos1+i, get(pos2+i));
+            set(pos2+i, tmp);
+        }
+    }
+
+    default int getAndIncrement(long pos) {
+        int val = get(pos);
+        set(pos, val + 1);
+        return val;
+    }
+
+    default void set(long start, long end, IntBuffer buffer, int bufferStart) {
+        for (int i = 0; i < (end-start); i++) {
+            set(start+i, buffer.get(i + bufferStart));
+        }
+    }
+    default void get(long start, long end, IntBuffer buffer, int bufferStart) {
+        for (int i = 0; i < (end-start); i++) {
+            buffer.put(i + bufferStart, get(start + i));
+        }
+    }
+
+    default void get(long start, IntBuffer buffer) {
+        get(start, start + buffer.remaining(), buffer, buffer.position());
+    }
+
+    default void get(long start, long end, int[] buffer) {
+        for (int i = 0; i < (end-start); i++) {
+            buffer[i] = get(start + i);
+        }
+    }
+
+    void write(Path file) throws IOException;
+
+    void transferFrom(FileChannel source, long sourceStart, long arrayStart, long arrayEnd) throws IOException;
+}
diff --git a/marginalia_nu/src/main/java/nu/marginalia/util/array/algo/IntArraySearch.java b/marginalia_nu/src/main/java/nu/marginalia/util/array/algo/IntArraySearch.java
new file mode 100644
index 00000000..104c5800
--- /dev/null
+++ b/marginalia_nu/src/main/java/nu/marginalia/util/array/algo/IntArraySearch.java
@@ -0,0 +1,126 @@
+package nu.marginalia.util.array.algo;
+
+import nu.marginalia.util.array.buffer.IntQueryBuffer;
+
+import static nu.marginalia.util.array.algo.LongArraySearch.encodeSearchMiss;
+
+public interface IntArraySearch extends IntArrayBase {
+
+    int LINEAR_SEARCH_CUTOFF = 64;
+
+    default long linearSearch(int key, long fromIndex, long toIndex) {
+        long pos;
+
+        for (pos = fromIndex; pos < toIndex; pos++) {
+            int val = get(pos);
+
+            if (val == key) return pos;
+            if (val > key) break;
+        }
+
+        return encodeSearchMiss(pos - 1);
+    }
+
+
+
+    default long binarySearch(int key, long fromIndex, long toIndex) {
+        long low = 0;
+        long high = (toIndex - fromIndex) - 1;
+
+        while (high - low >= LINEAR_SEARCH_CUTOFF) {
+            long mid = (low + high) >>> 1;
+            long midVal = get(fromIndex + mid);
+
+            if (midVal < key)
+                low = mid + 1;
+            else if (midVal > key)
+                high = mid - 1;
+            else
+                return fromIndex + mid;
+        }
+        return linearSearch(key, fromIndex + low, fromIndex + high + 1);
+    }
+
+    default long binarySearchUpperBound(int key, long fromIndex, long toIndex) {
+        long low = 0;
+        long high = (toIndex - fromIndex) - 1;
+
+        while (high - low >= LINEAR_SEARCH_CUTOFF) {
+            long mid = (low + high) >>> 1;
+            long midVal = get(fromIndex + mid);
+
+            if (midVal < key)
+                low = mid + 1;
+            else if (midVal > key)
+                high = mid - 1;
+            else
+                return fromIndex + mid;
+        }
+
+        for (fromIndex += low; fromIndex < toIndex; fromIndex++) {
+            if (get(fromIndex) >= key) return fromIndex;
+        }
+
+        return toIndex;
+    }
+
+
+    default void retain(IntQueryBuffer buffer, long boundary, long searchStart, long searchEnd) {
+
+        if (searchStart >= searchEnd) return;
+
+        int bv = buffer.currentValue();
+        int av = get(searchStart);
+        long pos = searchStart;
+
+        while (bv <= boundary && buffer.hasMore()) {
+            if (bv < av) {
+                if (!buffer.rejectAndAdvance()) break;
+                bv = buffer.currentValue();
+                continue;
+            }
+            else if (bv == av) {
+                if (!buffer.retainAndAdvance()) break;
+                bv = buffer.currentValue();
+                continue;
+            }
+
+            if (++pos < searchEnd) {
+                av = get(pos);
+            }
+            else {
+                break;
+            }
+        }
+    }
+
+    default void reject(IntQueryBuffer buffer, long boundary, long searchStart, long searchEnd) {
+
+        if (searchStart >= searchEnd) return;
+
+        int bv = buffer.currentValue();
+        int av = get(searchStart);
+        long pos = searchStart;
+
+        while (bv <= boundary && buffer.hasMore()) {
+            if (bv < av) {
+                if (!buffer.retainAndAdvance()) break;
+                bv = buffer.currentValue();
+                continue;
+            }
+            else if (bv == av) {
+                if (!buffer.rejectAndAdvance()) break;
+                bv = buffer.currentValue();
+                continue;
+            }
+
+            if (++pos < searchEnd) {
+                av = get(pos);
+            }
+            else {
+                break;
+            }
+        }
+
+    }
+}
diff --git a/marginalia_nu/src/main/java/nu/marginalia/util/array/algo/IntArraySort.java b/marginalia_nu/src/main/java/nu/marginalia/util/array/algo/IntArraySort.java
new file mode 100644
index 00000000..e6ba6c87
--- /dev/null
+++ b/marginalia_nu/src/main/java/nu/marginalia/util/array/algo/IntArraySort.java
@@ -0,0 +1,174 @@
+package nu.marginalia.util.array.algo;
+
+import java.io.IOException;
+import java.nio.IntBuffer;
+import java.nio.channels.FileChannel;
+import java.nio.file.Files;
+import java.nio.file.Path;
+import java.nio.file.StandardOpenOption;
+
+public interface IntArraySort extends IntArrayBase {
+
+    default boolean isSorted(long start, long end) {
+        if (start == end) return true;
+
+        int val = get(start);
+        for (long i = start + 1; i < end; i++) {
+            int next = get(i);
+            if (next < val)
+                return false;
+            val = next;
+        }
+
+        return true;
+    }
+
+    default void sortLargeSpan(SortingContext ctx, long start, long end) throws IOException {
+        long size = end - start;
+
+        if (size < ctx.memorySortLimit()) {
+            quickSort(start, end);
+        }
+        else {
+            mergeSort(start, end, ctx.tempDir());
+        }
+    }
+
+    default boolean isSortedN(int wordSize, long start, long end) {
+        if (start == end) return true;
+
+        int val = get(start);
+        for (long i = start + wordSize; i < end; i+=wordSize) {
+            int next = get(i);
+            if (next < val)
+                return false;
+            val = next;
+        }
+
+        return true;
+    }
+
+
+
+    default void insertionSort(long start, long end) {
+        assert end - start < Integer.MAX_VALUE;
+
+        int n = (int) (end - start);
+
+        if (n <= 1) {
+            return;
+        }
+
+        for (int i = 1; i < n; i++) {
+            int key = get(start + i);
+
+            int j = i - 1;
+            while (j >= 0 && get(start + j) > key) {
+                swap( start + j, start + (long)(j+1));
+                j--;
+            }
+            set(start + j+1, key);
+        }
+    }
+
+    default void quickSort(long start, long end) {
+        if (end - start < 64) {
+            insertionSort(start, end);
+        }
+        else {
+            _quickSortLH(start, end - 1);
+        }
+    }
+
+    default void _quickSortLH(long low, long highInclusive) {
+
+        if (low < 0 || highInclusive < 0 || low >= highInclusive)
+            return;
+
+        if (highInclusive - low < 32) {
+            insertionSort(low, highInclusive + 1);
+            return;
+        }
+
+        long p = _quickSortPartition(low, highInclusive);
+
+        _quickSortLH(low, p);
+        _quickSortLH(p + 1, highInclusive);
+    }
+
+
+    default long _quickSortPartition(long low, long high) {
+
+        long pivotPoint = ((low + high) / (2L));
+        int pivot = get(pivotPoint);
+
+        long i = low - 1;
+        long j = high + 1;
+
+        for (;;) {
+            do {
+                i+=1;
+            } while (get(i) < pivot);
+
+            do {
+                j-=1;
+            }
+            while (get(j) > pivot);
+
+            if (i >= j) return j;
+            else swap(i, j);
+        }
+    }
+
+    default void mergeSort(long start, long end, Path tmpDir) throws IOException {
+        int length = (int) (end - start);
+
+        Path tmpFile = Files.createTempFile(tmpDir,"sort-"+start+"-"+(start+length), ".dat");
+        try (var channel = (FileChannel) Files.newByteChannel(tmpFile, StandardOpenOption.WRITE, StandardOpenOption.READ)) {
+            var workBuffer = channel.map(FileChannel.MapMode.READ_WRITE, 0, 4L * length).asIntBuffer();
+
+            _mergeSort(start, length, workBuffer);
+        }
+        finally {
+            Files.delete(tmpFile);
+        }
+    }
+
+    default void _mergeSort(long start, int length, IntBuffer workBuffer) {
+        int width = Math.min(Integer.highestOneBit(length), 1 << 16);
+
+        // Do in-memory sorting up until internalSortLimit first
+        for (int i = 0; i < length; i += width) {
+            quickSort(start + i, start + i + Math.min(width, length-i));
+        }
+
+        // Then finish with merge sort
+        for (width = 1; width < length; width*=2) {
+
+            for (int i = 0; i < length; i += 2*width) {
+                _merge(start, i, Math.min(i+width, length), Math.min(i+2*width, length), workBuffer);
+            }
+
+            workBuffer.clear();
+            set(start, start + length, workBuffer, 0);
+        }
+
+    }
+
+
+    default void _merge(long offset, int left, int right, int end, IntBuffer workBuffer) {
+        long idxL = left;
+        long idxR = right;
+
+        for (int putPos = left; putPos < end; putPos++) {
+            if (idxL < right && (idxR >= end || get(offset+idxL) < get(offset+idxR))) {
+                workBuffer.put(putPos, get(offset+idxL));
+                idxL++;
+            }
+            else {
+                workBuffer.put(putPos, get(offset+idxR));
+                idxR++;
+            }
+        }
+    }
+}
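sortLargeSpan is the entry point that chooses between the in-memory quicksort and the temp-file merge sort, driven by the SortingContext record defined further down. A caller-side sketch (threshold and sizes are invented):

    SortingContext ctx = new SortingContext(Path.of("/tmp"), 1 << 26);

    IntArray array = IntArray.allocate(1L << 27);
    // ... populate array ...

    // 2^27 elements exceed the 2^26 in-memory limit, so this takes the
    // mergeSort path and spills its work buffer into a temp file under /tmp
    array.sortLargeSpan(ctx, 0, array.size());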
diff --git a/marginalia_nu/src/main/java/nu/marginalia/util/array/algo/IntArrayTransformations.java b/marginalia_nu/src/main/java/nu/marginalia/util/array/algo/IntArrayTransformations.java
new file mode 100644
index 00000000..c087f60e
--- /dev/null
+++ b/marginalia_nu/src/main/java/nu/marginalia/util/array/algo/IntArrayTransformations.java
@@ -0,0 +1,40 @@
+package nu.marginalia.util.array.algo;
+
+import nu.marginalia.util.array.functional.IntBinaryIOOperation;
+import nu.marginalia.util.array.functional.IntIOTransformer;
+import nu.marginalia.util.array.functional.IntTransformer;
+import nu.marginalia.util.array.functional.LongIntConsumer;
+
+import java.io.IOException;
+
+public interface IntArrayTransformations extends IntArrayBase {
+
+    default void forEach(long start, long end, LongIntConsumer consumer) {
+        for (long i = start; i < end; i++) {
+            consumer.accept(i, get(i));
+        }
+    }
+
+    default void transformEach(long start, long end, IntTransformer transformer) {
+        for (long i = start; i < end; i++) {
+            set(i, transformer.transform(i, get(i)));
+        }
+    }
+
+    default void transformEachIO(long start, long end, IntIOTransformer transformer) throws IOException {
+        for (long i = start; i < end; i++) {
+            set(i, transformer.transform(i, get(i)));
+        }
+    }
+
+    default int foldIO(int zero, long start, long end, IntBinaryIOOperation operator) throws IOException {
+        int accumulator = zero;
+
+        for (long i = start; i < end; i++) {
+            accumulator = operator.apply(accumulator, get(i));
+        }
+
+        return accumulator;
+    }
+
+}
diff --git a/marginalia_nu/src/main/java/nu/marginalia/util/array/algo/LongArrayBase.java b/marginalia_nu/src/main/java/nu/marginalia/util/array/algo/LongArrayBase.java
new file mode 100644
index 00000000..03f18bcc
--- /dev/null
+++ b/marginalia_nu/src/main/java/nu/marginalia/util/array/algo/LongArrayBase.java
@@ -0,0 +1,69 @@
+package nu.marginalia.util.array.algo;
+
+import java.io.IOException;
+import java.nio.LongBuffer;
+import java.nio.channels.FileChannel;
+import java.nio.file.Path;
+
+public interface LongArrayBase extends BulkTransferArray<LongBuffer> {
+    long get(long pos);
+
+    void set(long pos, long value);
+
+    long size();
+
+    default void fill(long start, long end, long val) {
+        for (long v = start; v < end; v++) {
+            set(v, val);
+        }
+    }
+
+    default void increment(long pos) {
+        set(pos, get(pos) + 1);
+    }
+
+    default void swap(long pos1, long pos2) {
+        long tmp = get(pos1);
+        set(pos1, get(pos2));
+        set(pos2, tmp);
+    }
+
+    default void swapn(int n, long pos1, long pos2) {
+        for (int i = 0; i < n; i++) {
+            long tmp = get(pos1+i);
+            set(pos1+i, get(pos2+i));
+            set(pos2+i, tmp);
+        }
+    }
+
+    default long getAndIncrement(long pos) {
+        long val = get(pos);
+        set(pos, val + 1);
+        return val;
+    }
+
+    default void set(long start, long end, LongBuffer buffer, int bufferStart) {
+        for (int i = 0; i < (end-start); i++) {
+            set(start+i, buffer.get(i + bufferStart));
+        }
+    }
+    default void get(long start, long end, LongBuffer buffer, int bufferStart) {
+        for (int i = 0; i < (end-start); i++) {
+            buffer.put(i + bufferStart, get(start + i));
+        }
+    }
+
+    default void get(long start, LongBuffer buffer) {
+        get(start, start + buffer.remaining(), buffer, buffer.position());
+    }
+
+    default void get(long start, long end, long[] buffer) {
+        for (long i = 0; i < (end-start); i++) {
+            buffer[(int) i] = get(start + i);
+        }
+    }
+
+    void write(Path file) throws IOException;
+
+    void transferFrom(FileChannel source, long sourceStart, long arrayStart, long arrayEnd) throws IOException;
+}
diff --git a/marginalia_nu/src/main/java/nu/marginalia/util/array/algo/LongArraySearch.java b/marginalia_nu/src/main/java/nu/marginalia/util/array/algo/LongArraySearch.java
new file mode 100644
index 00000000..2f2579b2
--- /dev/null
+++ b/marginalia_nu/src/main/java/nu/marginalia/util/array/algo/LongArraySearch.java
@@ -0,0 +1,263 @@
+package nu.marginalia.util.array.algo;
+
+import nu.marginalia.util.array.buffer.LongQueryBuffer;
+
+public interface LongArraySearch extends LongArrayBase {
+
+    int LINEAR_SEARCH_CUTOFF = 32;
+
+    default long linearSearch(long key, long fromIndex, long toIndex) {
+        long pos;
+
+        for (pos = fromIndex; pos < toIndex; pos++) {
+            long val = get(pos);
+
+            if (val == key) return pos;
+            if (val > key) break;
+        }
+
+        return encodeSearchMiss(pos - 1);
+    }
+
+    default long linearSearchUpperBound(long key, long fromIndex, long toIndex) {
+
+        for (long pos = fromIndex; pos < toIndex; pos++) {
+            if (get(pos) >= key) return pos;
+        }
+
+        return toIndex;
+    }
+
+    default long linearSearchN(int sz, long key, long fromIndex, long toIndex) {
+        long pos;
+
+        for (pos = fromIndex; pos < toIndex; pos+=sz) {
+            long val = get(pos);
+
+            if (val == key) return pos;
+            if (val > key) return encodeSearchMiss(pos);
+        }
+
+        return encodeSearchMiss(toIndex - sz);
+    }
+
+    default long binarySearch(long key, long fromIndex, long toIndex) {
+        long low = 0;
+        long high = (toIndex - fromIndex) - 1;
+
+        while (high - low >= LINEAR_SEARCH_CUTOFF) {
+            long mid = (low + high) >>> 1;
+            long midVal = get(fromIndex + mid);
+
+            if (midVal < key)
+                low = mid + 1;
+            else if (midVal > key)
+                high = mid - 1;
+            else
+                return fromIndex + mid;
+        }
+
+        return linearSearch(key, fromIndex + low, fromIndex + high + 1);
+    }
+
+    default long binarySearchN(int sz, long key, long fromIndex, long toIndex) {
+        long low = 0;
+        long high = (toIndex - fromIndex)/sz - 1;
+
+        while (high - low >= LINEAR_SEARCH_CUTOFF) {
+            long mid = (low + high) >>> 1;
+            long midVal = get(fromIndex + sz*mid);
+
+            if (midVal < key)
+                low = mid + 1;
+            else if (midVal > key)
+                high = mid - 1;
+            else
+                return fromIndex + sz*mid;
+        }
+
+        for (fromIndex += low*sz; fromIndex < toIndex; fromIndex+=sz) {
+            long val = get(fromIndex);
+
+            if (val == key) return fromIndex;
+            if (val > key) return encodeSearchMiss(fromIndex);
+        }
+
+        return encodeSearchMiss(toIndex - sz);
+    }
+
+
+    default long binarySearchUpperBound(long key, long fromIndex, long toIndex) {
+        long low = 0;
+        long high = (toIndex - fromIndex) - 1;
+
+        while (high - low >= LINEAR_SEARCH_CUTOFF) {
+            long mid = (low + high) >>> 1;
+            long midVal = get(fromIndex + mid);
+
+            if (midVal < key)
+                low = mid + 1;
+            else if (midVal > key)
+                high = mid - 1;
+            else
+                return fromIndex + mid;
+        }
+
+        for (fromIndex += low; fromIndex < toIndex; fromIndex++) {
+            if (get(fromIndex) >= key) return fromIndex;
+        }
+
+        return toIndex;
+    }
+
+    default long binarySearchUpperBoundN(int sz, long key, long fromIndex, long toIndex) {
+        long low = 0;
+        long high = (toIndex - fromIndex)/sz - 1;
+
+        while (high - low >= LINEAR_SEARCH_CUTOFF) {
+            long mid = (low + high) >>> 1;
+            long midVal = get(fromIndex + sz*mid);
+
+            if (midVal < key)
+                low = mid + 1;
+            else if (midVal > key)
+                high = mid - 1;
+            else
+                return fromIndex + sz*mid;
+        }
+
+        for (fromIndex += low; fromIndex < toIndex; fromIndex+=sz) {
+            if (get(fromIndex) >= key) return fromIndex;
+        }
+
+        return toIndex;
+    }
+
+    default void retain(LongQueryBuffer buffer, long boundary, long searchStart, long searchEnd) {
+
+        if (searchStart >= searchEnd) return;
+
+        long bv = buffer.currentValue();
+        long av = get(searchStart);
+        long pos = searchStart;
+
+        while (bv <= boundary && buffer.hasMore()) {
+            if (bv < av) {
+                if (!buffer.rejectAndAdvance()) break;
+                bv = buffer.currentValue();
+                continue;
+            }
+            else if (bv == av) {
+                if (!buffer.retainAndAdvance()) break;
+                bv = buffer.currentValue();
+                continue;
+            }
+
+            if (++pos < searchEnd) {
+                av = get(pos);
+            }
+            else {
+                break;
+            }
+        }
+    }
+
+    default void retainN(LongQueryBuffer buffer, int sz, long boundary, long searchStart, long searchEnd) {
+
+        if (searchStart >= searchEnd) return;
+
+        long bv = buffer.currentValue();
+        long av = get(searchStart);
+        long pos = searchStart;
+
+        while (bv <= boundary && buffer.hasMore()) {
+            if (bv < av) {
+                if (!buffer.rejectAndAdvance()) break;
+                bv = buffer.currentValue();
+                continue;
+            }
+            else if (bv == av) {
+                if (!buffer.retainAndAdvance()) break;
+                bv = buffer.currentValue();
+                continue;
+            }
+
+            pos += sz;
+
+            if (pos < searchEnd) {
+                av = get(pos);
+            }
+            else {
+                break;
+            }
+        }
+    }
+    default void reject(LongQueryBuffer buffer, long boundary, long searchStart, long searchEnd) {
+
+        if (searchStart >= searchEnd) return;
+
+        long bv = buffer.currentValue();
+        long av = get(searchStart);
+        long pos = searchStart;
+
+        while (bv <= boundary && buffer.hasMore()) {
+            if (bv < av) {
+                if (!buffer.retainAndAdvance()) break;
+                bv = buffer.currentValue();
+                continue;
+            }
+            else if (bv == av) {
+                if (!buffer.rejectAndAdvance()) break;
+                bv = buffer.currentValue();
+                continue;
+            }
+
+            if (++pos < searchEnd) {
+                av = get(pos);
+            }
+            else {
+                break;
+            }
+        }
+
+    }
+
+    default void rejectN(LongQueryBuffer buffer, int sz, long boundary, long searchStart, long searchEnd) {
+
+        if (searchStart >= searchEnd) return;
+
+        long bv = buffer.currentValue();
+        long av = get(searchStart);
+        long pos = searchStart;
+
+        while (bv <= boundary && buffer.hasMore()) {
+            if (bv < av) {
+                if (!buffer.retainAndAdvance()) break;
+                bv = buffer.currentValue();
+                continue;
+            }
+            else if (bv == av) {
+                if (!buffer.rejectAndAdvance()) break;
+                bv = buffer.currentValue();
+                continue;
+            }
+
+            pos += sz;
+            if (pos < searchEnd) {
+                av = get(pos);
+            }
+            else {
+                break;
+            }
+        }
+
+    }
+
+    static long encodeSearchMiss(long value) {
+        return -1 - value;
+    }
+
+    static long decodeSearchMiss(long value) {
+        return -value - 1;
+    }
+}
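All the search methods share one return convention, in the spirit of Arrays.binarySearch: a hit returns the position, a miss returns encodeSearchMiss applied to the closest preceding slot. A caller-side sketch (array stands for any LongArray):

    long pos = array.binarySearch(key, 0, array.size());

    if (pos >= 0) {
        // key found at pos
    }
    else {
        // key absent; recover the position of the nearest entry before it
        long before = LongArraySearch.decodeSearchMiss(pos);
        long insertionPoint = before + 1;
    }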
int length, LongBuffer workBuffer) { + int width = Math.min(Integer.highestOneBit(length), 1 << 16); + + // Do in-memory sorting up until internalSortLimit first + for (int i = 0; i < length; i += width) { + quickSort(start + i, start + i + Math.min(width, length-i)); + } + + // Then finish with merge sort + for (width = 1; width < length; width*=2) { + + for (int i = 0; i < length; i += 2*width) { + _merge(start, i, Math.min(i+width, length), Math.min(i+2*width, length), workBuffer); + } + + workBuffer.clear(); + set(start, start + length, workBuffer, 0); + } + + } + + + default void _mergeN(int wordSize, long offset, int left, int right, int end, LongBuffer workBuffer) { + long idxL = left; + long idxR = right; + + for (int putPos = left; putPos < end; putPos+= wordSize) { + + if (idxL < right && (idxR >= end || get(offset+idxL) < get(offset+idxR))) { + workBuffer.put(putPos, get(offset+idxL)); + for (int s = 1; s < wordSize; s++) { + workBuffer.put(putPos + s, get(offset + idxL + s)); + } + idxL+= wordSize; + } + else { + workBuffer.put(putPos, get(offset+idxR)); + for (int s = 1; s < wordSize; s++) { + workBuffer.put(putPos + s, get(offset + idxR + s)); + } + idxR+= wordSize; + } + } + } + + + default void _merge(long offset, int left, int right, int end, LongBuffer workBuffer) { + long idxL = left; + long idxR = right; + + for (int putPos = left; putPos < end; putPos++) { + if (idxL < right && (idxR >= end || get(offset+idxL) < get(offset+idxR))) { + workBuffer.put(putPos, get(offset+idxL)); + idxL++; + } + else { + workBuffer.put(putPos, get(offset+idxR)); + idxR++; + } + } + } +} diff --git a/marginalia_nu/src/main/java/nu/marginalia/util/array/algo/LongArrayTransformations.java b/marginalia_nu/src/main/java/nu/marginalia/util/array/algo/LongArrayTransformations.java new file mode 100644 index 00000000..3ff4b82f --- /dev/null +++ b/marginalia_nu/src/main/java/nu/marginalia/util/array/algo/LongArrayTransformations.java @@ -0,0 +1,40 @@ +package nu.marginalia.util.array.algo; + +import nu.marginalia.util.array.functional.LongBinaryIOOperation; +import nu.marginalia.util.array.functional.LongIOTransformer; +import nu.marginalia.util.array.functional.LongLongConsumer; +import nu.marginalia.util.array.functional.LongTransformer; + +import java.io.IOException; + +public interface LongArrayTransformations extends LongArrayBase { + + default void forEach(long start, long end, LongLongConsumer consumer) { + for (long i = start; i < end; i++) { + consumer.accept(i, get(i)); + } + } + + default void transformEach(long start, long end, LongTransformer transformer) { + for (long i = start; i < end; i++) { + set(i, transformer.transform(i, get(i))); + } + } + + default void transformEachIO(long start, long end, LongIOTransformer transformer) throws IOException { + for (long i = start; i < end; i++) { + set(i, transformer.transform(i, get(i))); + } + } + + default long foldIO(long zero, long start, long end, LongBinaryIOOperation operator) throws IOException { + long accumulator = zero; + + for (long i = start; i < end; i++) { + accumulator = operator.apply(accumulator, get(i)); + } + + return accumulator; + } + +} diff --git a/marginalia_nu/src/main/java/nu/marginalia/util/array/algo/SortingContext.java b/marginalia_nu/src/main/java/nu/marginalia/util/array/algo/SortingContext.java new file mode 100644 index 00000000..0bd436fb --- /dev/null +++ b/marginalia_nu/src/main/java/nu/marginalia/util/array/algo/SortingContext.java @@ -0,0 +1,6 @@ +package nu.marginalia.util.array.algo; + +import 
java.nio.file.Path; + +public record SortingContext(Path tempDir, int memorySortLimit) { +} diff --git a/marginalia_nu/src/main/java/nu/marginalia/util/array/buffer/IntQueryBuffer.java b/marginalia_nu/src/main/java/nu/marginalia/util/array/buffer/IntQueryBuffer.java new file mode 100644 index 00000000..75d829cd --- /dev/null +++ b/marginalia_nu/src/main/java/nu/marginalia/util/array/buffer/IntQueryBuffer.java @@ -0,0 +1,112 @@ +package nu.marginalia.util.array.buffer; + +import java.util.Arrays; + +public class IntQueryBuffer { + public final int[] data; + public int end; + + private int read = 0; + private int write = 0; + + public IntQueryBuffer(int size) { + this.data = new int[size]; + this.end = size; + } + + public IntQueryBuffer(int [] data, int size) { + this.data = data; + this.end = size; + } + + public int[] copyData() { + return Arrays.copyOf(data, end); + } + + public boolean isEmpty() { + return end == 0; + } + + public int size() { + return end; + } + + public int currentValue() { + return data[read]; + } + + public boolean rejectAndAdvance() { + return ++read < end; + } + + public boolean retainAndAdvance() { + if (read != write) { + int tmp = data[write]; + data[write] = data[read]; + data[read] = tmp; + } + + write++; + + return ++read < end; + } + + public boolean hasMore() { + return read < end; + } + + public void finalizeFiltering() { + end = write; + read = 0; + write = 0; + } + + public void startFilterForRange(int pos, int end) { + read = write = pos; + this.end = end; + } + + public void reset() { + end = data.length; + read = 0; + write = 0; + } + + public void zero() { + end = 0; + read = 0; + write = 0; + Arrays.fill(data, 0); + } + + public void uniq() { + if (end <= 1) return; + + int prev = currentValue(); + retainAndAdvance(); + + while (hasMore()) { + + int val = currentValue(); + + if (prev == val) { + rejectAndAdvance(); + } else { + retainAndAdvance(); + prev = val; + } + + } + + finalizeFiltering(); + } + + public String toString() { + return getClass().getSimpleName() + "[" + + "read = " + read + + ",write = " + write + + ",end = " + end + + ",data = [" + Arrays.toString(Arrays.copyOf(data, end)) + "]]"; + } + +} diff --git a/marginalia_nu/src/main/java/nu/marginalia/util/btree/BTreeQueryBuffer.java b/marginalia_nu/src/main/java/nu/marginalia/util/array/buffer/LongQueryBuffer.java similarity index 63% rename from marginalia_nu/src/main/java/nu/marginalia/util/btree/BTreeQueryBuffer.java rename to marginalia_nu/src/main/java/nu/marginalia/util/array/buffer/LongQueryBuffer.java index 3553a97b..6bea19ff 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/util/btree/BTreeQueryBuffer.java +++ b/marginalia_nu/src/main/java/nu/marginalia/util/array/buffer/LongQueryBuffer.java @@ -1,62 +1,32 @@ -package nu.marginalia.util.btree; +package nu.marginalia.util.array.buffer; import java.util.Arrays; -public class BTreeQueryBuffer { +public class LongQueryBuffer { public final long[] data; public int end; private int read = 0; private int write = 0; - public BTreeQueryBuffer(int size) { + public LongQueryBuffer(int size) { this.data = new long[size]; this.end = size; } - public BTreeQueryBuffer(long [] data, int size) { + public LongQueryBuffer(long [] data, int size) { this.data = data; this.end = size; } - private BTreeQueryBuffer(long [] data) { - this.data = data; - this.end = data.length; - } - - public BTreeQueryBuffer[] split(int... 
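
IntQueryBuffer above implements a two-cursor filtering protocol: a read cursor scans the data, retained values are swapped down to a write cursor, and finalizeFiltering() shrinks the buffer to the retained prefix. A worked example, grounded in the methods shown above:

    import java.util.Arrays;
    import nu.marginalia.util.array.buffer.IntQueryBuffer;

    class FilterSketch {
        public static void main(String[] args) {
            var buffer = new IntQueryBuffer(new int[] { 2, 2, 5, 7, 7, 8 }, 6);

            // keep even values, drop odd ones
            while (buffer.hasMore()) {
                if (buffer.currentValue() % 2 == 0)
                    buffer.retainAndAdvance();
                else
                    buffer.rejectAndAdvance();
            }
            buffer.finalizeFiltering();  // buffer now holds 2, 2, 8

            buffer.uniq();               // deduplicates the sorted data: 2, 8

            System.out.println(Arrays.toString(buffer.copyData()));
        }
    }
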
splitPoints) { - BTreeQueryBuffer[] ret = new BTreeQueryBuffer[splitPoints.length+1]; - - ret[0] = new BTreeQueryBuffer(Arrays.copyOfRange(data, 0, splitPoints[0])); - for (int i = 1; i < splitPoints.length; i++) { - ret[i] = new BTreeQueryBuffer(Arrays.copyOfRange(data, splitPoints[i-1], splitPoints[i])); - } - ret[ret.length-1] = new BTreeQueryBuffer(Arrays.copyOfRange(data, splitPoints[splitPoints.length-1], end)); - - return ret; - } - - public void gather(BTreeQueryBuffer... buffers) { - int start = 0; - - for (var buffer : buffers) { - System.arraycopy(buffer.data, 0, data, start, buffer.end); - start += buffer.end; - } - - this.read = 0; - this.write = 0; - this.end = start; + public boolean hasRetainedData() { + return write > 0; } public long[] copyData() { return Arrays.copyOf(data, end); } - public void retainAll() { - read = write = end; - } - public boolean isEmpty() { return end == 0; } diff --git a/marginalia_nu/src/main/java/nu/marginalia/util/array/delegate/ReferenceImplIntArrayDelegate.java b/marginalia_nu/src/main/java/nu/marginalia/util/array/delegate/ReferenceImplIntArrayDelegate.java new file mode 100644 index 00000000..bdb4eccb --- /dev/null +++ b/marginalia_nu/src/main/java/nu/marginalia/util/array/delegate/ReferenceImplIntArrayDelegate.java @@ -0,0 +1,58 @@ +package nu.marginalia.util.array.delegate; + +import com.upserve.uppend.blobs.NativeIO; +import nu.marginalia.util.array.IntArray; + +import java.io.IOException; +import java.nio.channels.FileChannel; +import java.nio.file.Path; + +public class ReferenceImplIntArrayDelegate implements IntArray { + + private final IntArray delegate; + + public ReferenceImplIntArrayDelegate(IntArray delegate) { + this.delegate = delegate; + } + + @Override + public int get(long pos) { + return delegate.get(pos); + } + + @Override + public void set(long pos, int value) { + delegate.set(pos, value); + } + + @Override + public long size() { + return delegate.size(); + } + + @Override + public void write(Path file) throws IOException { + delegate.write(file); + } + + @Override + public void transferFrom(FileChannel source, long sourceStart, long arrayStart, long arrayEnd) throws IOException { + delegate.transferFrom(source, sourceStart, arrayStart, arrayEnd); + } + + @Override + public void force() { + delegate.force(); + } + + + @Override + public void advice(NativeIO.Advice advice) throws IOException { + delegate.advice(advice); + } + + @Override + public void advice(NativeIO.Advice advice, long start, long end) throws IOException { + delegate.advice(advice, start, end); + } +} diff --git a/marginalia_nu/src/main/java/nu/marginalia/util/array/delegate/ReferenceImplLongArrayDelegate.java b/marginalia_nu/src/main/java/nu/marginalia/util/array/delegate/ReferenceImplLongArrayDelegate.java new file mode 100644 index 00000000..f33f565c --- /dev/null +++ b/marginalia_nu/src/main/java/nu/marginalia/util/array/delegate/ReferenceImplLongArrayDelegate.java @@ -0,0 +1,58 @@ +package nu.marginalia.util.array.delegate; + +import com.upserve.uppend.blobs.NativeIO; +import nu.marginalia.util.array.LongArray; + +import java.io.IOException; +import java.nio.channels.FileChannel; +import java.nio.file.Path; + +public class ReferenceImplLongArrayDelegate implements LongArray { + + private final LongArray delegate; + + public ReferenceImplLongArrayDelegate(LongArray delegate) { + this.delegate = delegate; + } + + @Override + public long get(long pos) { + return delegate.get(pos); + } + + @Override + public void set(long pos, long value) { + 
delegate.set(pos, value); + } + + @Override + public long size() { + return delegate.size(); + } + + @Override + public void write(Path file) throws IOException { + delegate.write(file); + } + + @Override + public void transferFrom(FileChannel source, long sourceStart, long arrayStart, long arrayEnd) throws IOException { + delegate.transferFrom(source, sourceStart, arrayStart, arrayEnd); + } + + @Override + public void force() { + delegate.force(); + } + + + @Override + public void advice(NativeIO.Advice advice) throws IOException { + delegate.advice(advice); + } + + @Override + public void advice(NativeIO.Advice advice, long start, long end) throws IOException { + delegate.advice(advice, start, end); + } +} diff --git a/marginalia_nu/src/main/java/nu/marginalia/util/array/delegate/ShiftedIntArray.java b/marginalia_nu/src/main/java/nu/marginalia/util/array/delegate/ShiftedIntArray.java new file mode 100644 index 00000000..b7dc343e --- /dev/null +++ b/marginalia_nu/src/main/java/nu/marginalia/util/array/delegate/ShiftedIntArray.java @@ -0,0 +1,199 @@ +package nu.marginalia.util.array.delegate; + +import com.upserve.uppend.blobs.NativeIO; +import nu.marginalia.util.array.IntArray; +import nu.marginalia.util.array.buffer.IntQueryBuffer; +import nu.marginalia.util.array.functional.IntBinaryIOOperation; +import nu.marginalia.util.array.functional.IntIOTransformer; +import nu.marginalia.util.array.functional.IntTransformer; +import nu.marginalia.util.array.functional.LongIntConsumer; + +import java.io.IOException; +import java.nio.IntBuffer; +import java.nio.channels.FileChannel; +import java.nio.file.Path; + +public class ShiftedIntArray implements IntArray { + public final long shift; + public final long size; + + private final IntArray delegate; + + public ShiftedIntArray(long shift, IntArray delegate) { + this.shift = shift; + this.size = delegate.size() - shift; + this.delegate = delegate; + } + + public ShiftedIntArray(long start, long end, IntArray delegate) { + this.shift = start; + this.size = end - start; + this.delegate = delegate; + } + + @Override + public int get(long pos) { + return delegate.get(pos+shift); + } + + @Override + public void set(long pos, int value) { + delegate.set(pos+shift, value); + } + + @Override + public void set(long start, long end, IntBuffer buffer, int bufferStart) { + delegate.set(shift + start, shift + end, buffer, bufferStart); + } + + @Override + public void get(long start, long end, IntBuffer buffer, int bufferStart) { + delegate.get(shift + start, shift + end, buffer, bufferStart); + } + + @Override + public void get(long start, IntBuffer buffer) { + delegate.get(shift + start, buffer); + } + + @Override + public void get(long start, long end, int[] buffer) { + delegate.get(shift+start, shift+end, buffer); + } + + @Override + public long size() { + return size; + } + + @Override + public void write(Path file) throws IOException { + throw new UnsupportedOperationException(); + } + + @Override + public ShiftedIntArray shifted(long offset) { + return new ShiftedIntArray(shift+offset, delegate); + } + + @Override + public ShiftedIntArray range(long start, long end) { + return new ShiftedIntArray(shift + start, shift+end, delegate); + } + + public int[] toArray() { + int[] ret = new int[(int) size]; + for (int i = 0; i < size; i++) { + ret[i] = delegate.get(shift + i); + } + return ret; + } + + public boolean isSorted() { + return isSorted(0, size); + } + + public boolean isSorted(long start, long end) { + return delegate.isSorted(shift + start, shift + 
end); + } + + public long search(int key) { + if (size < 128) { + return linearSearch(key); + } + else { + return binarySearch(key); + } + } + + public long linearSearch(int key) { + return linearSearch(key, 0, size); + } + + public long binarySearch(int key) { + return binarySearch(key, 0, size); + } + + public long binarySearchUpperbound(int key) { + return binarySearchUpperBound(key, 0, size); + } + + public void retain(IntQueryBuffer buffer, long boundary) { + retain(buffer, boundary, 0, size); + } + + public void reject(IntQueryBuffer buffer, long boundary) { + reject(buffer, boundary, 0, size); + } + + @Override + public long linearSearch(int key, long fromIndex, long toIndex) { + return translateSearchResult(delegate.linearSearch(key, fromIndex + shift, toIndex+shift)); + } + + @Override + public long binarySearch(int key, long fromIndex, long toIndex) { + return translateSearchResult(delegate.binarySearch(key, fromIndex + shift, toIndex+shift)); + } + + @Override + public long binarySearchUpperBound(int key, long fromIndex, long toIndex) { + return translateSearchResult(delegate.binarySearchUpperBound(key, fromIndex + shift, toIndex+shift)); + } + + private long translateSearchResult(long ret) { + if (ret > 0) return ret - shift; + return ret + shift; + } + + @Override + public void retain(IntQueryBuffer buffer, long boundary, long searchStart, long searchEnd) { + delegate.retain(buffer, boundary, searchStart + shift, searchEnd + shift); + } + + @Override + public void reject(IntQueryBuffer buffer, long boundary, long searchStart, long searchEnd) { + delegate.reject(buffer, boundary, searchStart + shift, searchEnd + shift); + } + + @Override + public void forEach(long start, long end, LongIntConsumer consumer) { + delegate.forEach(start + shift, end+shift, (pos, old) -> consumer.accept(pos-shift, old)); + } + + @Override + public void transformEach(long start, long end, IntTransformer transformer) { + delegate.transformEach(start + shift, end+shift, (pos, old) -> transformer.transform(pos-shift, old)); + } + + @Override + public void transformEachIO(long start, long end, IntIOTransformer transformer) throws IOException { + delegate.transformEachIO(start + shift, end+shift, (pos, old) -> transformer.transform(pos-shift, old)); + } + + @Override + public int foldIO(int zero, long start, long end, IntBinaryIOOperation operator) throws IOException { + return delegate.foldIO(zero, start + shift, end+shift, operator); + } + + @Override + public void transferFrom(FileChannel source, long sourceStart, long arrayStart, long arrayEnd) throws IOException { + delegate.transferFrom(source, sourceStart, shift + arrayStart, shift + arrayEnd); + } + + @Override + public void force() { + delegate.force(); + } + + @Override + public void advice(NativeIO.Advice advice) throws IOException { + delegate.advice(advice, shift, shift + size()); + } + + @Override + public void advice(NativeIO.Advice advice, long start, long end) throws IOException { + delegate.advice(advice, start + shift, end + shift); + } + +} diff --git a/marginalia_nu/src/main/java/nu/marginalia/util/array/delegate/ShiftedLongArray.java b/marginalia_nu/src/main/java/nu/marginalia/util/array/delegate/ShiftedLongArray.java new file mode 100644 index 00000000..cc6386d6 --- /dev/null +++ b/marginalia_nu/src/main/java/nu/marginalia/util/array/delegate/ShiftedLongArray.java @@ -0,0 +1,255 @@ +package nu.marginalia.util.array.delegate; + +import com.upserve.uppend.blobs.NativeIO; +import nu.marginalia.util.array.LongArray; +import 
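
ShiftedIntArray above (and its long twin below) rebases a window of a larger array so that algorithms can run on [0, size) coordinates, translating reads, writes, and search results back and forth. A sketch, assuming IntArray's default range() wraps the receiver in a ShiftedIntArray as the @Override annotations suggest:

    import nu.marginalia.util.array.IntArray;
    import nu.marginalia.util.array.page.IntArrayPage;

    class RangeViewSketch {
        public static void main(String[] args) {
            IntArray base = IntArrayPage.onHeap(8);
            for (int i = 0; i < 8; i++)
                base.set(i, 10 * i);

            IntArray window = base.range(3, 6);  // view over indices 3, 4, 5

            System.out.println(window.size());   // 3
            System.out.println(window.get(0));   // 30, i.e. base.get(3)

            window.set(1, -1);                   // writes go through to the backing array
            System.out.println(base.get(4));     // -1
        }
    }
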
nu.marginalia.util.array.algo.LongArraySearch; +import nu.marginalia.util.array.buffer.LongQueryBuffer; +import nu.marginalia.util.array.functional.LongBinaryIOOperation; +import nu.marginalia.util.array.functional.LongIOTransformer; +import nu.marginalia.util.array.functional.LongLongConsumer; +import nu.marginalia.util.array.functional.LongTransformer; + +import java.io.IOException; +import java.nio.LongBuffer; +import java.nio.channels.FileChannel; +import java.nio.file.Path; + +public class ShiftedLongArray implements LongArray { + public final long shift; + public final long size; + private final LongArray delegate; + + public ShiftedLongArray(long shift, LongArray delegate) { + this.shift = shift; + this.size = delegate.size() - shift; + this.delegate = delegate; + } + + public ShiftedLongArray(long start, long end, LongArray delegate) { + this.shift = start; + this.size = end - start; + this.delegate = delegate; + } + + + @Override + public long get(long pos) { + return delegate.get(pos+shift); + } + + @Override + public void set(long pos, long value) { + delegate.set(pos+shift, value); + } + + @Override + public void set(long start, long end, LongBuffer buffer, int bufferStart) { + delegate.set(shift + start, shift + end, buffer, bufferStart); + } + + @Override + public void get(long start, long end, LongBuffer buffer, int bufferStart) { + delegate.get(shift + start, shift + end, buffer, bufferStart); + } + + @Override + public void get(long start, LongBuffer buffer) { + delegate.get(shift + start, buffer); + } + + @Override + public void get(long start, long end, long[] buffer) { + delegate.get(shift+start, shift+end, buffer); + } + + @Override + public long size() { + return size; + } + + @Override + public void write(Path file) throws IOException { + throw new UnsupportedOperationException(); + } + + @Override + public ShiftedLongArray shifted(long offset) { + return new ShiftedLongArray(shift+offset, delegate); + } + + @Override + public ShiftedLongArray range(long start, long end) { + return new ShiftedLongArray(shift + start, shift+end, delegate); + } + + public long[] toArray() { + long[] ret = new long[(int) size]; + for (int i = 0; i < size; i++) { + ret[i] = delegate.get(shift + i); + } + return ret; + } + + public boolean isSorted() { + return isSorted(0, size); + } + + public boolean isSortedN(int sz) { + return isSortedN(sz, 0, size); + } + + public boolean isSorted(long start, long end) { + return delegate.isSorted(shift + start, shift + end); + } + + public boolean isSortedN(int sz, long start, long end) { + return delegate.isSortedN(sz, shift + start, shift + end); + } + + public long searchN(int sz, long key) { + if (size < 128) { + return linearSearchN(sz, key); + } + else { + return binarySearchN(sz, key); + } + } + + public long search(long key) { + if (size < 128) { + return linearSearch(key); + } + else { + return binarySearch(key); + } + } + + public long linearSearch(long key) { + return linearSearch(key, 0, size); + } + + public long binarySearch(long key) { + return binarySearch(key, 0, size); + } + + public long binarySearchN(int sz, long key) { + return binarySearchN(sz, key, 0, size); + } + + public long linearSearchN(int sz, long key) { + return linearSearchN(sz, key, 0, size); + } + + public void retain(LongQueryBuffer buffer, long boundary) { + retain(buffer, boundary, 0, size); + } + public void retainN(LongQueryBuffer buffer, int sz, long boundary) { + if (sz == 1) + retain(buffer, boundary, 0, size); + else + retainN(buffer, sz, boundary, 0, 
size);
+    }
+
+    public void reject(LongQueryBuffer buffer, long boundary) {
+        reject(buffer, boundary, 0, size);
+    }
+
+    public void rejectN(LongQueryBuffer buffer, int sz, long boundary) {
+        if (sz == 1)
+            reject(buffer, boundary, 0, size);
+        else
+            rejectN(buffer, sz, boundary, 0, size);
+
+    }
+
+    @Override
+    public long linearSearch(long key, long fromIndex, long toIndex) {
+        return translateSearchResult(delegate.linearSearch(key, fromIndex + shift, toIndex+shift));
+    }
+    @Override
+    public long linearSearchN(int sz, long key, long fromIndex, long toIndex) {
+        return translateSearchResult(delegate.linearSearchN(sz, key, fromIndex + shift, toIndex+shift));
+    }
+    @Override
+    public long binarySearch(long key, long fromIndex, long toIndex) {
+        return translateSearchResult(delegate.binarySearch(key, fromIndex + shift, toIndex+shift));
+    }
+    @Override
+    public long binarySearchN(int sz, long key, long fromIndex, long toIndex) {
+        return translateSearchResult(delegate.binarySearchN(sz, key, fromIndex + shift, toIndex+shift));
+    }
+    @Override
+    public long binarySearchUpperBound(long key, long fromIndex, long toIndex) {
+        return translateSearchResult(delegate.binarySearchUpperBound(key, fromIndex + shift, toIndex+shift));
+    }
+    @Override
+    public long linearSearchUpperBound(long key, long fromIndex, long toIndex) {
+        return translateSearchResult(delegate.linearSearchUpperBound(key, fromIndex + shift, toIndex+shift));
+    }
+    @Override
+    public long binarySearchUpperBoundN(int sz, long key, long fromIndex, long toIndex) {
+        return translateSearchResult(delegate.binarySearchUpperBoundN(sz, key, fromIndex + shift, toIndex+shift));
+    }
+    private long translateSearchResult(long delegatedIdx) {
+        long ret;
+
+        if (delegatedIdx >= 0) ret = delegatedIdx - shift;
+        else ret = LongArraySearch.encodeSearchMiss(Math.max(0, LongArraySearch.decodeSearchMiss(delegatedIdx) - shift));
+
+        return ret;
+    }
+
+    public void retain(LongQueryBuffer buffer, long boundary, long searchStart, long searchEnd) {
+        delegate.retain(buffer, boundary, searchStart + shift, searchEnd + shift);
+    }
+    public void retainN(LongQueryBuffer buffer, int sz, long boundary, long searchStart, long searchEnd) {
+        delegate.retainN(buffer, sz, boundary, searchStart + shift, searchEnd + shift);
+    }
+    public void reject(LongQueryBuffer buffer, long boundary, long searchStart, long searchEnd) {
+        delegate.reject(buffer, boundary, searchStart + shift, searchEnd + shift);
+    }
+    public void rejectN(LongQueryBuffer buffer, int sz, long boundary, long searchStart, long searchEnd) {
+        delegate.rejectN(buffer, sz, boundary, searchStart + shift, searchEnd + shift);
+    }
+
+    @Override
+    public void forEach(long start, long end, LongLongConsumer consumer) {
+        delegate.forEach(start + shift, end+shift, (pos, old) -> consumer.accept(pos-shift, old));
+    }
+
+    @Override
+    public void transformEach(long start, long end, LongTransformer transformer) {
+        delegate.transformEach(start + shift, end+shift, (pos, old) -> transformer.transform(pos-shift, old));
+    }
+
+    @Override
+    public void transformEachIO(long start, long end, LongIOTransformer transformer) throws IOException {
+        delegate.transformEachIO(start + shift, end+shift, (pos, old) -> transformer.transform(pos-shift, old));
+    }
+
+    @Override
+    public long foldIO(long zero, long start, long end, LongBinaryIOOperation operator) throws IOException {
+        return delegate.foldIO(zero, start + shift, end+shift, operator);
+    }
+
+    @Override
+    public void transferFrom(FileChannel source, long sourceStart, long arrayStart, 
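
translateSearchResult above relies on the search-miss convention from LongArraySearch: a hit comes back as a non-negative index, while a miss is encoded as a negative value that decodeSearchMiss maps back to a position near where the key would be inserted. That is what lets the shifted view rebase both cases. A sketch of the convention (the exact miss position is defined by LongArraySearch, so the decoded value here is indicative):

    import nu.marginalia.util.array.LongArray;
    import nu.marginalia.util.array.algo.LongArraySearch;
    import nu.marginalia.util.array.page.LongArrayPage;

    class SearchMissSketch {
        public static void main(String[] args) {
            LongArray sorted = LongArrayPage.onHeap(4);
            long[] values = { 10, 20, 30, 40 };
            for (int i = 0; i < values.length; i++)
                sorted.set(i, values[i]);

            System.out.println(sorted.binarySearch(30, 0, 4));  // hit: 2

            long miss = sorted.binarySearch(25, 0, 4);          // miss: encoded negative
            System.out.println(miss < 0);                       // true
            System.out.println(LongArraySearch.decodeSearchMiss(miss));
        }
    }
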
long arrayEnd) throws IOException {
+        delegate.transferFrom(source, sourceStart, shift + arrayStart, shift + arrayEnd);
+    }
+
+    @Override
+    public void force() {
+        delegate.force();
+    }
+
+    @Override
+    public void advice(NativeIO.Advice advice) throws IOException {
+        delegate.advice(advice, shift, shift + size());
+    }
+
+    @Override
+    public void advice(NativeIO.Advice advice, long start, long end) throws IOException {
+        delegate.advice(advice, start + shift, end + shift);
+    }
+
+}
diff --git a/marginalia_nu/src/main/java/nu/marginalia/util/array/functional/AddressRangeCall.java b/marginalia_nu/src/main/java/nu/marginalia/util/array/functional/AddressRangeCall.java
new file mode 100644
index 00000000..5f96462d
--- /dev/null
+++ b/marginalia_nu/src/main/java/nu/marginalia/util/array/functional/AddressRangeCall.java
@@ -0,0 +1,5 @@
+package nu.marginalia.util.array.functional;
+
+public interface AddressRangeCall<T> {
+    void apply(T array, int start, int end);
+}
diff --git a/marginalia_nu/src/main/java/nu/marginalia/util/array/functional/AddressRangeCallIO.java b/marginalia_nu/src/main/java/nu/marginalia/util/array/functional/AddressRangeCallIO.java
new file mode 100644
index 00000000..a7fa2867
--- /dev/null
+++ b/marginalia_nu/src/main/java/nu/marginalia/util/array/functional/AddressRangeCallIO.java
@@ -0,0 +1,7 @@
+package nu.marginalia.util.array.functional;
+
+import java.io.IOException;
+
+public interface AddressRangeCallIO<T> {
+    void apply(T array, int start, int end) throws IOException;
+}
diff --git a/marginalia_nu/src/main/java/nu/marginalia/util/array/functional/AddressRangeIntFunction.java b/marginalia_nu/src/main/java/nu/marginalia/util/array/functional/AddressRangeIntFunction.java
new file mode 100644
index 00000000..93b3b58f
--- /dev/null
+++ b/marginalia_nu/src/main/java/nu/marginalia/util/array/functional/AddressRangeIntFunction.java
@@ -0,0 +1,5 @@
+package nu.marginalia.util.array.functional;
+
+public interface AddressRangeIntFunction<T> {
+    int apply(T array, int start, int end);
+}
diff --git a/marginalia_nu/src/main/java/nu/marginalia/util/array/functional/AddressRangeLongFunction.java b/marginalia_nu/src/main/java/nu/marginalia/util/array/functional/AddressRangeLongFunction.java
new file mode 100644
index 00000000..ef214419
--- /dev/null
+++ b/marginalia_nu/src/main/java/nu/marginalia/util/array/functional/AddressRangeLongFunction.java
@@ -0,0 +1,5 @@
+package nu.marginalia.util.array.functional;
+
+public interface AddressRangeLongFunction<T> {
+    long apply(T array, int start, int end);
+}
diff --git a/marginalia_nu/src/main/java/nu/marginalia/util/array/functional/IntBinaryIOOperation.java b/marginalia_nu/src/main/java/nu/marginalia/util/array/functional/IntBinaryIOOperation.java
new file mode 100644
index 00000000..6761f633
--- /dev/null
+++ b/marginalia_nu/src/main/java/nu/marginalia/util/array/functional/IntBinaryIOOperation.java
@@ -0,0 +1,7 @@
+package nu.marginalia.util.array.functional;
+
+import java.io.IOException;
+
+public interface IntBinaryIOOperation {
+    int apply(int left, int right) throws IOException;
+}
diff --git a/marginalia_nu/src/main/java/nu/marginalia/util/array/functional/IntIOTransformer.java b/marginalia_nu/src/main/java/nu/marginalia/util/array/functional/IntIOTransformer.java
new file mode 100644
index 00000000..96f84477
--- /dev/null
+++ b/marginalia_nu/src/main/java/nu/marginalia/util/array/functional/IntIOTransformer.java
@@ -0,0 +1,7 @@
+package nu.marginalia.util.array.functional;
+
+import java.io.IOException;
+
+public interface 
IntIOTransformer { + int transform(long pos, int old) throws IOException; +} diff --git a/marginalia_nu/src/main/java/nu/marginalia/util/array/functional/IntTransformer.java b/marginalia_nu/src/main/java/nu/marginalia/util/array/functional/IntTransformer.java new file mode 100644 index 00000000..c1ba44e6 --- /dev/null +++ b/marginalia_nu/src/main/java/nu/marginalia/util/array/functional/IntTransformer.java @@ -0,0 +1,5 @@ +package nu.marginalia.util.array.functional; + +public interface IntTransformer { + int transform(long pos, int old); +} diff --git a/marginalia_nu/src/main/java/nu/marginalia/util/array/functional/LongBinaryIOOperation.java b/marginalia_nu/src/main/java/nu/marginalia/util/array/functional/LongBinaryIOOperation.java new file mode 100644 index 00000000..c097c016 --- /dev/null +++ b/marginalia_nu/src/main/java/nu/marginalia/util/array/functional/LongBinaryIOOperation.java @@ -0,0 +1,7 @@ +package nu.marginalia.util.array.functional; + +import java.io.IOException; + +public interface LongBinaryIOOperation { + long apply(long left, long right) throws IOException; +} diff --git a/marginalia_nu/src/main/java/nu/marginalia/util/array/functional/LongIOTransformer.java b/marginalia_nu/src/main/java/nu/marginalia/util/array/functional/LongIOTransformer.java new file mode 100644 index 00000000..997bcfd8 --- /dev/null +++ b/marginalia_nu/src/main/java/nu/marginalia/util/array/functional/LongIOTransformer.java @@ -0,0 +1,7 @@ +package nu.marginalia.util.array.functional; + +import java.io.IOException; + +public interface LongIOTransformer { + long transform(long pos, long old) throws IOException; +} diff --git a/marginalia_nu/src/main/java/nu/marginalia/util/array/functional/LongIntConsumer.java b/marginalia_nu/src/main/java/nu/marginalia/util/array/functional/LongIntConsumer.java new file mode 100644 index 00000000..781ebe9e --- /dev/null +++ b/marginalia_nu/src/main/java/nu/marginalia/util/array/functional/LongIntConsumer.java @@ -0,0 +1,5 @@ +package nu.marginalia.util.array.functional; + +public interface LongIntConsumer { + void accept(long pos, int val); +} diff --git a/marginalia_nu/src/main/java/nu/marginalia/util/array/functional/LongLongConsumer.java b/marginalia_nu/src/main/java/nu/marginalia/util/array/functional/LongLongConsumer.java new file mode 100644 index 00000000..6390d59e --- /dev/null +++ b/marginalia_nu/src/main/java/nu/marginalia/util/array/functional/LongLongConsumer.java @@ -0,0 +1,5 @@ +package nu.marginalia.util.array.functional; + +public interface LongLongConsumer { + void accept(long pos, long val); +} diff --git a/marginalia_nu/src/main/java/nu/marginalia/util/array/functional/LongTransformer.java b/marginalia_nu/src/main/java/nu/marginalia/util/array/functional/LongTransformer.java new file mode 100644 index 00000000..4f998646 --- /dev/null +++ b/marginalia_nu/src/main/java/nu/marginalia/util/array/functional/LongTransformer.java @@ -0,0 +1,5 @@ +package nu.marginalia.util.array.functional; + +public interface LongTransformer { + long transform(long pos, long old); +} diff --git a/marginalia_nu/src/main/java/nu/marginalia/util/array/functor/IntIOFolder.java b/marginalia_nu/src/main/java/nu/marginalia/util/array/functor/IntIOFolder.java new file mode 100644 index 00000000..b3ee83ce --- /dev/null +++ b/marginalia_nu/src/main/java/nu/marginalia/util/array/functor/IntIOFolder.java @@ -0,0 +1,21 @@ +package nu.marginalia.util.array.functor; + +import nu.marginalia.util.array.functional.AddressRangeCallIO; +import 
nu.marginalia.util.array.functional.IntBinaryIOOperation;
+import nu.marginalia.util.array.page.IntArrayPage;
+
+import java.io.IOException;
+
+public class IntIOFolder implements AddressRangeCallIO<IntArrayPage> {
+    public int acc;
+    private final IntBinaryIOOperation operator;
+
+    public IntIOFolder(int zero, IntBinaryIOOperation operator) {
+        this.acc = zero;
+        this.operator = operator;
+    }
+
+    public void apply(IntArrayPage array, int start, int end) throws IOException {
+        acc = array.foldIO(acc, start, end, operator);
+    }
+}
diff --git a/marginalia_nu/src/main/java/nu/marginalia/util/array/functor/LongIOFolder.java b/marginalia_nu/src/main/java/nu/marginalia/util/array/functor/LongIOFolder.java
new file mode 100644
index 00000000..ce9e796f
--- /dev/null
+++ b/marginalia_nu/src/main/java/nu/marginalia/util/array/functor/LongIOFolder.java
@@ -0,0 +1,21 @@
+package nu.marginalia.util.array.functor;
+
+import nu.marginalia.util.array.functional.AddressRangeCallIO;
+import nu.marginalia.util.array.functional.LongBinaryIOOperation;
+import nu.marginalia.util.array.page.LongArrayPage;
+
+import java.io.IOException;
+
+public class LongIOFolder implements AddressRangeCallIO<LongArrayPage> {
+    public long acc;
+    private final LongBinaryIOOperation operator;
+
+    public LongIOFolder(long zero, LongBinaryIOOperation operator) {
+        this.acc = zero;
+        this.operator = operator;
+    }
+
+    public void apply(LongArrayPage array, int start, int end) throws IOException {
+        acc = array.foldIO(acc, start, end, operator);
+    }
+}
diff --git a/marginalia_nu/src/main/java/nu/marginalia/util/array/page/AbstractPagingArray.java b/marginalia_nu/src/main/java/nu/marginalia/util/array/page/AbstractPagingArray.java
new file mode 100644
index 00000000..43a48c16
--- /dev/null
+++ b/marginalia_nu/src/main/java/nu/marginalia/util/array/page/AbstractPagingArray.java
@@ -0,0 +1,88 @@
+package nu.marginalia.util.array.page;
+
+import nu.marginalia.util.array.algo.BulkTransferArray;
+import nu.marginalia.util.array.functional.AddressRangeCall;
+import nu.marginalia.util.array.functional.AddressRangeCallIO;
+import nu.marginalia.util.array.scheme.ArrayPartitioningScheme;
+
+import java.io.IOException;
+
+import static nu.marginalia.util.array.algo.LongArraySearch.decodeSearchMiss;
+import static nu.marginalia.util.array.algo.LongArraySearch.encodeSearchMiss;
+
+public class AbstractPagingArray<T extends BulkTransferArray<B>, B> {
+    final T[] pages;
+    final long size;
+    final ArrayPartitioningScheme partitioningScheme;
+
+    public AbstractPagingArray(ArrayPartitioningScheme partitioningScheme, T[] pages, long size) {
+        this.partitioningScheme = partitioningScheme;
+        this.pages = pages;
+        this.size = size;
+    }
+
+    void delegateToEachPage(long start, long end, AddressRangeCall<T> fn) {
+        assert end >= start;
+
+        int page = partitioningScheme.getPage(start);
+
+        long endPos;
+
+        for (long pos = start; pos < end; pos = endPos) {
+            endPos = partitioningScheme.getPageEnd(pos, end);
+
+            int sOff = partitioningScheme.getOffset(pos);
+            int eOff = partitioningScheme.getEndOffset(start, endPos);
+
+            fn.apply(pages[page++], sOff, eOff);
+        }
+    }
+
+    void delegateToEachPageIO(long start, long end, AddressRangeCallIO<T> fn) throws IOException {
+        assert end >= start;
+
+        int page = partitioningScheme.getPage(start);
+
+        long endPos;
+
+        for (long pos = start; pos < end; pos = endPos) {
+            endPos = partitioningScheme.getPageEnd(pos, end);
+
+            int sOff = partitioningScheme.getOffset(pos);
+            int eOff = partitioningScheme.getEndOffset(start, endPos);
+
+            fn.apply(pages[page++], sOff, eOff);
+        }
+    }
+
+    long 
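
The IO folders above adapt a binary operation to the per-page delegation in AbstractPagingArray: delegateToEachPageIO calls the folder once per page, and the folder carries the accumulator across those calls. Standalone, on a single page:

    import java.io.IOException;
    import nu.marginalia.util.array.functor.IntIOFolder;
    import nu.marginalia.util.array.page.IntArrayPage;

    class FolderSketch {
        public static void main(String[] args) throws IOException {
            IntArrayPage page = IntArrayPage.onHeap(5);
            for (int i = 0; i < 5; i++)
                page.set(i, i + 1);

            IntIOFolder folder = new IntIOFolder(0, Integer::sum);
            folder.apply(page, 0, 5);        // folds 1 + 2 + 3 + 4 + 5
            System.out.println(folder.acc);  // 15
        }
    }
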
translateSearchResultsFromPage(long fromIndex, long ret) { + int page = partitioningScheme.getPage(fromIndex); + + if (ret >= 0) { + return partitioningScheme.toRealIndex(page, (int) ret); + } else { + ret = decodeSearchMiss(ret); + ret = partitioningScheme.toRealIndex(page, (int) ret); + return encodeSearchMiss(ret); + } + } + + public void set(long start, long end, B buffer, int bufferStart) { + assert end >= start; + + int page = partitioningScheme.getPage(start); + + long endPos; + + for (long pos = start; pos < end; pos = endPos) { + endPos = partitioningScheme.getPageEnd(pos, end); + + int sOff = partitioningScheme.getOffset(pos); + int eOff = partitioningScheme.getEndOffset(start, endPos); + + pages[page++].set(sOff, eOff, buffer, bufferStart); + + bufferStart += eOff - sOff; + } + } +} diff --git a/marginalia_nu/src/main/java/nu/marginalia/util/array/page/IntArrayPage.java b/marginalia_nu/src/main/java/nu/marginalia/util/array/page/IntArrayPage.java new file mode 100644 index 00000000..b2270c8c --- /dev/null +++ b/marginalia_nu/src/main/java/nu/marginalia/util/array/page/IntArrayPage.java @@ -0,0 +1,120 @@ +package nu.marginalia.util.array.page; + +import com.upserve.uppend.blobs.NativeIO; +import nu.marginalia.util.array.IntArray; + +import java.io.IOException; +import java.nio.ByteBuffer; +import java.nio.IntBuffer; +import java.nio.MappedByteBuffer; +import java.nio.channels.FileChannel; +import java.nio.file.Files; +import java.nio.file.OpenOption; +import java.nio.file.Path; +import java.nio.file.StandardOpenOption; + +public class IntArrayPage implements PartitionPage, IntArray { + + final IntBuffer intBuffer; + final ByteBuffer byteBuffer; + + private IntArrayPage(ByteBuffer byteBuffer) { + this.byteBuffer = byteBuffer; + this.intBuffer = byteBuffer.asIntBuffer(); + } + + public static IntArrayPage onHeap(int size) { + return new IntArrayPage(ByteBuffer.allocateDirect(WORD_SIZE*size)); + } + + public static IntArrayPage fromMmapReadOnly(Path file, long offset, int size) throws IOException { + return new IntArrayPage(mmapFile(file, offset, size, FileChannel.MapMode.READ_ONLY, StandardOpenOption.READ)); + } + + public static IntArrayPage fromMmapReadWrite(Path file, long offset, int size) throws IOException { + return new IntArrayPage(mmapFile(file, offset, size, FileChannel.MapMode.READ_WRITE, StandardOpenOption.READ, StandardOpenOption.WRITE, StandardOpenOption.CREATE)); + } + + private static ByteBuffer mmapFile(Path file, long offset, int size, FileChannel.MapMode mode, OpenOption... 
openOptions) throws IOException { + try (var channel = (FileChannel) Files.newByteChannel(file, openOptions)) { + return channel.map(mode, WORD_SIZE*offset, (long) size*WORD_SIZE); + } + catch (IOException ex) { + throw new IOException("Failed to map file " + file + " (" + offset + ":" + size + ")", ex); + } + } + + + @Override + public int get(long at) { + return intBuffer.get((int) at); + } + + @Override + public void get(long start, long end, int[] buffer) { + intBuffer.get((int) start, buffer, 0, (int) (end - start)); + } + + @Override + public void set(long at, int val) { + intBuffer.put((int) at, val); + } + + @Override + public void set(long start, long end, IntBuffer buffer, int bufferStart) { + intBuffer.put((int) start, buffer, bufferStart, (int) (end-start)); + } + + @Override + public long size() { + return intBuffer.capacity(); + } + + public void increment(int at) { + set(at, get(at) + 1); + } + + @Override + public ByteBuffer getByteBuffer() { + return byteBuffer; + } + + @Override + public void write(Path filename) throws IOException { + try (var channel = (FileChannel) Files.newByteChannel(filename, StandardOpenOption.WRITE, StandardOpenOption.CREATE)) { + write(channel); + } + } + + @Override + public void force() { + if (byteBuffer instanceof MappedByteBuffer mb) { + mb.force(); + } + } + + @Override + public void transferFrom(FileChannel source, long sourceStart, long arrayStart, long arrayEnd) throws IOException { + + int index = (int) (arrayStart * WORD_SIZE); + int length = (int) ((arrayEnd - arrayStart) * WORD_SIZE); + + var slice = byteBuffer.slice(index, length); + + long startPos = sourceStart * WORD_SIZE; + while (slice.position() < slice.capacity()) { + source.read(slice, startPos + slice.position()); + } + } + + @Override + public void advice(NativeIO.Advice advice) throws IOException { + NativeIO.madvise((MappedByteBuffer) byteBuffer, advice); + } + + @Override + public void advice(NativeIO.Advice advice, long start, long end) throws IOException { + NativeIO.madviseRange((MappedByteBuffer) byteBuffer, advice, (int) start, (int) (end-start)); + } + +} diff --git a/marginalia_nu/src/main/java/nu/marginalia/util/array/page/LongArrayPage.java b/marginalia_nu/src/main/java/nu/marginalia/util/array/page/LongArrayPage.java new file mode 100644 index 00000000..ed9e3c96 --- /dev/null +++ b/marginalia_nu/src/main/java/nu/marginalia/util/array/page/LongArrayPage.java @@ -0,0 +1,135 @@ +package nu.marginalia.util.array.page; + +import com.upserve.uppend.blobs.NativeIO; +import nu.marginalia.util.array.LongArray; +import nu.marginalia.util.array.trace.ArrayTrace; + +import java.io.IOException; +import java.nio.ByteBuffer; +import java.nio.LongBuffer; +import java.nio.MappedByteBuffer; +import java.nio.channels.FileChannel; +import java.nio.file.Files; +import java.nio.file.OpenOption; +import java.nio.file.Path; +import java.nio.file.StandardOpenOption; + +public class LongArrayPage implements PartitionPage, LongArray { + + final ArrayTrace trace = ArrayTrace.get(this); + + final LongBuffer longBuffer; + final ByteBuffer byteBuffer; + + private LongArrayPage(ByteBuffer byteBuffer) { + this.byteBuffer = byteBuffer; + this.longBuffer = byteBuffer.asLongBuffer(); + } + + public static LongArrayPage onHeap(int size) { + return new LongArrayPage(ByteBuffer.allocateDirect(WORD_SIZE*size)); + } + + public static LongArrayPage fromMmapReadOnly(Path file, long offset, int size) throws IOException { + return new LongArrayPage(mmapFile(file, offset, size, 
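
IntArrayPage above backs an array with either a direct buffer or an mmapped file region; the mapping outlives the channel that created it, and force() flushes dirty pages back to disk. A sketch (the file name and sizes are illustrative):

    import java.io.IOException;
    import java.nio.file.Files;
    import java.nio.file.Path;
    import nu.marginalia.util.array.page.IntArrayPage;

    class MmapSketch {
        public static void main(String[] args) throws IOException {
            Path file = Files.createTempFile("page", ".dat");

            // maps 1024 ints (4 KiB) at word offset 0, growing the file as needed
            IntArrayPage page = IntArrayPage.fromMmapReadWrite(file, 0, 1024);

            page.fill(0, 1024, -1);
            page.set(10, 42);
            page.force();  // flush the dirty mapping to disk

            System.out.println(page.get(10));  // 42
        }
    }
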
FileChannel.MapMode.READ_ONLY, StandardOpenOption.READ)); + } + + public static LongArrayPage fromMmapReadWrite(Path file, long offset, int size) throws IOException { + return new LongArrayPage(mmapFile(file, offset, size, FileChannel.MapMode.READ_WRITE, StandardOpenOption.READ, StandardOpenOption.WRITE, StandardOpenOption.CREATE)); + } + + private static ByteBuffer mmapFile(Path file, long offset, int size, FileChannel.MapMode mode, OpenOption... openOptions) throws IOException { + try (var channel = (FileChannel) Files.newByteChannel(file, openOptions)) { + return channel.map(mode, WORD_SIZE*offset, (long) size*WORD_SIZE); + } + catch (IOException ex) { + throw new IOException("Failed to map file " + file + " (" + offset + ":" + size + ")", ex); + } + } + + @Override + public long get(long at) { + try { + trace.touch(at); + + return longBuffer.get((int) at); + } + catch (IndexOutOfBoundsException ex) { + throw new IndexOutOfBoundsException("@" + at + "(" + 0 + ":" + longBuffer.capacity() + ")"); + } + } + + @Override + public void get(long start, long end, long[] buffer) { + trace.touch(start, end); + + longBuffer.get((int) start, buffer, 0, (int) (end - start)); + } + + @Override + public void set(long at, long val) { + trace.touch(at); + + longBuffer.put((int) at, val); + } + + @Override + public void set(long start, long end, LongBuffer buffer, int bufferStart) { + longBuffer.put((int) start, buffer, bufferStart, (int) (end-start)); + } + + @Override + public long size() { + return longBuffer.capacity(); + } + + public void increment(int at) { + set(at, get(at) + 1); + } + + @Override + public ByteBuffer getByteBuffer() { + return byteBuffer; + } + + @Override + public void write(Path filename) throws IOException { + try (var channel = (FileChannel) Files.newByteChannel(filename, StandardOpenOption.WRITE, StandardOpenOption.CREATE)) { + write(channel); + } + } + + @Override + public void force() { + if (byteBuffer instanceof MappedByteBuffer mb) { + mb.force(); + } + } + + @Override + public void transferFrom(FileChannel source, long sourceStart, long arrayStart, long arrayEnd) throws IOException { + + trace.touch(arrayStart, arrayEnd); + + int index = (int) (arrayStart * WORD_SIZE); + int length = (int) ((arrayEnd - arrayStart) * WORD_SIZE); + + var slice = byteBuffer.slice(index, length); + + long startPos = sourceStart * WORD_SIZE; + while (slice.position() < slice.capacity()) { + source.read(slice, startPos + slice.position()); + } + } + + @Override + public void advice(NativeIO.Advice advice) throws IOException { + NativeIO.madvise((MappedByteBuffer) byteBuffer, advice); + } + + @Override + public void advice(NativeIO.Advice advice, long start, long end) throws IOException { + NativeIO.madviseRange((MappedByteBuffer) byteBuffer, advice, (int) start, (int) (end-start)); + } + +} diff --git a/marginalia_nu/src/main/java/nu/marginalia/util/array/page/PagingIntArray.java b/marginalia_nu/src/main/java/nu/marginalia/util/array/page/PagingIntArray.java new file mode 100644 index 00000000..9fdbd21d --- /dev/null +++ b/marginalia_nu/src/main/java/nu/marginalia/util/array/page/PagingIntArray.java @@ -0,0 +1,330 @@ +package nu.marginalia.util.array.page; + +import com.upserve.uppend.blobs.NativeIO; +import nu.marginalia.util.array.IntArray; +import nu.marginalia.util.array.buffer.IntQueryBuffer; +import nu.marginalia.util.array.delegate.ReferenceImplIntArrayDelegate; +import nu.marginalia.util.array.functional.IntBinaryIOOperation; +import 
nu.marginalia.util.array.functional.IntIOTransformer;
+import nu.marginalia.util.array.functional.IntTransformer;
+import nu.marginalia.util.array.functional.LongIntConsumer;
+import nu.marginalia.util.array.functor.IntIOFolder;
+import nu.marginalia.util.array.scheme.ArrayPartitioningScheme;
+
+import java.io.IOException;
+import java.nio.IntBuffer;
+import java.nio.channels.FileChannel;
+import java.nio.file.Files;
+import java.nio.file.Path;
+import java.nio.file.StandardOpenOption;
+
+public class PagingIntArray extends AbstractPagingArray<IntArrayPage, IntBuffer> implements IntArray {
+    private final ReferenceImplIntArrayDelegate defaults;
+
+    private PagingIntArray(ArrayPartitioningScheme partitioningScheme,
+                           IntArrayPage[] pages,
+                           long size) {
+        super(partitioningScheme, pages, size);
+
+        defaults = new ReferenceImplIntArrayDelegate(this);
+    }
+
+    public static IntArray newOnHeap(ArrayPartitioningScheme partitioningScheme, long cardinality) {
+        if (cardinality < MAX_CONTINUOUS_SIZE) {
+            return IntArrayPage.onHeap((int) cardinality);
+        }
+
+        return newPartitionedOnHeap(partitioningScheme, cardinality);
+    }
+
+    public static IntArray newPartitionedOnHeap(ArrayPartitioningScheme partitioningScheme, long cardinality) {
+
+        IntArrayPage[] pages = new IntArrayPage[partitioningScheme.getPartitions(cardinality)];
+
+        for (int i = 0; i < pages.length; i++) {
+            pages[i] = IntArrayPage.onHeap(partitioningScheme.getRequiredPageSize(i, cardinality));
+        }
+
+        return new PagingIntArray(partitioningScheme, pages, cardinality);
+    }
+
+    public static PagingIntArray mapFileReadOnly(ArrayPartitioningScheme partitioningScheme, Path file)
+            throws IOException
+    {
+        long sizeBytes = Files.size(file);
+        assert sizeBytes % WORD_SIZE == 0;
+
+        long size = sizeBytes / WORD_SIZE;
+
+        IntArrayPage[] pages = new IntArrayPage[partitioningScheme.getPartitions(size)];
+        long offset = 0;
+        for (int i = 0; i < pages.length; i++) {
+            int partitionSize = partitioningScheme.getRequiredPageSize(i, size);
+            pages[i] = IntArrayPage.fromMmapReadOnly(file, offset, partitionSize);
+            offset += partitionSize;
+        }
+
+        return new PagingIntArray(partitioningScheme, pages, size);
+    }
+
+
+    public static PagingIntArray mapFileReadWrite(ArrayPartitioningScheme partitioningScheme, Path file)
+            throws IOException
+    {
+        long sizeBytes = Files.size(file);
+        assert sizeBytes % WORD_SIZE == 0;
+
+        long size = sizeBytes / WORD_SIZE;
+
+        IntArrayPage[] pages = new IntArrayPage[partitioningScheme.getPartitions(size)];
+        long offset = 0;
+        for (int i = 0; i < pages.length; i++) {
+            int partitionSize = partitioningScheme.getRequiredPageSize(i, size);
+            pages[i] = IntArrayPage.fromMmapReadWrite(file, offset, partitionSize);
+            offset += partitionSize;
+        }
+
+        return new PagingIntArray(partitioningScheme, pages, size);
+    }
+
+    public static PagingIntArray mapFileReadWrite(ArrayPartitioningScheme partitioningScheme, Path file, long size)
+            throws IOException
+    {
+        IntArrayPage[] pages = new IntArrayPage[partitioningScheme.getPartitions(size)];
+        long offset = 0;
+        for (int i = 0; i < pages.length; i++) {
+            int partitionSize = partitioningScheme.getRequiredPageSize(i, size);
+            pages[i] = IntArrayPage.fromMmapReadWrite(file, offset, partitionSize);
+            offset += partitionSize;
+        }
+
+        return new PagingIntArray(partitioningScheme, pages, size);
+    }
+
+    public int get(long pos) {
+        int page = partitioningScheme.getPage(pos);
+        int offset = partitioningScheme.getOffset(pos);
+
+        try {
+            return pages[page].get(partitioningScheme.getOffset(pos));
+        }
+        
catch (IndexOutOfBoundsException ex) { + throw new IndexOutOfBoundsException("Index out of bounds for " + pos + " => (" + page + ":" + offset + ")"); + } + } + + @Override + public void get(long start, long end, int[] buffer) { + if (partitioningScheme.isSamePage(start, end)) { + int sOff = partitioningScheme.getOffset(start); + int eOff = partitioningScheme.getEndOffset(start, end); + + pages[partitioningScheme.getPage(start)].get(sOff, eOff, buffer); + } + else { + defaults.get(start, end, buffer); + } + } + + @Override + public void set(long pos, int value) { + int page = partitioningScheme.getPage(pos); + int offset = partitioningScheme.getOffset(pos); + try { + pages[page].set(offset, value); + } + catch (IndexOutOfBoundsException ex) { + throw new IndexOutOfBoundsException("Index out of bounds for " + pos + " => (" + page + ":" + offset + ")"); + } + } + + @Override + public long size() { + return size; + } + + @Override + public void increment(long pos) { + int page = partitioningScheme.getPage(pos); + int offset = partitioningScheme.getOffset(pos); + + try { + pages[page].increment(partitioningScheme.getOffset(pos)); + } + catch (IndexOutOfBoundsException ex) { + throw new IndexOutOfBoundsException("Index out of bounds for " + pos + " => (" + page + ":" + offset + ")"); + } + } + + @Override + public void forEach(long start, long end, LongIntConsumer consumer) { + delegateToEachPage(start, end, (page, s, e) -> page.forEach(s, e, consumer)); + } + + @Override + public void fill(long fromIndex, long toIndex, int value) { + if (partitioningScheme.isSamePage(fromIndex, toIndex)) { + + int sOff = partitioningScheme.getOffset(fromIndex); + int eOff = partitioningScheme.getEndOffset(fromIndex, toIndex); + + pages[partitioningScheme.getPage(fromIndex)].fill(sOff, eOff, value); + } + else if (toIndex >= fromIndex) { + delegateToEachPage(fromIndex, toIndex, (page, s, e) -> page.fill(s, e, value)); + } + } + + @Override + public void transformEach(long start, long end, IntTransformer transformer) { + delegateToEachPage(start, end, (page, s, e) -> page.transformEach(s, e, transformer)); + } + + @Override + public void transformEachIO(long start, long end, IntIOTransformer transformer) throws IOException { + delegateToEachPageIO(start, end, (page, s, e) -> page.transformEachIO(s, e, transformer)); + } + + @Override + public int foldIO(int zero, long start, long end, IntBinaryIOOperation operator) throws IOException { + var folder = new IntIOFolder(zero, operator); + + delegateToEachPageIO(start, end, folder); + + return folder.acc; + } + + @Override + public long linearSearch(int key, long fromIndex, long toIndex) { + if (partitioningScheme.isSamePage(fromIndex, toIndex)) { + + int sOff = partitioningScheme.getOffset(fromIndex); + int eOff = partitioningScheme.getEndOffset(fromIndex, toIndex); + + long ret = pages[partitioningScheme.getPage(fromIndex)].linearSearch(key, sOff, eOff); + + return translateSearchResultsFromPage(fromIndex, ret); + } + else { + return defaults.linearSearch(key, fromIndex, toIndex); + } + } + + @Override + public long binarySearch(int key, long fromIndex, long toIndex) { + if (partitioningScheme.isSamePage(fromIndex, toIndex)) { + + int sOff = partitioningScheme.getOffset(fromIndex); + int eOff = partitioningScheme.getEndOffset(fromIndex, toIndex); + + long ret = pages[partitioningScheme.getPage(fromIndex)].binarySearch(key, sOff, eOff); + + return translateSearchResultsFromPage(fromIndex, ret); + } + else { + return defaults.binarySearch(key, fromIndex, toIndex); + } 
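
PagingIntArray routes every range operation through the same split: when a range stays on one page it runs the page-local implementation and translates the result back to a global index, otherwise it falls back to the ReferenceImplIntArrayDelegate, which walks the interface's element-by-element defaults. A sketch; note that forPartitionSize is a hypothetical factory standing in for however ArrayPartitioningScheme instances are actually obtained:

    import nu.marginalia.util.array.IntArray;
    import nu.marginalia.util.array.page.PagingIntArray;
    import nu.marginalia.util.array.scheme.ArrayPartitioningScheme;

    class PagedSearchSketch {
        public static void main(String[] args) {
            // hypothetical factory; substitute the real scheme constructor
            ArrayPartitioningScheme scheme = ArrayPartitioningScheme.forPartitionSize(64);

            IntArray array = PagingIntArray.newPartitionedOnHeap(scheme, 256);
            for (int i = 0; i < 256; i++)
                array.set(i, 2 * i);

            System.out.println(array.binarySearch(100, 0, 64));   // same page: fast path, 50
            System.out.println(array.binarySearch(100, 0, 256));  // spans pages: delegate, 50
        }
    }
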
+ } + + @Override + public long binarySearchUpperBound(int key, long fromIndex, long toIndex) { + if (partitioningScheme.isSamePage(fromIndex, toIndex)) { + int sOff = partitioningScheme.getOffset(fromIndex); + int eOff = partitioningScheme.getEndOffset(fromIndex, toIndex); + + long ret = pages[partitioningScheme.getPage(fromIndex)].binarySearchUpperBound(key, sOff, eOff); + + return translateSearchResultsFromPage(fromIndex, ret); + } + else { + return defaults.binarySearchUpperBound(key, fromIndex, toIndex); + } + } + + @Override + public void retain(IntQueryBuffer buffer, long boundary, long searchStart, long searchEnd) { + if (partitioningScheme.isSamePage(searchStart, searchEnd)) { + int sOff = partitioningScheme.getOffset(searchStart); + int eOff = partitioningScheme.getEndOffset(searchStart, searchEnd); + + if (eOff > sOff) { + pages[partitioningScheme.getPage(searchStart)].retain(buffer, boundary, sOff, eOff); + } + } + else { + defaults.retain(buffer, boundary, searchStart, searchEnd); + } + } + + + @Override + public void reject(IntQueryBuffer buffer, long boundary, long searchStart, long searchEnd) { + if (partitioningScheme.isSamePage(searchStart, searchEnd)) { + int sOff = partitioningScheme.getOffset(searchStart); + int eOff = partitioningScheme.getEndOffset(searchStart, searchEnd); + + if (eOff > sOff) { + pages[partitioningScheme.getPage(searchStart)].reject(buffer, boundary, sOff, eOff); + } + } + else { + defaults.reject(buffer, boundary, searchStart, searchEnd); + } + } + + public void write(Path fileName) throws IOException { + try (var channel = (FileChannel) Files.newByteChannel(fileName, StandardOpenOption.CREATE, StandardOpenOption.WRITE)) { + for (int i = 0; i < pages.length; i++) { + pages[i].write(channel); + } + channel.force(false); + } + } + + public long getSize() { + if (size < 0) { + throw new UnsupportedOperationException(); + } + return size; + } + + @Override + public void force() { + for (var page : pages) { + page.force(); + } + } + + @Override + public void advice(NativeIO.Advice advice) throws IOException { + for (var page : pages) { + page.advice(advice); + } + } + + @Override + public void advice(NativeIO.Advice advice, long start, long end) throws IOException { + delegateToEachPageIO(start, end, (a,s,e) -> a.advice(advice, s, e)); + } + + + public void transferFrom(FileChannel source, long sourceStart, long arrayStart, long arrayEnd) throws IOException { + assert arrayEnd >= arrayStart; + + int page = partitioningScheme.getPage(arrayStart); + + long endPos; + + for (long pos = arrayStart; pos < arrayEnd; pos = endPos) { + endPos = partitioningScheme.getPageEnd(pos, arrayEnd); + + int sOff = partitioningScheme.getOffset(pos); + int eOff = partitioningScheme.getEndOffset(pos, endPos); + + pages[page++].transferFrom(source, sourceStart, sOff, eOff); + + sourceStart+=(endPos - pos); + } + } + +} diff --git a/marginalia_nu/src/main/java/nu/marginalia/util/array/page/PagingLongArray.java b/marginalia_nu/src/main/java/nu/marginalia/util/array/page/PagingLongArray.java new file mode 100644 index 00000000..e7f0a983 --- /dev/null +++ b/marginalia_nu/src/main/java/nu/marginalia/util/array/page/PagingLongArray.java @@ -0,0 +1,498 @@ +package nu.marginalia.util.array.page; + +import com.upserve.uppend.blobs.NativeIO; +import nu.marginalia.util.array.LongArray; +import nu.marginalia.util.array.buffer.LongQueryBuffer; +import nu.marginalia.util.array.delegate.ReferenceImplLongArrayDelegate; +import nu.marginalia.util.array.functional.LongBinaryIOOperation; 
+import nu.marginalia.util.array.functional.LongIOTransformer;
+import nu.marginalia.util.array.functional.LongLongConsumer;
+import nu.marginalia.util.array.functional.LongTransformer;
+import nu.marginalia.util.array.functor.LongIOFolder;
+import nu.marginalia.util.array.scheme.ArrayPartitioningScheme;
+
+import java.io.IOException;
+import java.nio.LongBuffer;
+import java.nio.channels.FileChannel;
+import java.nio.file.Files;
+import java.nio.file.Path;
+import java.nio.file.StandardOpenOption;
+
+public class PagingLongArray extends AbstractPagingArray<LongArrayPage, LongBuffer> implements LongArray {
+    private final ReferenceImplLongArrayDelegate defaults;
+
+    private PagingLongArray(ArrayPartitioningScheme partitioningScheme, LongArrayPage[] pages, long size) {
+        super(partitioningScheme, pages, size);
+        defaults = new ReferenceImplLongArrayDelegate(this);
+    }
+
+    public static LongArray newOnHeap(ArrayPartitioningScheme partitioningScheme, long cardinality) {
+        return newPartitionedOnHeap(partitioningScheme, cardinality);
+    }
+
+    public static LongArray newPartitionedOnHeap(ArrayPartitioningScheme partitioningScheme, long cardinality) {
+        LongArrayPage[] pages = new LongArrayPage[partitioningScheme.getPartitions(cardinality)];
+
+        for (int i = 0; i < pages.length; i++) {
+            pages[i] = LongArrayPage.onHeap(partitioningScheme.getRequiredPageSize(i, cardinality));
+        }
+
+        return new PagingLongArray(partitioningScheme, pages, cardinality);
+    }
+
+    public static PagingLongArray mapFileReadOnly(ArrayPartitioningScheme partitioningScheme, Path file)
+            throws IOException
+    {
+        long sizeBytes = Files.size(file);
+        assert sizeBytes % WORD_SIZE == 0;
+
+        long size = sizeBytes / WORD_SIZE;
+
+        LongArrayPage[] pages = new LongArrayPage[partitioningScheme.getPartitions(size)];
+        long offset = 0;
+        for (int i = 0; i < pages.length; i++) {
+            int partitionSize = partitioningScheme.getRequiredPageSize(i, size);
+            pages[i] = LongArrayPage.fromMmapReadOnly(file, offset, partitionSize);
+            offset += partitionSize;
+        }
+
+        return new PagingLongArray(partitioningScheme, pages, size);
+    }
+
+    public static PagingLongArray mapFileReadWrite(ArrayPartitioningScheme partitioningScheme, Path file)
+            throws IOException
+    {
+        long sizeBytes = Files.size(file);
+        assert sizeBytes % WORD_SIZE == 0;
+
+        long size = sizeBytes / WORD_SIZE;
+
+        LongArrayPage[] pages = new LongArrayPage[partitioningScheme.getPartitions(size)];
+        long offset = 0;
+        for (int i = 0; i < pages.length; i++) {
+            int partitionSize = partitioningScheme.getRequiredPageSize(i, size);
+            pages[i] = LongArrayPage.fromMmapReadWrite(file, offset, partitionSize);
+            offset += partitionSize;
+        }
+
+        return new PagingLongArray(partitioningScheme, pages, size);
+    }
+
+    public static PagingLongArray mapFileReadWrite(ArrayPartitioningScheme partitioningScheme, Path file, long size)
+            throws IOException
+    {
+        LongArrayPage[] pages = new LongArrayPage[partitioningScheme.getPartitions(size)];
+        long offset = 0;
+        for (int i = 0; i < pages.length; i++) {
+            int partitionSize = partitioningScheme.getRequiredPageSize(i, size);
+            pages[i] = LongArrayPage.fromMmapReadWrite(file, offset, partitionSize);
+            offset += partitionSize;
+        }
+
+        return new PagingLongArray(partitioningScheme, pages, size);
+    }
+
+    @Override
+    public long get(long pos) {
+        int page = partitioningScheme.getPage(pos);
+        int offset = partitioningScheme.getOffset(pos);
+
+        try {
+            return pages[page].get(partitioningScheme.getOffset(pos));
+        }
+        catch (IndexOutOfBoundsException ex) {
+            throw new 
IndexOutOfBoundsException("Index out of bounds for " + pos + " => (" + page + ":" + offset + ")"); + } + } + + @Override + public void get(long start, long end, long[] buffer) { + if (partitioningScheme.isSamePage(start, end)) { + int sOff = partitioningScheme.getOffset(start); + int eOff = partitioningScheme.getEndOffset(start, end); + + pages[partitioningScheme.getPage(start)].get(sOff, eOff, buffer); + } + else { + defaults.get(start, end, buffer); + } + } + + @Override + public void set(long pos, long value) { + int page = partitioningScheme.getPage(pos); + int offset = partitioningScheme.getOffset(pos); + try { + pages[page].set(offset, value); + } + catch (IndexOutOfBoundsException ex) { + throw new IndexOutOfBoundsException("Index out of bounds for " + pos + " => (" + page + ":" + offset + ")"); + } + } + + @Override + public long size() { + return size; + } + + @Override + public void increment(long pos) { + int page = partitioningScheme.getPage(pos); + int offset = partitioningScheme.getOffset(pos); + + try { + pages[page].increment(partitioningScheme.getOffset(pos)); + } + catch (IndexOutOfBoundsException ex) { + throw new IndexOutOfBoundsException("Index out of bounds for " + pos + " => (" + page + ":" + offset + ")"); + } + } + + @Override + public void forEach(long start, long end, LongLongConsumer transformer) { + delegateToEachPage(start, end, (page, s, e) -> page.forEach(s, e, transformer)); + } + + @Override + public void fill(long fromIndex, long toIndex, long value) { + if (partitioningScheme.isSamePage(fromIndex, toIndex)) { + + int sOff = partitioningScheme.getOffset(fromIndex); + int eOff = partitioningScheme.getEndOffset(fromIndex, toIndex); + + pages[partitioningScheme.getPage(fromIndex)].fill(sOff, eOff, value); + } + else { + delegateToEachPage(fromIndex, toIndex, (page, s, e) -> page.fill(s, e, value)); + } + } + + @Override + public void transformEach(long start, long end, LongTransformer transformer) { + delegateToEachPage(start, end, (page, s, e) -> page.transformEach(s, e, transformer)); + } + + @Override + public void transformEachIO(long start, long end, LongIOTransformer transformer) throws IOException { + delegateToEachPageIO(start, end, (page, s, e) -> page.transformEachIO(s, e, transformer)); + } + + @Override + public long foldIO(long zero, long start, long end, LongBinaryIOOperation operator) throws IOException { + var folder = new LongIOFolder(zero, operator); + + delegateToEachPageIO(start, end, folder); + + return folder.acc; + } + + @Override + public long linearSearch(long key, long fromIndex, long toIndex) { + if (partitioningScheme.isSamePage(fromIndex, toIndex)) { + + int sOff = partitioningScheme.getOffset(fromIndex); + int eOff = partitioningScheme.getEndOffset(fromIndex, toIndex); + + long ret = pages[partitioningScheme.getPage(fromIndex)].linearSearch(key, sOff, eOff); + + return translateSearchResultsFromPage(fromIndex, ret); + } + else { + return defaults.linearSearch(key, fromIndex, toIndex); + } + } + + @Override + public long linearSearchN(int sz, long key, long fromIndex, long toIndex) { + if (partitioningScheme.isSamePage(fromIndex, toIndex)) { + int sOff = partitioningScheme.getOffset(fromIndex); + int eOff = partitioningScheme.getEndOffset(fromIndex, toIndex); + + long ret = pages[partitioningScheme.getPage(fromIndex)].linearSearchN(sz, key, sOff, eOff); + + return translateSearchResultsFromPage(fromIndex, ret); + } + else { + return defaults.linearSearchN(sz, key, fromIndex, toIndex); + } + } + + @Override + public long 
binarySearch(long key, long fromIndex, long toIndex) { + if (partitioningScheme.isSamePage(fromIndex, toIndex)) { + int sOff = partitioningScheme.getOffset(fromIndex); + int eOff = partitioningScheme.getEndOffset(fromIndex, toIndex); + + long ret = pages[partitioningScheme.getPage(fromIndex)].binarySearch(key, sOff, eOff); + + return translateSearchResultsFromPage(fromIndex, ret); + } + else { + return defaults.binarySearch(key, fromIndex, toIndex); + } + } + @Override + public long binarySearchN(int sz, long key, long fromIndex, long toIndex) { + if (partitioningScheme.isSamePage(fromIndex, toIndex)) { + int sOff = partitioningScheme.getOffset(fromIndex); + int eOff = partitioningScheme.getEndOffset(fromIndex, toIndex); + + long ret = pages[partitioningScheme.getPage(fromIndex)].binarySearchN(sz, key, sOff, eOff); + + return translateSearchResultsFromPage(fromIndex, ret); + } + else { + return defaults.binarySearchN(sz, key, fromIndex, toIndex); + } + } + @Override + public long binarySearchUpperBound(long key, long fromIndex, long toIndex) { + if (partitioningScheme.isSamePage(fromIndex, toIndex)) { + int sOff = partitioningScheme.getOffset(fromIndex); + int eOff = partitioningScheme.getEndOffset(fromIndex, toIndex); + + long ret = pages[partitioningScheme.getPage(fromIndex)].binarySearchUpperBound(key, sOff, eOff); + + return translateSearchResultsFromPage(fromIndex, ret); + } + else { + return defaults.binarySearchUpperBound(key, fromIndex, toIndex); + } + } + + @Override + public long linearSearchUpperBound(long key, long fromIndex, long toIndex) { + if (partitioningScheme.isSamePage(fromIndex, toIndex)) { + int sOff = partitioningScheme.getOffset(fromIndex); + int eOff = partitioningScheme.getEndOffset(fromIndex, toIndex); + + long ret = pages[partitioningScheme.getPage(fromIndex)].linearSearchUpperBound(key, sOff, eOff); + + return translateSearchResultsFromPage(fromIndex, ret); + } + else { + return defaults.linearSearchUpperBound(key, fromIndex, toIndex); + } + } + @Override + public long binarySearchUpperBoundN(int sz, long key, long fromIndex, long toIndex) { + if (partitioningScheme.isSamePage(fromIndex, toIndex)) { + int sOff = partitioningScheme.getOffset(fromIndex); + int eOff = partitioningScheme.getEndOffset(fromIndex, toIndex); + + long ret = pages[partitioningScheme.getPage(fromIndex)].binarySearchUpperBoundN(sz, key, sOff, eOff); + + return translateSearchResultsFromPage(fromIndex, ret); + } + else { + return defaults.binarySearchUpperBoundN(sz, key, fromIndex, toIndex); + } + } + + @Override + public void retain(LongQueryBuffer buffer, long boundary, long searchStart, long searchEnd) { + if (partitioningScheme.isSamePage(searchStart, searchEnd)) { + int sOff = partitioningScheme.getOffset(searchStart); + int eOff = partitioningScheme.getEndOffset(searchStart, searchEnd); + + if (eOff > sOff) { + pages[partitioningScheme.getPage(searchStart)].retain(buffer, boundary, sOff, eOff); + } + } + else { + defaults.retain(buffer, boundary, searchStart, searchEnd); + } + } + + @Override + public void retainN(LongQueryBuffer buffer, int sz, long boundary, long searchStart, long searchEnd) { + if (partitioningScheme.isSamePage(searchStart, searchEnd)) { + int sOff = partitioningScheme.getOffset(searchStart); + int eOff = partitioningScheme.getEndOffset(searchStart, searchEnd); + + if (eOff > sOff) { + pages[partitioningScheme.getPage(searchStart)].retainN(buffer, sz, boundary, sOff, eOff); + } + } + else { + defaults.retainN(buffer, sz, boundary, searchStart, searchEnd); + } + } + 
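The query-buffer and sorting operations that follow all use the same dispatch pattern as the searches above: a fast path when the range falls within a single page, and a fall-back to the reference-implementation delegate when it straddles a page boundary. A minimal usage sketch (not part of the patch; the partition size and values are arbitrary choices) shows the paged array behaving like one contiguous long[]:

    import nu.marginalia.util.array.LongArray;
    import nu.marginalia.util.array.page.PagingLongArray;
    import nu.marginalia.util.array.scheme.ArrayPartitioningScheme;

    class PagingLongArrayExample {
        public static void main(String[] args) {
            var scheme = ArrayPartitioningScheme.forPartitionSize(8192);
            LongArray array = PagingLongArray.newOnHeap(scheme, 100_000);

            // fill in descending order, then sort ascending across page boundaries
            for (long i = 0; i < array.size(); i++) {
                array.set(i, array.size() - i);
            }
            array.quickSort(0, array.size());

            // a hit returns a non-negative index
            long result = array.binarySearch(500, 0, array.size());
        }
    }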
+ @Override + public void reject(LongQueryBuffer buffer, long boundary, long searchStart, long searchEnd) { + if (partitioningScheme.isSamePage(searchStart, searchEnd)) { + int sOff = partitioningScheme.getOffset(searchStart); + int eOff = partitioningScheme.getEndOffset(searchStart, searchEnd); + + if (eOff > sOff) { + pages[partitioningScheme.getPage(searchStart)].reject(buffer, boundary, sOff, eOff); + } + } + else { + defaults.reject(buffer, boundary, searchStart, searchEnd); + } + } + + @Override + public void rejectN(LongQueryBuffer buffer, int sz, long boundary, long searchStart, long searchEnd) { + if (partitioningScheme.isSamePage(searchStart, searchEnd)) { + int sOff = partitioningScheme.getOffset(searchStart); + int eOff = partitioningScheme.getEndOffset(searchStart, searchEnd); + + if (eOff > sOff) { + pages[partitioningScheme.getPage(searchStart)].rejectN(buffer, sz, boundary, sOff, eOff); + } + } + else { + defaults.rejectN(buffer, sz, boundary, searchStart, searchEnd); + } + } + + @Override + public void insertionSort(long start, long end) { + if (partitioningScheme.isSamePage(start, end)) { + int sOff = partitioningScheme.getOffset(start); + int eOff = partitioningScheme.getEndOffset(start, end); + + if (eOff > sOff) { + pages[partitioningScheme.getPage(start)].insertionSort(sOff, eOff); + } + } + else { + defaults.insertionSort(start, end); + } + } + + @Override + public void insertionSortN(int sz, long start, long end) { + if (partitioningScheme.isSamePage(start, end)) { + int sOff = partitioningScheme.getOffset(start); + int eOff = partitioningScheme.getEndOffset(start, end); + + if (eOff > sOff) { + pages[partitioningScheme.getPage(start)].insertionSortN(sz, sOff, eOff); + } + } + else { + defaults.insertionSortN(sz, start, end); + } + } + + @Override + public void quickSort(long start, long end) { + if (partitioningScheme.isSamePage(start, end)) { + int sOff = partitioningScheme.getOffset(start); + int eOff = partitioningScheme.getEndOffset(start, end); + + if (eOff > sOff) { + pages[partitioningScheme.getPage(start)].quickSort(sOff, eOff); + } + } + else { + defaults.quickSort(start, end); + } + } + + @Override + public void quickSortN(int sz, long start, long end) { + if (partitioningScheme.isSamePage(start, end)) { + int sOff = partitioningScheme.getOffset(start); + int eOff = partitioningScheme.getEndOffset(start, end); + + if (eOff > sOff) { + pages[partitioningScheme.getPage(start)].quickSortN(sz, sOff, eOff); + } + } + else { + defaults.quickSortN(sz, start, end); + } + } + + @Override + public void mergeSort(long start, long end, Path tempDir) throws IOException { + if (partitioningScheme.isSamePage(start, end)) { + int sOff = partitioningScheme.getOffset(start); + int eOff = partitioningScheme.getEndOffset(start, end); + + if (eOff > sOff) { + pages[partitioningScheme.getPage(start)].mergeSort(sOff, eOff, tempDir); + } + } + else { + defaults.mergeSort(start, end, tempDir); + } + } + + + @Override + public void mergeSortN(int sz, long start, long end, Path tempDir) throws IOException { + if (partitioningScheme.isSamePage(start, end)) { + int sOff = partitioningScheme.getOffset(start); + int eOff = partitioningScheme.getEndOffset(start, end); + + if (eOff > sOff) { + pages[partitioningScheme.getPage(start)].mergeSortN(sz, sOff, eOff, tempDir); + } + } + else { + defaults.mergeSortN(sz, start, end, tempDir); + } + } + + public void write(Path fileName) throws IOException { + try (var channel = (FileChannel) Files.newByteChannel(fileName, 
StandardOpenOption.CREATE, StandardOpenOption.WRITE)) { + for (int i = 0; i < pages.length; i++) { + pages[i].write(channel); + } + channel.force(false); + } + } + + public long getSize() { + if (size < 0) { + throw new UnsupportedOperationException(); + } + return size; + } + + @Override + public void force() { + for (var page : pages) { + page.force(); + } + } + + @Override + public void advice(NativeIO.Advice advice) throws IOException { + for (var page : pages) { + page.advice(advice); + } + } + + @Override + public void advice(NativeIO.Advice advice, long start, long end) throws IOException { + delegateToEachPageIO(start, end, (a,s,e) -> a.advice(advice, s, e)); + } + + + public void transferFrom(FileChannel source, long sourceStart, long arrayStart, long arrayEnd) throws IOException { + assert arrayEnd >= arrayStart; + + int page = partitioningScheme.getPage(arrayStart); + + long endPos; + + for (long pos = arrayStart; pos < arrayEnd; pos = endPos) { + endPos = partitioningScheme.getPageEnd(pos, arrayEnd); + + int sOff = partitioningScheme.getOffset(pos); + int eOff = partitioningScheme.getEndOffset(pos, endPos); + + pages[page++].transferFrom(source, sourceStart, sOff, eOff); + + sourceStart+=(endPos - pos); + } + } + +} diff --git a/marginalia_nu/src/main/java/nu/marginalia/util/array/page/PartitionPage.java b/marginalia_nu/src/main/java/nu/marginalia/util/array/page/PartitionPage.java new file mode 100644 index 00000000..c324157c --- /dev/null +++ b/marginalia_nu/src/main/java/nu/marginalia/util/array/page/PartitionPage.java @@ -0,0 +1,22 @@ +package nu.marginalia.util.array.page; + +import java.io.IOException; +import java.nio.ByteBuffer; +import java.nio.channels.FileChannel; + +public interface PartitionPage { + + default void write(FileChannel channel) throws IOException { + var byteBuffer = getByteBuffer(); + + byteBuffer.clear(); + + while (byteBuffer.position() < byteBuffer.limit()) { + channel.write(byteBuffer); + } + + byteBuffer.clear(); + } + + ByteBuffer getByteBuffer(); +} diff --git a/marginalia_nu/src/main/java/nu/marginalia/util/array/scheme/ArrayPartitioningScheme.java b/marginalia_nu/src/main/java/nu/marginalia/util/array/scheme/ArrayPartitioningScheme.java new file mode 100644 index 00000000..a8063a17 --- /dev/null +++ b/marginalia_nu/src/main/java/nu/marginalia/util/array/scheme/ArrayPartitioningScheme.java @@ -0,0 +1,51 @@ +package nu.marginalia.util.array.scheme; + +public interface ArrayPartitioningScheme { + + static ArrayPartitioningScheme forPartitionSize(int size) { + if (Integer.highestOneBit(size) == size) { + return new PowerOf2PartitioningScheme(size); + } + else { + return new SequentialPartitioningScheme(size); + } + } + static int getRequiredPartitions(long cardinality, int partitionSize) { + return (int) (cardinality / partitionSize + Long.signum(cardinality % partitionSize)); + } + + + int getPartitions(long cardinality); + + int getPage(long at); + + boolean isSamePage(long a, long b); + + /** Get the page offset corresponding to at */ + int getOffset(long at); + + /** Variant of getOffset that doesn't wrap around the page boundary, necessary when + * translating an exclusive end offset that getOffset(...) will translate to 0 and consider + * part of the next page. 
+ * + * It is also necessary to consider the start offset, to determine when the range + * is empty and the end offset can be translated directly. + */ + default int getEndOffset(long start, long end) { + if (end == 0 || end <= start) + return getOffset(end); + + return 1 + getOffset(end - 1); + } + + /** Get the end of the buffer containing at, or endTotal, whichever is smaller + */ + long getPageEnd(long at, long endTotal); + + /** + * toRealIndex(getPage(val), getOffset(val)) = val + */ + long toRealIndex(int buffer, int offset); + + int getRequiredPageSize(int buffer, long cardinality); +} diff --git a/marginalia_nu/src/main/java/nu/marginalia/util/array/scheme/PowerOf2PartitioningScheme.java b/marginalia_nu/src/main/java/nu/marginalia/util/array/scheme/PowerOf2PartitioningScheme.java new file mode 100644 index 00000000..20bb453e --- /dev/null +++ b/marginalia_nu/src/main/java/nu/marginalia/util/array/scheme/PowerOf2PartitioningScheme.java @@ -0,0 +1,60 @@ +package nu.marginalia.util.array.scheme; + +public class PowerOf2PartitioningScheme implements ArrayPartitioningScheme { + final int partitionSize; + final long offsetMask; + final long bufferMask; + final int pageShift; + + public PowerOf2PartitioningScheme(int partitionSize) { + assert partitionSize == Integer.highestOneBit(partitionSize); + + this.partitionSize = partitionSize; + + offsetMask = partitionSize - 1; + bufferMask = ~offsetMask; + pageShift = Integer.numberOfTrailingZeros(partitionSize); + } + + @Override + public int getPartitions(long cardinality) { + return ArrayPartitioningScheme.getRequiredPartitions(cardinality, partitionSize); + } + + @Override + public int getPage(long at) { // very hot code + return (int) (at >>> pageShift); + } + + @Override + public int getOffset(long at) { // very hot code + return (int) (at & offsetMask); + } + + @Override + public boolean isSamePage(long a, long b) { // hot code + return 0 == ((a ^ b) & bufferMask); + } + + @Override + public long getPageEnd(long at, long endTotal) { + return Math.min(endTotal, partitionSize * (1L + getPage(at))); + } + + @Override + public long toRealIndex(int buffer, int offset) { + return offset + (long) buffer * partitionSize; + } + + @Override + public int getRequiredPageSize(int buffer, long cardinality) { + + if ((long) (1 + buffer) * partitionSize <= cardinality) { + return partitionSize; + } + + return (int) (cardinality % partitionSize); + } + + +} diff --git a/marginalia_nu/src/main/java/nu/marginalia/util/array/scheme/SequentialPartitioningScheme.java b/marginalia_nu/src/main/java/nu/marginalia/util/array/scheme/SequentialPartitioningScheme.java new file mode 100644 index 00000000..19af52d1 --- /dev/null +++ b/marginalia_nu/src/main/java/nu/marginalia/util/array/scheme/SequentialPartitioningScheme.java @@ -0,0 +1,56 @@ +package nu.marginalia.util.array.scheme; + +public class SequentialPartitioningScheme implements ArrayPartitioningScheme { + + final int partitionSize; + + public SequentialPartitioningScheme(int partitionSize) { + this.partitionSize = partitionSize; + } + + public static int getRequiredPartitions(long cardinality, int partitionSize) { + return (int) (cardinality / partitionSize + Long.signum(cardinality % partitionSize)); + } + + @Override + public int getPartitions(long cardinality) { + return getRequiredPartitions(cardinality, partitionSize); + } + + @Override + public int getPage(long at) { + return (int) (at / partitionSize); + } + + public long getPageEnd(long at, long endTotal) { + return Math.min(endTotal, partitionSize * (1L + getPage(at))); + } + + + @Override +
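/* Worked example (assuming partitionSize = 512): the power-of-two scheme computes
   pageShift = 9 and offsetMask = 511, so for the global position 1300,
       getPage(1300)   = 1300 >>> 9 = 2
       getOffset(1300) = 1300 & 511 = 276
       toRealIndex(2, 276) = 276 + 2 * 512 = 1300,
   satisfying the toRealIndex invariant above. The sequential scheme here computes
   the same values with / and %, which also admits partition sizes that are not
   powers of two. */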
public boolean isSamePage(long a, long b) { + return (int) (a / partitionSize) == (int)(b/partitionSize); + } + + @Override + public int getOffset(long at) { + return (int) (at % partitionSize); + } + + public long toRealIndex(int buffer, int offset) { + return offset + (long) buffer * partitionSize; + } + + + + @Override + public int getRequiredPageSize(int buffer, long cardinality) { + + if ((long) (1 + buffer) * partitionSize <= cardinality) { + return partitionSize; + } + return (int) (cardinality % partitionSize); + } + + +} diff --git a/marginalia_nu/src/main/java/nu/marginalia/util/array/trace/ArrayTrace.java b/marginalia_nu/src/main/java/nu/marginalia/util/array/trace/ArrayTrace.java new file mode 100644 index 00000000..38e08ede --- /dev/null +++ b/marginalia_nu/src/main/java/nu/marginalia/util/array/trace/ArrayTrace.java @@ -0,0 +1,22 @@ +package nu.marginalia.util.array.trace; + +import nu.marginalia.util.array.LongArray; + +import java.nio.file.Path; +import java.util.Optional; + +public interface ArrayTrace { + void touch(long address); + void touch(long start, long end); + + FileTrace fileTrace = Optional.ofNullable(System.clearProperty("nu.marginalia.util.array.trace")).map(Path::of).map(FileTrace::new).orElseGet(FileTrace::new); + NullTrace nullTrace = new NullTrace(); + static ArrayTrace get(LongArray array) { + + if (fileTrace == null) { + return nullTrace; + } + + return fileTrace.forArray(array); + } +} diff --git a/marginalia_nu/src/main/java/nu/marginalia/util/array/trace/ArrayTraceViz.java b/marginalia_nu/src/main/java/nu/marginalia/util/array/trace/ArrayTraceViz.java new file mode 100644 index 00000000..babf727f --- /dev/null +++ b/marginalia_nu/src/main/java/nu/marginalia/util/array/trace/ArrayTraceViz.java @@ -0,0 +1,115 @@ +package nu.marginalia.util.array.trace; + +import javax.imageio.ImageIO; +import java.awt.image.BufferedImage; +import java.io.File; +import java.io.IOException; +import java.nio.file.Files; +import java.nio.file.Path; +import java.util.*; +import java.util.concurrent.atomic.AtomicInteger; + +import static java.awt.image.BufferedImage.TYPE_INT_RGB; + +public class ArrayTraceViz { + + + private static final int BLOCK_SIZE_WORDS = 512; + + public static void main(String[] args) throws IOException { + Path inputFile = Path.of("/home/vlofgren/array-trace.log"); + Map<Integer, Integer> sizes = new HashMap<>(); + Map<Integer, Set<Integer>> rows = new HashMap<>(); + + try (var lines = Files.lines(inputFile)) { + lines.map(line -> line.split("\\s")).forEach(parts -> { + int block = Integer.parseInt(parts[1]); + int start = Integer.parseInt(parts[2]); + int end = Integer.parseInt(parts[3]); + + sizes.merge(block, end, Integer::max); + + var rowSet = rows.computeIfAbsent(block, b -> new HashSet<>()); + for (int b = start; b < end; b += BLOCK_SIZE_WORDS) { + rowSet.add(b/BLOCK_SIZE_WORDS); + } + }); + } + + Map<Integer, Map<Integer, Integer>> rowToY = new HashMap<>(); + + rows.forEach((row, vals) -> { + var map = new HashMap<Integer, Integer>(vals.size()); + rowToY.put(row, map); + var list = new ArrayList<>(vals); + + list.stream().sorted().forEach(val -> map.put(val, map.size())); + }); + + Map<Integer, Integer> cols = new HashMap<>(); + sizes.keySet().forEach(key -> cols.put(key, cols.size())); + + int width = cols.size() * (BLOCK_SIZE_WORDS+4); + int height = 640; + + var bi = new BufferedImage(width, height, TYPE_INT_RGB); + + AtomicInteger iv = new AtomicInteger(); + + try (var lines = Files.lines(inputFile)) { + lines.forEach(line -> { + String[] parts = line.split("\\s"); + + long time = Long.parseLong(parts[0]); + int block =
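/* each trace line is "<nanoTime> <arrayHashCode> <start> <end>", as written by
   FileTrace.trace(...) further down in this patch; parts[1..3] identify the
   traced array and the touched address range */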
Integer.parseInt(parts[1]); + int start = Integer.parseInt(parts[2]); + int end = Integer.parseInt(parts[3]); + + for (int p = start; p < end; p++) { + int x0 = (4+BLOCK_SIZE_WORDS) * cols.get(block); + int x = x0 + (p%BLOCK_SIZE_WORDS); + int y = rowToY.get(block).get(p/BLOCK_SIZE_WORDS); + + if (y >= 640) { + continue; + } + + if (0 == bi.getRGB(x, y)) { + for (int x2 = 0; x2 < BLOCK_SIZE_WORDS; x2++) { + if (0 == bi.getRGB(x0 + x2, y)) { + bi.setRGB(x0 + x2, y, 0xC0C0C0); + } + } + } + + System.out.println(x + "," + y); + bi.setRGB(x, y, (int) (0xFFFFFFL)); + } + + try { + if ((iv.incrementAndGet() % 4) == 0) { + ImageIO.write(bi, "png", new File("/tmp/test" + (time * Long.signum(time)) + " .png")); + for (int x = 0; x < width; x++) { + for (int y = 0; y < height; y++) { + int val = bi.getRGB(x, y); + int nval = (val&0xFF) - 1; + if (nval > 64) { + bi.setRGB(x, y, nval | (nval<<8) | (nval << 16)); + } + else if ((val&0xFFFFFF) != 0) { + bi.setRGB(x, y, 64); + } + } + } + } + } catch (IOException e) { + throw new RuntimeException(e); + } + }); + } + + + } + + record ArrayPage(int id, int size) {} +} diff --git a/marginalia_nu/src/main/java/nu/marginalia/util/array/trace/FileTrace.java b/marginalia_nu/src/main/java/nu/marginalia/util/array/trace/FileTrace.java new file mode 100644 index 00000000..b1fe9c57 --- /dev/null +++ b/marginalia_nu/src/main/java/nu/marginalia/util/array/trace/FileTrace.java @@ -0,0 +1,52 @@ +package nu.marginalia.util.array.trace; + +import nu.marginalia.util.array.LongArray; + +import java.io.IOException; +import java.io.PrintStream; +import java.nio.file.Files; +import java.nio.file.Path; +import java.nio.file.StandardOpenOption; + +public class FileTrace { + PrintStream traceWriter; + static volatile boolean doTrace = false; + + public FileTrace(Path file) { + try { + traceWriter = new PrintStream(Files.newOutputStream(file, StandardOpenOption.WRITE, StandardOpenOption.CREATE, StandardOpenOption.TRUNCATE_EXISTING)); + } catch (IOException e) { + throw new IllegalStateException(e); + } + } + + public FileTrace() { + this(Path.of("/tmp/array-trace.log")); + } + + public static void setTrace(boolean val) { + doTrace = val; + } + + public void trace(int source, long start, long end) { + if (doTrace) { + traceWriter.printf("%d %d %d %d\n", System.nanoTime(), source, start, end); + } + } + + public ArrayTrace forArray(LongArray array) { + return new ArrayTrace() { + final int code = array.hashCode(); + + @Override + public void touch(long address) { + trace(code, address, address+1); + } + + @Override + public void touch(long start, long end) { + trace(code, start, end); + } + }; + } +} diff --git a/marginalia_nu/src/main/java/nu/marginalia/util/array/trace/NullTrace.java b/marginalia_nu/src/main/java/nu/marginalia/util/array/trace/NullTrace.java new file mode 100644 index 00000000..20e2125f --- /dev/null +++ b/marginalia_nu/src/main/java/nu/marginalia/util/array/trace/NullTrace.java @@ -0,0 +1,11 @@ +package nu.marginalia.util.array.trace; + +public class NullTrace implements ArrayTrace { + + @Override + public void touch(long address) {} + + @Override + public void touch(long start, long end) {} + +} diff --git a/marginalia_nu/src/main/java/nu/marginalia/util/btree/BTreeDogEar.java b/marginalia_nu/src/main/java/nu/marginalia/util/btree/BTreeDogEar.java index f21eeb9d..77a97e87 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/util/btree/BTreeDogEar.java +++ b/marginalia_nu/src/main/java/nu/marginalia/util/btree/BTreeDogEar.java @@ -1,8 +1,8 @@ package 
nu.marginalia.util.btree; +import nu.marginalia.util.array.LongArray; import nu.marginalia.util.btree.model.BTreeContext; import nu.marginalia.util.btree.model.BTreeHeader; -import nu.marginalia.util.multimap.MultimapFileLongSlice; /* * End-of-page mark that's used as a sentinel to verify that @@ -12,14 +12,16 @@ import nu.marginalia.util.multimap.MultimapFileLongSlice; */ public class BTreeDogEar { - private MultimapFileLongSlice sentinelSlice; + private LongArray sentinelSlice; - public BTreeDogEar(BTreeContext ctx, BTreeHeader header, MultimapFileLongSlice base) { + public BTreeDogEar(BTreeContext ctx, BTreeHeader header, LongArray base) { if (header.numEntries() > 3) { - sentinelSlice = base.atOffset((long) header.numEntries() * ctx.entrySize() - 3); - sentinelSlice.put(0, 4L); - sentinelSlice.put(1, 5L); - sentinelSlice.put(2, 1L); + sentinelSlice = base.range( + (long) header.numEntries() * ctx.entrySize() - 3, + (long) header.numEntries() * ctx.entrySize()); + sentinelSlice.set(0, 4L); + sentinelSlice.set(1, 5L); + sentinelSlice.set(2, 1L); } } diff --git a/marginalia_nu/src/main/java/nu/marginalia/util/btree/BTreeReader.java b/marginalia_nu/src/main/java/nu/marginalia/util/btree/BTreeReader.java index be9de7cc..f8bdd1f6 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/util/btree/BTreeReader.java +++ b/marginalia_nu/src/main/java/nu/marginalia/util/btree/BTreeReader.java @@ -1,43 +1,39 @@ package nu.marginalia.util.btree; -import it.unimi.dsi.fastutil.longs.LongLongImmutablePair; import lombok.SneakyThrows; +import nu.marginalia.util.array.LongArray; +import nu.marginalia.util.array.algo.LongArraySearch; +import nu.marginalia.util.array.buffer.LongQueryBuffer; +import nu.marginalia.util.array.delegate.ShiftedLongArray; import nu.marginalia.util.btree.model.BTreeContext; import nu.marginalia.util.btree.model.BTreeHeader; -import nu.marginalia.util.multimap.MultimapFileLong; -import nu.marginalia.util.multimap.MultimapSearcher; import static java.lang.Math.min; -public class BTreeReader { +public class BTreeReader implements BTreeReaderIf { + + private final LongArray index; + private final ShiftedLongArray data; - private final MultimapFileLong file; public final BTreeContext ctx; - - private final MultimapSearcher indexSearcher; - private final MultimapSearcher dataSearcher; private final BTreeHeader header; - public BTreeReader(MultimapFileLong file, BTreeContext ctx, BTreeHeader header) { - this.file = file; - this.indexSearcher = MultimapSearcher.forContext(file, ~0, 1); - this.dataSearcher = MultimapSearcher.forContext(file, ctx.equalityMask(), ctx.entrySize()); - - this.ctx = ctx; - this.header = header; - } - - public BTreeReader(MultimapFileLong file, BTreeContext ctx, long offset) { - this.file = file; - this.indexSearcher = MultimapSearcher.forContext(file, ~0, 1); - this.dataSearcher = MultimapSearcher.forContext(file, ctx.equalityMask(), ctx.entrySize()); + private final long dataBlockEnd; + public BTreeReader(LongArray file, BTreeContext ctx, long offset) { this.ctx = ctx; this.header = createHeader(file, offset); + + dataBlockEnd = (long) ctx.entrySize() * header.numEntries(); + index = file.range(header.indexOffsetLongs(), header.dataOffsetLongs()); + data = file.range(header.dataOffsetLongs(), header.dataOffsetLongs() + dataBlockEnd); + } - public static BTreeHeader createHeader(MultimapFileLong file, long fileOffset) { - return new BTreeHeader(file.get(fileOffset), file.get(fileOffset+1), file.get(fileOffset+2)); + public static BTreeHeader 
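/* the header occupies three consecutive longs: the layer count and entry count
   packed into the first word (layers << 32 | numEntries), followed by the index
   offset and the data offset; cf. BTreeHeader.write(...) later in this patch */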
createHeader(LongArray file, long fileOffset) { + long[] parts = new long[3]; + file.get(fileOffset, fileOffset+3, parts); + return new BTreeHeader(parts[0], parts[1], parts[2]); } public BTreeHeader getHeader() { @@ -49,7 +45,7 @@ public class BTreeReader { } @SneakyThrows - public void retainEntries(BTreeQueryBuffer buffer) { + public void retainEntries(LongQueryBuffer buffer) { if (header.layers() == 0) { BTreePointer pointer = new BTreePointer(header); while (buffer.hasMore()) { @@ -60,7 +56,7 @@ public class BTreeReader { } @SneakyThrows - public void rejectEntries(BTreeQueryBuffer buffer) { + public void rejectEntries(LongQueryBuffer buffer) { if (header.layers() == 0) { BTreePointer pointer = new BTreePointer(header); while (buffer.hasMore()) { @@ -70,13 +66,13 @@ public class BTreeReader { rejectSingle(buffer); } - private void retainSingle(BTreeQueryBuffer buffer) { + private void retainSingle(LongQueryBuffer buffer) { BTreePointer pointer = new BTreePointer(header); for (; buffer.hasMore(); pointer.resetToRoot()) { - long val = buffer.currentValue() & ctx.equalityMask(); + long val = buffer.currentValue(); if (!pointer.walkToData(val)) { buffer.rejectAndAdvance(); @@ -87,12 +83,12 @@ public class BTreeReader { } } - private void rejectSingle(BTreeQueryBuffer buffer) { + private void rejectSingle(LongQueryBuffer buffer) { BTreePointer pointer = new BTreePointer(header); for (; buffer.hasMore(); pointer.resetToRoot()) { - long val = buffer.currentValue() & ctx.equalityMask(); + long val = buffer.currentValue(); if (pointer.walkToData(val) && pointer.containsData(val)) { buffer.rejectAndAdvance(); @@ -108,31 +104,53 @@ public class BTreeReader { * * @return file offset of entry matching keyRaw, negative if absent */ - public long findEntry(final long keyRaw) { - final long key = keyRaw & ctx.equalityMask(); - + public long findEntry(final long key) { BTreePointer ip = new BTreePointer(header); while (!ip.isDataLayer()) - ip.walkToChild(key); + if (!ip.walkToChild(key)) + return -1; return ip.findData(key); } - public void readData(long[] data, int n, long pos) { - file.read(data, n, header.dataOffsetLongs() + pos); + public void readData(long[] buf, int n, long pos) { + data.get(pos, pos + n, buf); } - public long[] queryData(long[] urls, int offset) { + public long[] queryData(long[] keys, int offset) { BTreePointer pointer = new BTreePointer(header); - long[] ret = new long[urls.length]; + long[] ret = new long[keys.length]; - for (int i = 0; i < urls.length; i++, pointer.resetToRoot()) { - if (pointer.walkToData(urls[i])) { - long dataAddress = pointer.findData(urls[i]); - if (dataAddress >= 0) { - ret[i] = file.get(dataAddress + offset); + // this function could be re-written like retain() and would be + // much faster + + if (header.layers() == 0) { + long searchStart = 0; + for (int i = 0; i < keys.length; i++) { + long key = keys[i]; + searchStart = data.binarySearchN(ctx.entrySize(), key, searchStart, data.size); + if (searchStart < 0) { + searchStart = LongArraySearch.decodeSearchMiss(searchStart); + } + else { + ret[i] = data.get(searchStart + offset); + } + } + + } + else { + for (int i = 0; i < keys.length; i++) { + if (i > 0) { + pointer.resetToRoot(); + } + + if (pointer.walkToData(keys[i])) { + long dataAddress = pointer.findData(keys[i]); + if (dataAddress >= 0) { + ret[i] = data.get(dataAddress + offset); + } } } } @@ -140,25 +158,6 @@ public class BTreeReader { return ret; } - /** Find the range of values so that prefixStart <= n < prefixNext */ - public 
LongLongImmutablePair getRangeForPrefix(long prefixStart, long prefixNext) { - long lowerBoundStart = lowerBound(prefixStart); - long lowerBoundEnd = lowerBound(prefixNext); - - return new LongLongImmutablePair(lowerBoundStart, lowerBoundEnd); - } - - private long lowerBound(long key) { - key &= ctx.equalityMask(); - - BTreePointer ip = new BTreePointer(header); - - while (!ip.isDataLayer()) - ip.walkToChild(key); - - return ip.findDataLower(key); - } - private class BTreePointer { private final long[] layerOffsets; @@ -190,18 +189,13 @@ public class BTreeReader { } public boolean walkToChild(long key) { - final long indexAddress = header.indexOffsetLongs(); - final long indexLayerBlockOffset = layerOffsets[layer] + offset; + final long searchStart = layerOffsets[layer] + offset; - final long searchStart = indexAddress + indexLayerBlockOffset; - final long nextLayerOffset = (int)(indexSearcher.binarySearchLower(key, searchStart, ctx.BLOCK_SIZE_WORDS()) - searchStart); - - if (nextLayerOffset < 0) - return false; + final long nextLayerOffset = (int) index.binarySearchUpperBound(key, searchStart, searchStart + ctx.BLOCK_SIZE_WORDS()) - searchStart; layer --; - boundary = file.get(searchStart + offset); + boundary = index.get(searchStart + nextLayerOffset); offset = ctx.BLOCK_SIZE_WORDS() * (offset + nextLayerOffset); return true; @@ -225,41 +219,39 @@ public class BTreeReader { } public long findData(long key) { - if (layer > 0) { + if (layer >= 0) { throw new IllegalStateException("Looking for data in an index layer"); } - long searchStart = header.dataOffsetLongs() + offset * ctx.entrySize(); - int numEntries = min((int)(header.numEntries() - offset), ctx.BLOCK_SIZE_WORDS()); + long searchStart = offset * ctx.entrySize(); + long remainingTotal = dataBlockEnd - offset * ctx.entrySize(); + long remainingBlock; - return dataSearcher.binarySearch(key, searchStart, numEntries); + remainingBlock = (layerOffsets.length == 0) + ? 
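/* with no index layers, the entire data region is searched as a single block;
   otherwise the search is confined to the current b-tree block */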
remainingTotal + : (long) ctx.BLOCK_SIZE_WORDS() * ctx.entrySize(); + + long searchEnd = searchStart + (int) min(remainingTotal, remainingBlock); + + return data.binarySearchN(ctx.entrySize(), key, searchStart, searchEnd); } - public long findDataLower(long key) { - if (layer > 0) { - throw new IllegalStateException("Looking for data in an index layer"); - } - - long searchStart = header.dataOffsetLongs() + offset * ctx.entrySize(); - int numEntries = min((int)(header.numEntries() - offset), ctx.BLOCK_SIZE_WORDS()); - - return dataSearcher.binarySearchLower(key, searchStart, numEntries); - } - - public void retainData(BTreeQueryBuffer buffer) { + public void retainData(LongQueryBuffer buffer) { long dataOffset = findData(buffer.currentValue()); if (dataOffset >= 0) { buffer.retainAndAdvance(); - long blockBase = header.dataOffsetLongs() + offset * ctx.entrySize(); - long relOffset = dataOffset - blockBase; + if (buffer.hasMore() && buffer.currentValue() <= boundary) { + long blockBase = offset * ctx.entrySize(); + long relOffset = dataOffset - blockBase; - int numEntries = - min((int) (header.numEntries() - relOffset), ctx.BLOCK_SIZE_WORDS()) / ctx.entrySize(); + long remainingTotal = dataBlockEnd - dataOffset; + long remainingBlock = ctx.BLOCK_SIZE_WORDS() - relOffset; - if (buffer.currentValue() <= boundary) { - file.retain(buffer, boundary, dataOffset, numEntries, ctx.equalityMask(), ctx.entrySize()); + long searchEnd = dataOffset + (int) min(remainingTotal, remainingBlock); + + data.range(dataOffset, searchEnd).retainN(buffer, ctx.entrySize(), boundary); } } else { @@ -268,20 +260,22 @@ public class BTreeReader { } - public void rejectData(BTreeQueryBuffer buffer) { + public void rejectData(LongQueryBuffer buffer) { long dataOffset = findData(buffer.currentValue()); if (dataOffset >= 0) { buffer.rejectAndAdvance(); - long blockBase = header.dataOffsetLongs() + offset * ctx.entrySize(); - long relOffset = dataOffset - blockBase; + if (buffer.hasMore() && buffer.currentValue() <= boundary) { + long blockBase = offset * ctx.entrySize(); + long relOffset = dataOffset - blockBase; - int numEntries = - min((int) (header.numEntries() - relOffset), ctx.BLOCK_SIZE_WORDS()) / ctx.entrySize(); + long remainingTotal = dataBlockEnd - dataOffset; + long remainingBlock = ctx.BLOCK_SIZE_WORDS() - relOffset; - if (buffer.currentValue() <= boundary) { - file.reject(buffer, boundary, dataOffset, numEntries, ctx.equalityMask(), ctx.entrySize()); + long searchEnd = dataOffset + (int) min(remainingTotal, remainingBlock); + + data.range(dataOffset, searchEnd).rejectN(buffer, ctx.entrySize(), boundary); } } else { diff --git a/marginalia_nu/src/main/java/nu/marginalia/util/btree/BTreeReaderIf.java b/marginalia_nu/src/main/java/nu/marginalia/util/btree/BTreeReaderIf.java new file mode 100644 index 00000000..c4b40386 --- /dev/null +++ b/marginalia_nu/src/main/java/nu/marginalia/util/btree/BTreeReaderIf.java @@ -0,0 +1,21 @@ +package nu.marginalia.util.btree; + +import nu.marginalia.util.array.buffer.LongQueryBuffer; +import nu.marginalia.util.btree.model.BTreeHeader; + +public interface BTreeReaderIf { + BTreeHeader getHeader(); + + int numEntries(); + + void retainEntries(LongQueryBuffer buffer); + + void rejectEntries(LongQueryBuffer buffer); + + long findEntry(long keyRaw); + + void readData(long[] data, int n, long pos); + + long[] queryData(long[] urls, int offset); + +} diff --git a/marginalia_nu/src/main/java/nu/marginalia/util/btree/BTreeWriter.java 
b/marginalia_nu/src/main/java/nu/marginalia/util/btree/BTreeWriter.java index bb68a3c1..a6ad6f91 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/util/btree/BTreeWriter.java +++ b/marginalia_nu/src/main/java/nu/marginalia/util/btree/BTreeWriter.java @@ -1,8 +1,8 @@ package nu.marginalia.util.btree; +import nu.marginalia.util.array.LongArray; import nu.marginalia.util.btree.model.BTreeContext; import nu.marginalia.util.btree.model.BTreeHeader; -import nu.marginalia.util.multimap.MultimapFileLongSlice; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -11,10 +11,10 @@ import java.io.IOException; public class BTreeWriter { private final BTreeContext ctx; - private final MultimapFileLongSlice map; + private final LongArray map; private final Logger logger = LoggerFactory.getLogger(getClass()); - public BTreeWriter(MultimapFileLongSlice map, BTreeContext ctx) { + public BTreeWriter(LongArray map, BTreeContext ctx) { this.map = map; this.ctx = ctx; } @@ -42,8 +42,10 @@ public class BTreeWriter { header.write(map, offset); + final long startRange = header.dataOffsetLongs(); + final long endRange = startRange + (long) numEntries * ctx.entrySize(); - var slice = map.atOffset(header.dataOffsetLongs()); + var slice = map.range(startRange, endRange); BTreeDogEar dogEar = new BTreeDogEar(ctx, header, slice); @@ -53,13 +55,11 @@ public class BTreeWriter { logger.error("Dog ear was not overwritten: {}", header); } - if (header.layers() < 1) { // The data is too small to benefit from indexing - return ctx.calculateSize(numEntries); - } - else { + if (header.layers() >= 1) { // Omit layer if data fits within a single block writeIndex(header); - return ctx.calculateSize(numEntries); } + + return ctx.calculateSize(numEntries); } public static BTreeHeader makeHeader(BTreeContext ctx, long offset, int numEntries) { @@ -96,7 +96,8 @@ public class BTreeWriter { } - private void writeIndexLayer(BTreeHeader header, long[] layerOffsets, + private void writeIndexLayer(BTreeHeader header, + long[] layerOffsets, final long indexedDataStepSize, final int layer) { @@ -115,13 +116,20 @@ public class BTreeWriter { dataPtr += indexedDataStepSize) { long dataOffset = dataOffsetBase + (dataPtr + lastDataEntryOffset) * entrySize; - map.put(indexOffsetBase + indexWord++, map.get(dataOffset) & ctx.equalityMask()); + map.set(indexOffsetBase + indexWord++, map.get(dataOffset)); } - // Fill the remaining block with LONG_MAX - map.setRange(indexOffsetBase+indexWord, - (int) (ctx.BLOCK_SIZE_WORDS() - (indexWord % ctx.BLOCK_SIZE_WORDS())), - Long.MAX_VALUE); + // If the index block is not completely filled with data, + // top up the remaining index block with LONG_MAX + + final long trailerStart = indexOffsetBase + indexWord; + final long trailerEnd = trailerStart + + ctx.BLOCK_SIZE_WORDS() + - (int) (indexWord % ctx.BLOCK_SIZE_WORDS()); + + if (trailerStart < trailerEnd) { + map.fill(trailerStart, trailerEnd, Long.MAX_VALUE); + } } diff --git a/marginalia_nu/src/main/java/nu/marginalia/util/btree/WriteCallback.java b/marginalia_nu/src/main/java/nu/marginalia/util/btree/WriteCallback.java index a6225db1..6c51cdde 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/util/btree/WriteCallback.java +++ b/marginalia_nu/src/main/java/nu/marginalia/util/btree/WriteCallback.java @@ -1,9 +1,9 @@ package nu.marginalia.util.btree; -import nu.marginalia.util.multimap.MultimapFileLongSlice; +import nu.marginalia.util.array.LongArray; import java.io.IOException; public interface WriteCallback { - void write(MultimapFileLongSlice 
slice) throws IOException; + void write(LongArray slice) throws IOException; } diff --git a/marginalia_nu/src/main/java/nu/marginalia/util/btree/model/BTreeContext.java b/marginalia_nu/src/main/java/nu/marginalia/util/btree/model/BTreeContext.java index 3179db70..d335d320 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/util/btree/model/BTreeContext.java +++ b/marginalia_nu/src/main/java/nu/marginalia/util/btree/model/BTreeContext.java @@ -4,22 +4,28 @@ import nu.marginalia.util.btree.BTreeWriter; public record BTreeContext(int MAX_LAYERS, int entrySize, - long equalityMask, int BLOCK_SIZE_BITS, int BLOCK_SIZE_WORDS) { - public BTreeContext(int MAX_LAYERS, int entrySize, long equalityMask, int BLOCK_SIZE_BITS) { - this(MAX_LAYERS, entrySize, equalityMask, BLOCK_SIZE_BITS, 1 << BLOCK_SIZE_BITS); + // 8 pages is the breaking point where using a B-tree is actually advantageous + // over just binary searching in a sorted list. Above 8 pages, binary search will + // worst-case four page faults. A b-tree will incur three page faults up until + // ~100k-200k entries with typical configurations. + + private static final int MIN_PAGES_FOR_BTREE = 8; + + public BTreeContext(int MAX_LAYERS, int entrySize, int BLOCK_SIZE_BITS) { + this(MAX_LAYERS, entrySize, BLOCK_SIZE_BITS, 1 << BLOCK_SIZE_BITS); } public long calculateSize(int numEntries) { var header = BTreeWriter.makeHeader(this, 0, numEntries); - return header.dataOffsetLongs() + (long)numEntries * entrySize; + return header.dataOffsetLongs() + (long) numEntries * entrySize + 4; } public int numIndexLayers(int numEntries) { - if (numEntries <= BLOCK_SIZE_WORDS*2/entrySize) { + if (numEntries <= BLOCK_SIZE_WORDS*MIN_PAGES_FOR_BTREE/entrySize) { return 0; } for (int i = 1; i < MAX_LAYERS; i++) { @@ -37,12 +43,8 @@ public record BTreeContext(int MAX_LAYERS, public long indexLayerSize(int numWords, int level) { final long layerSize = 1L<<(BLOCK_SIZE_BITS*(level+1)); - final long numBlocks = numWords / layerSize; - if (numWords % layerSize != 0) { - return BLOCK_SIZE_WORDS * (numBlocks + 1); - } - return BLOCK_SIZE_WORDS * numBlocks; + return BLOCK_SIZE_WORDS * (numWords / layerSize + Long.signum(numWords % layerSize)); } } diff --git a/marginalia_nu/src/main/java/nu/marginalia/util/btree/model/BTreeHeader.java b/marginalia_nu/src/main/java/nu/marginalia/util/btree/model/BTreeHeader.java index 8cdcd355..a0dc3be3 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/util/btree/model/BTreeHeader.java +++ b/marginalia_nu/src/main/java/nu/marginalia/util/btree/model/BTreeHeader.java @@ -1,6 +1,6 @@ package nu.marginalia.util.btree.model; -import nu.marginalia.util.multimap.MultimapFileLongSlice; +import nu.marginalia.util.array.LongArray; public record BTreeHeader(int layers, int numEntries, long indexOffsetLongs, long dataOffsetLongs) { public BTreeHeader { @@ -28,10 +28,10 @@ public record BTreeHeader(int layers, int numEntries, long indexOffsetLongs, lon return padding; } - public void write(MultimapFileLongSlice dest, long offset) { - dest.put(offset, ((long) layers << 32L) | ((long)numEntries & 0xFFFF_FFFFL)); - dest.put(offset+1, indexOffsetLongs); - dest.put(offset+2, dataOffsetLongs); + public void write(LongArray dest, long offset) { + dest.set(offset, ((long) layers << 32L) | ((long)numEntries & 0xFFFF_FFFFL)); + dest.set(offset+1, indexOffsetLongs); + dest.set(offset+2, dataOffsetLongs); } diff --git a/marginalia_nu/src/main/java/nu/marginalia/util/gregex/GuardedRegex.java 
b/marginalia_nu/src/main/java/nu/marginalia/util/gregex/GuardedRegex.java new file mode 100644 index 00000000..7ba29096 --- /dev/null +++ b/marginalia_nu/src/main/java/nu/marginalia/util/gregex/GuardedRegex.java @@ -0,0 +1,6 @@ +package nu.marginalia.util.gregex; + +import java.util.function.Predicate; + +public interface GuardedRegex extends Predicate { +} diff --git a/marginalia_nu/src/main/java/nu/marginalia/util/gregex/GuardedRegexFactory.java b/marginalia_nu/src/main/java/nu/marginalia/util/gregex/GuardedRegexFactory.java new file mode 100644 index 00000000..16dd6e59 --- /dev/null +++ b/marginalia_nu/src/main/java/nu/marginalia/util/gregex/GuardedRegexFactory.java @@ -0,0 +1,62 @@ +package nu.marginalia.util.gregex; + +import org.intellij.lang.annotations.Language; + +import java.util.regex.Pattern; + + +public class GuardedRegexFactory { + + // Regular expressions are slow, even compiled ones. Guarding them with startsWith, or even contains + // is something like an order of magnitude faster. This matters a lot in hot code. + + public static GuardedRegex startsWith(String prefix, @Language("RegExp") String regex) { + return new GuardedRegexStartsWith(prefix, regex); + } + public static GuardedRegex endsWith(String suffix, @Language("RegExp") String regex) { + return new GuardedRegexEndsWith(suffix, regex); + } + public static GuardedRegex contains(String substring, @Language("RegExp") String regex) { + return new GuardedRegexContains(substring, regex); + } + public static GuardedRegex minLength(int minLength, @Language("RegExp") String regex) { + return new GuardedRegexMinLength(minLength, regex); + } + + private record GuardedRegexContains(String contains, Pattern pattern) implements GuardedRegex { + public GuardedRegexContains(String contains, String pattern) { + this(contains, Pattern.compile(pattern)); + } + + public boolean test(String s) { + return s.contains(contains) && pattern.matcher(s).find(); + } + } + private record GuardedRegexMinLength(int minLength, Pattern pattern) implements GuardedRegex { + public GuardedRegexMinLength(int minLength, String pattern) { + this(minLength, Pattern.compile(pattern)); + } + + public boolean test(String s) { + return s.length() >= minLength && pattern.matcher(s).find(); + } + } + private record GuardedRegexStartsWith(String start, Pattern pattern) implements GuardedRegex { + public GuardedRegexStartsWith(String start, String pattern) { + this(start, Pattern.compile(pattern)); + } + + public boolean test(String s) { + return s.startsWith(start) && pattern.matcher(s).find(); + } + } + private record GuardedRegexEndsWith(String end, Pattern pattern) implements GuardedRegex { + public GuardedRegexEndsWith(String end, String pattern) { + this(end, Pattern.compile(pattern)); + } + + public boolean test(String s) { + return s.endsWith(end) && pattern.matcher(s).find(); + } + } +} diff --git a/marginalia_nu/src/main/java/nu/marginalia/util/hash/LongPairHashMap.java b/marginalia_nu/src/main/java/nu/marginalia/util/hash/LongPairHashMap.java deleted file mode 100644 index d1e056b9..00000000 --- a/marginalia_nu/src/main/java/nu/marginalia/util/hash/LongPairHashMap.java +++ /dev/null @@ -1,188 +0,0 @@ -package nu.marginalia.util.hash; - -import lombok.EqualsAndHashCode; -import lombok.Getter; -import nu.marginalia.util.multimap.MultimapFileLong; -import nu.marginalia.util.PrimeUtil; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -import static java.lang.Math.round; - -/** - * Spiritually influenced by GNU Trove's hash maps - * LGPL 2.1 - 
*/ -public class LongPairHashMap { - private static final Logger logger = LoggerFactory.getLogger(LongPairHashMap.class); - private static final long MAGIC_WORD = 0xE00E00E00E0E0E0EL; // it's the data police - - private final long hashTableSize; - private final MultimapFileLong data; - private final long maxProbeLength; - private int sz = 0; - private static final int HEADER_SIZE = 2; - - private LongPairHashMap(MultimapFileLong data, long hashTableSize, long maxProbeLength) { - this.data = data; - this.hashTableSize = hashTableSize; - this.maxProbeLength = maxProbeLength; - } - - public static LongPairHashMap createNew(MultimapFileLong data, long size) { - var tableSize = PrimeUtil.nextPrime(size, 1); - var ret = new LongPairHashMap(data, tableSize, tableSize/2); - - data.put(0, MAGIC_WORD); - data.put(1, tableSize); - - for (int i = 2; i < tableSize; i++) { - data.put(HEADER_SIZE + 2L*i, 0); - } - - return ret; - } - - public static LongPairHashMap loadExisting(MultimapFileLong data) { - long key = data.get(0); - - if (key != MAGIC_WORD) { - logger.warn("LongPairHashMap lacks magic word, could this be garbage data?"); - } - - var hashTableSize = data.get(1); - var maxProbeLength = hashTableSize / 10; - - return new LongPairHashMap(data, hashTableSize, maxProbeLength); - } - - public int size() { - return sz; - } - - private CellData getCell(long idx) { - long bufferIdx = 2*idx + HEADER_SIZE; - long a = data.get(bufferIdx); - long b = data.get(bufferIdx+1); - return new CellData(a, b); - } - private void setCell(long idx, CellData cell) { - long bufferIdx = 2*idx + HEADER_SIZE; - data.put(bufferIdx, cell.first); - data.put(bufferIdx+1, cell.second); - } - - public CellData put(CellData data) { - - long hash = longHash(data.getKey()) & 0x7FFF_FFFFL; - - long idx = hash% hashTableSize; - if (!getCell(hash% hashTableSize).isSet()) { - return setValue(data, hash% hashTableSize); - } - - return putRehash(data, idx, hash); - - } - - private CellData putRehash(CellData data, long idx, long hash) { - final long pStride = 1 + (hash % (hashTableSize - 2)); - - for (long j = 1; j < maxProbeLength; j++) { - idx = idx - pStride; - - if (idx < 0) { - idx += hashTableSize; - } - - final var val = getCell(idx); - - if (!val.isSet()) { - return setValue(data, idx); - } - else if (val.getKey() == data.getKey()) { - logger.error("Double write?"); - return val; - } - } - - throw new IllegalStateException("DictionaryHashMap full @ size " + size() + "/" + hashTableSize + ", " + round((100.0*size()) / hashTableSize) + "%, key = " + data.getKey() + ",#"+hash); - } - - private CellData setValue(CellData data, long cell) { - sz++; - - setCell(cell, data); - return data; - } - - public CellData get(int key) { - if (hashTableSize == 0) { - return new CellData(0, 0); - } - final long hash = longHash(key) & 0x7FFF_FFFFL; - - var val = getCell(hash % hashTableSize); - if (!val.isSet()) { - return val; - } - else if (val.getKey() == key) { - return val; - } - - return getRehash(key, hash % hashTableSize, hash); - } - - private CellData getRehash(int key, long idx, long hash) { - final long pStride = 1 + (hash % (hashTableSize - 2)); - - for (long j = 1; j < maxProbeLength; j++) { - idx = idx - pStride; - - if (idx < 0) { - idx += hashTableSize; - } - - final var val = getCell(idx); - - if (!val.isSet()) { - return val; - } - else if (val.getKey() == key) { - return val; - } - } - - throw new IllegalStateException("DictionaryHashMap full @ size " + size() + "/" + hashTableSize + ", " + round((100.0*size()) / 
hashTableSize) + "%"); - } - - private long longHash(long x) { - return x; - } - - @Getter @EqualsAndHashCode - public static class CellData { - final long first; - final long second; - - public CellData(long key, long offset) { - first = key | 0x8000_0000_000_000L; - second = offset; - } - - public long getKey() { - return first & ~0x8000_0000_000_000L; - } - public long getOffset() { - return second; - } - - public boolean isSet() { - return first != 0 || second != 0L; - } - } - - public void close() throws Exception { - data.close(); - } -} diff --git a/marginalia_nu/src/main/java/nu/marginalia/util/language/processing/DocumentKeywordExtractor.java b/marginalia_nu/src/main/java/nu/marginalia/util/language/processing/DocumentKeywordExtractor.java index 978d8b63..fddd7e28 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/util/language/processing/DocumentKeywordExtractor.java +++ b/marginalia_nu/src/main/java/nu/marginalia/util/language/processing/DocumentKeywordExtractor.java @@ -6,8 +6,6 @@ import nu.marginalia.util.language.processing.model.KeywordMetadata; import nu.marginalia.util.language.processing.model.WordRep; import nu.marginalia.wmsa.edge.assistant.dict.TermFrequencyDict; import nu.marginalia.wmsa.edge.index.model.EdgePageWordFlags; -import nu.marginalia.wmsa.edge.index.model.IndexBlock; -import nu.marginalia.wmsa.edge.model.crawl.EdgePageWordSet; import nu.marginalia.wmsa.edge.model.crawl.EdgePageWords; import javax.inject.Inject; @@ -33,7 +31,7 @@ public class DocumentKeywordExtractor { } - public EdgePageWordSet extractKeywordsMinimal(DocumentLanguageData documentLanguageData, KeywordMetadata keywordMetadata) { + public EdgePageWords extractKeywordsMinimal(DocumentLanguageData documentLanguageData, KeywordMetadata keywordMetadata) { List titleWords = extractTitleWords(documentLanguageData); List wordsNamesAll = nameCounter.count(documentLanguageData, 2); @@ -47,15 +45,15 @@ public class DocumentKeywordExtractor { List artifacts = getArtifacts(documentLanguageData); - keywordMetadata.flagsTemplate().add(EdgePageWordFlags.Simple); + WordsBuilder wordsBuilder = new WordsBuilder(); - return new EdgePageWordSet( - createWords(keywordMetadata, IndexBlock.Title, titleWords), - EdgePageWords.withBlankMetadata(IndexBlock.Artifacts, artifacts) - ); + createWords(wordsBuilder, keywordMetadata, titleWords, 0); + artifacts.forEach(wordsBuilder::addWithBlankMetadata); + + return wordsBuilder.build(); } - public EdgePageWordSet extractKeywords(DocumentLanguageData documentLanguageData, KeywordMetadata keywordMetadata) { + public EdgePageWords extractKeywords(DocumentLanguageData documentLanguageData, KeywordMetadata keywordMetadata) { List titleWords = extractTitleWords(documentLanguageData); @@ -72,26 +70,25 @@ public class DocumentKeywordExtractor { List artifacts = getArtifacts(documentLanguageData); - var wordSet = new EdgePageWordSet( - createWords(keywordMetadata, IndexBlock.Title, titleWords), - createWords(keywordMetadata, IndexBlock.Tfidf_High, wordsTfIdf), - createWords(keywordMetadata, IndexBlock.Subjects, subjects), - EdgePageWords.withBlankMetadata(IndexBlock.Artifacts, artifacts) - ); + WordsBuilder wordsBuilder = new WordsBuilder(); - getSimpleWords(keywordMetadata, wordSet, documentLanguageData, - IndexBlock.Words_1, IndexBlock.Words_2, IndexBlock.Words_4, IndexBlock.Words_8, IndexBlock.Words_16Plus); + createWords(wordsBuilder, keywordMetadata, titleWords, 0); + createWords(wordsBuilder, keywordMetadata, wordsTfIdf, EdgePageWordFlags.TfIdfHigh.asBit()); + 
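// the last argument is bitwise OR-ed into each keyword's metadata by
// createWords(...) below; only the TF-IDF keywords carry the TfIdfHigh flag bit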
createWords(wordsBuilder, keywordMetadata, subjects, 0); - return wordSet; + getSimpleWords(wordsBuilder, keywordMetadata, documentLanguageData); + + artifacts.forEach(wordsBuilder::addWithBlankMetadata); + + return wordsBuilder.build(); } public void getWordPositions(KeywordMetadata keywordMetadata, DocumentLanguageData dld) { Map ret = keywordMetadata.positionMask(); - int posCtr = 0; for (var sent : dld.titleSentences) { - int posBit = (int)((1L << (posCtr/4)) & 0xFFFF_FFFFL); + int posBit = 1; for (var word : sent) { ret.merge(word.stemmed(), posBit, this::bitwiseOr); @@ -101,9 +98,11 @@ public class DocumentKeywordExtractor { ret.merge(sent.constructStemmedWordFromSpan(span), posBit, this::bitwiseOr); } } - posCtr+=4; + + int pos = 1; + int line = 0; for (var sent : dld.sentences) { - int posBit = (int)((1L << (posCtr/4)) & 0xFFFF_FFFFL); + int posBit = (int)((1L << pos) & 0xFFFF_FFFFL); for (var word : sent) { ret.merge(word.stemmed(), posBit, this::bitwiseOr); @@ -113,7 +112,28 @@ public class DocumentKeywordExtractor { ret.merge(sent.constructStemmedWordFromSpan(span), posBit, this::bitwiseOr); } - posCtr++; + if (pos < 4) pos ++; + else if (pos < 8) { + if (++line >= 2) { + pos++; + line = 0; + } + } + else if (pos < 24) { + if (++line >= 4) { + pos++; + line = 0; + } + } + else if (pos < 64) { + if (++line > 8) { + pos++; + line = 0; + } + } + else { + break; + } } } @@ -122,43 +142,32 @@ public class DocumentKeywordExtractor { } - private void getSimpleWords(KeywordMetadata metadata, EdgePageWordSet wordSet, DocumentLanguageData documentLanguageData, IndexBlock... blocks) { + private void getSimpleWords(WordsBuilder wordsBuilder, KeywordMetadata metadata, DocumentLanguageData documentLanguageData) { EnumSet flagsTemplate = EnumSet.noneOf(EdgePageWordFlags.class); - int start = 0; - int lengthGoal = 32; + for (var sent : documentLanguageData.sentences) { - for (int blockIdx = 0; blockIdx < blocks.length && start < documentLanguageData.sentences.length; blockIdx++) { - IndexBlock block = blocks[blockIdx]; - Set words = new HashSet<>(lengthGoal+100); + if (wordsBuilder.size() > 1500) + break; - int pos; - int length = 0; - for (pos = start; pos < documentLanguageData.sentences.length && length < lengthGoal; pos++) { - var sent = documentLanguageData.sentences[pos]; - length += sent.length(); - - for (var word : sent) { - if (!word.isStopWord()) { - String w = AsciiFlattener.flattenUnicode(word.wordLowerCase()); - if (WordPatterns.singleWordQualitiesPredicate.test(w)) { - words.add(new EdgePageWords.Entry(w, metadata.forWord(flagsTemplate, word.stemmed()))); - } + for (var word : sent) { + if (!word.isStopWord()) { + String w = AsciiFlattener.flattenUnicode(word.wordLowerCase()); + if (WordPatterns.singleWordQualitiesPredicate.test(w)) { + wordsBuilder.add(w, metadata.forWord(flagsTemplate, word.stemmed())); } } - - for (var names : keywordExtractor.getNames(sent)) { - var rep = new WordRep(sent, names); - String w = AsciiFlattener.flattenUnicode(rep.word); - - words.add(new EdgePageWords.Entry(w, metadata.forWord(flagsTemplate, rep.stemmed))); - } } - wordSet.append(block, words); - start = pos; - lengthGoal+=32; + + for (var names : keywordExtractor.getNames(sent)) { + var rep = new WordRep(sent, names); + String w = AsciiFlattener.flattenUnicode(rep.word); + + wordsBuilder.add(w, metadata.forWord(flagsTemplate, rep.stemmed)); + } } + } private static final Pattern mailLikePattern = Pattern.compile("[a-zA-Z0-9._\\-]+@[a-zA-Z0-9]+(\\.[a-zA-Z0-9]+)+"); @@ -197,11 +206,11 @@ public 
class DocumentKeywordExtractor { .collect(Collectors.toList()); } - public EdgePageWords createWords(KeywordMetadata metadata, - IndexBlock block, - Collection<WordRep> words) { + public void createWords(WordsBuilder wordsBuilder, + KeywordMetadata metadata, + Collection<WordRep> words, + long additionalMeta) { - Set<EdgePageWords.Entry> entries = new HashSet<>(words.size()); for (var word : words) { String flatWord = AsciiFlattener.flattenUnicode(word.word); @@ -209,9 +218,31 @@ public class DocumentKeywordExtractor { continue; } - entries.add(new EdgePageWords.Entry(flatWord, metadata.forWord(metadata.flagsTemplate(), word.stemmed))); + wordsBuilder.add(flatWord, metadata.forWord(metadata.wordFlagsTemplate(), word.stemmed) | additionalMeta); + } + } + + private static class WordsBuilder { + private final EdgePageWords words = new EdgePageWords(1600); + private final Set<String> seen = new HashSet<>(1600); + + public void add(String word, long meta) { + if (seen.add(word)) { + words.add(word, meta); + } + } + public void addWithBlankMetadata(String word) { + if (seen.add(word)) { + words.addJustNoMeta(word); + } } - return new EdgePageWords(block, entries); + public EdgePageWords build() { + return words; + } + + public int size() { + return seen.size(); + } } } diff --git a/marginalia_nu/src/main/java/nu/marginalia/util/language/processing/KeywordCounter.java b/marginalia_nu/src/main/java/nu/marginalia/util/language/processing/KeywordCounter.java index 5bee1a5d..fa24cbcd 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/util/language/processing/KeywordCounter.java +++ b/marginalia_nu/src/main/java/nu/marginalia/util/language/processing/KeywordCounter.java @@ -1,7 +1,7 @@ package nu.marginalia.util.language.processing; import com.github.jknack.handlebars.internal.lang3.StringUtils; -import gnu.trove.map.hash.TObjectIntHashMap; +import it.unimi.dsi.fastutil.objects.Object2IntOpenHashMap; import nu.marginalia.util.language.WordPatterns; import nu.marginalia.util.language.processing.model.DocumentLanguageData; import nu.marginalia.util.language.processing.model.KeywordMetadata; @@ -27,7 +27,7 @@ public class KeywordCounter { } public List<WordRep> countHisto(KeywordMetadata keywordMetadata, DocumentLanguageData dld) { - TObjectIntHashMap<String> counts = new TObjectIntHashMap<>(10_000, 0.7f); + Object2IntOpenHashMap<String> counts = new Object2IntOpenHashMap<>(10_000, 0.7f); HashMap<String, HashSet<WordRep>> instances = new HashMap<>(15000); @@ -41,7 +41,8 @@ public class KeywordCounter { var rep = new WordRep(sent, span); - counts.adjustOrPutValue(rep.stemmed, 1, 1); + counts.mergeInt(rep.stemmed, 1, Integer::sum); + var instanceSet = instances.computeIfAbsent(rep.stemmed, k -> new HashSet<>(500)); if (instanceSet.size() < 250) { instanceSet.add(rep); @@ -54,7 +55,8 @@ public class KeywordCounter { int maxVal = maxValue(counts); - counts.forEachEntry((key, cnt) -> { + + counts.forEach((key, cnt) -> { int value = getTermValue(key, cnt, maxVal); tfIdf.put(key, new WordFrequencyData(cnt, value)); @@ -62,18 +64,18 @@ public class KeywordCounter { if (cnt > 1 && value > 100) { tfIdfHigh.addAll(instances.get(key)); } - - return true; }); return tfIdfHigh; } - private int maxValue(TObjectIntHashMap<String> map) { + private int maxValue(Object2IntOpenHashMap<String> map) { int maxC = 0; + for (int c : map.values()) { maxC = max(c, maxC); } + return maxC; } diff --git a/marginalia_nu/src/main/java/nu/marginalia/util/language/processing/KeywordExtractor.java b/marginalia_nu/src/main/java/nu/marginalia/util/language/processing/KeywordExtractor.java index 08c586e2..7e56830e 100644 ---
a/marginalia_nu/src/main/java/nu/marginalia/util/language/processing/KeywordExtractor.java +++ b/marginalia_nu/src/main/java/nu/marginalia/util/language/processing/KeywordExtractor.java @@ -32,7 +32,9 @@ public class KeywordExtractor { if (sentence.separators[i-2] == WordSeparator.COMMA) { continue; } if (sentence.separators[i-1] == WordSeparator.COMMA) { i++; continue; } - if (isProperNoun(i, sentence) && (isJoiner(sentence, i-1) || isProperNoun(i-1, sentence)) && isProperNoun(i-2, sentence)) + if (isProperNoun(i, sentence) + && (isJoiner(sentence, i-1) || isProperNoun(i-1, sentence)) + && isProperNoun(i-2, sentence)) spans.add(new WordSpan(i-2, i+1)); } @@ -42,59 +44,91 @@ public class KeywordExtractor { if (sentence.separators[i-1] == WordSeparator.COMMA) { i+=2; continue; } if (isProperNoun(i, sentence) && isProperNoun(i-3, sentence)) { - if (isProperNoun(i - 1, sentence) && isProperNoun(i - 2, sentence)) { + if (isProperNoun(i - 1, sentence) && isProperNoun(i - 2, sentence)) spans.add(new WordSpan(i-3, i+1)); - } - else if (isJoiner(sentence, i-2) && sentence.posTags[i-1].equals("DT")) { + else if (isJoiner(sentence, i-2) && sentence.posTags[i-1].equals("DT")) spans.add(new WordSpan(i-3, i+1)); - } - else if ((isJoiner(sentence, i-1)||isProperNoun(i - 1, sentence)) && (isJoiner(sentence, i-2)||isProperNoun(i - 2, sentence))) { + else if ((isJoiner(sentence, i-1) ||isProperNoun(i-1, sentence)) + && (isJoiner(sentence, i-2)||isProperNoun(i-2, sentence))) spans.add(new WordSpan(i-3, i+1)); - } } } return spans.toArray(WordSpan[]::new); } - public WordSpan[] getNamesStrict(DocumentSentence sentence) { + public WordSpan[] getKeywordsFromSentence(DocumentSentence sentence) { + if (sentence.keywords != null) { + return sentence.keywords.get(); + } List spans = new ArrayList<>(sentence.length()); + Set topWords = Collections.emptySet(); for (int i = 0; i < sentence.length(); i++) { - if (isProperNoun(i, sentence)) + if (isName(i, sentence, topWords) || isTopAdj(i, sentence, topWords)) spans.add(new WordSpan(i, i+1)); } for (int i = 1; i < sentence.length(); i++) { if (sentence.separators[i-1] == WordSeparator.COMMA) { continue; } - if (isProperNoun(i, sentence) && isProperNoun(i-1, sentence)) - spans.add(new WordSpan(i-1, i+1)); + + if (isName(i, sentence, topWords)) { + if (isName(i - 1, sentence, topWords) || isTopAdj(i-1, sentence, topWords)) + spans.add(new WordSpan(i - 1, i + 1)); + } + if (sentence.posTags[i].equals("CD") && isName(i-1, sentence, topWords)) { + spans.add(new WordSpan(i - 1, i + 1)); + } } for (int i = 2; i < sentence.length(); i++) { - if (sentence.separators[i-2] == WordSeparator.COMMA) { continue; } if (sentence.separators[i-1] == WordSeparator.COMMA) { i++; continue; } - if (isProperNoun(i, sentence) && (isJoiner(sentence, i-1) || isProperNoun(i-1, sentence)) && isProperNoun(i-2, sentence)) - spans.add(new WordSpan(i-2, i+1)); + if (sentence.separators[i-2] == WordSeparator.COMMA) { continue; } + + if (isName(i, sentence, topWords)) { + if ((isName(i-1, sentence, topWords) || isTopAdj(i-1, sentence, topWords)) + && (isName(i-2, sentence, topWords) || isTopAdj(i-2, sentence, topWords))) { + spans.add(new WordSpan(i - 2, i + 1)); + } + else if ((isProperNoun(i-1, sentence) || isJoiner(sentence, i-1)) && isProperNoun(i-2, sentence)) { + spans.add(new WordSpan(i - 2, i + 1)); + } + } + else if (sentence.posTags[i].equals("CD") && isName(i-1, sentence, topWords) && isName(i-2, sentence, topWords)) { + spans.add(new WordSpan(i - 2, i + 1)); + } } for (int i = 3; i < 
sentence.length(); i++) { - if (sentence.separators[i-3] == WordSeparator.COMMA) { continue; } - if (sentence.separators[i-2] == WordSeparator.COMMA) { i++; continue; } if (sentence.separators[i-1] == WordSeparator.COMMA) { i+=2; continue; } + if (sentence.separators[i-2] == WordSeparator.COMMA) { i++; continue; } + if (sentence.separators[i-3] == WordSeparator.COMMA) { continue; } - if (isProperNoun(i, sentence) && isProperNoun(i-3, sentence)) { + if (isName(i, sentence, topWords) && + (isName(i-1, sentence, topWords) || isTopAdj(i-1, sentence, topWords)) && + (isName(i-2, sentence, topWords) || isTopAdj(i-2, sentence, topWords)) && + (isName(i-3, sentence, topWords) || isTopAdj(i-3, sentence, topWords))) { + spans.add(new WordSpan(i - 3, i + 1)); + } + else if (isProperNoun(i, sentence) && isProperNoun(i-3, sentence)) { if (isProperNoun(i - 1, sentence) && isProperNoun(i - 2, sentence)) { spans.add(new WordSpan(i-3, i+1)); } else if (isJoiner(sentence, i-1) && sentence.posTags[i-2].equals("DT")) { spans.add(new WordSpan(i-3, i+1)); } + else if ((isProperNoun(i-1, sentence) || isJoiner(sentence, i-1)) && (isProperNoun(i-2, sentence)|| isJoiner(sentence, i-2))) { + spans.add(new WordSpan(i-3, i + 1)); + } } + } - return spans.toArray(WordSpan[]::new); + var ret = spans.toArray(WordSpan[]::new); + sentence.keywords = new SoftReference<>(ret); + + return ret; } public boolean isProperNoun(int i, DocumentSentence sent) { @@ -149,139 +183,6 @@ public class KeywordExtractor { return true; } - public WordSpan[] getKeywordsFromSentence(DocumentSentence sentence) { - if (sentence.keywords != null) { - return sentence.keywords.get(); - } - List spans = new ArrayList<>(sentence.length()); - - Set topWords = Collections.emptySet(); - - for (int i = 0; i < sentence.length(); i++) { - if (isName(i, sentence, topWords) || isTopAdj(i, sentence, topWords)) - spans.add(new WordSpan(i, i+1)); - } - - for (int i = 1; i < sentence.length(); i++) { - if (sentence.separators[i-1] == WordSeparator.COMMA) { continue; } - - if (isName(i, sentence, topWords)) { - if (isName(i - 1, sentence, topWords) || isTopAdj(i-1, sentence, topWords)) - spans.add(new WordSpan(i - 1, i + 1)); - } - if (sentence.posTags[i].equals("CD") && isName(i-1, sentence, topWords)) { - spans.add(new WordSpan(i - 1, i + 1)); - } - } - - for (int i = 2; i < sentence.length(); i++) { - if (sentence.separators[i-1] == WordSeparator.COMMA) { i++; continue; } - if (sentence.separators[i-2] == WordSeparator.COMMA) { continue; } - - if (isName(i, sentence, topWords)) { - if ((isName(i-1, sentence, topWords) || isTopAdj(i-1, sentence, topWords)) - && (isName(i-2, sentence, topWords) || isTopAdj(i-2, sentence, topWords))) { - spans.add(new WordSpan(i - 2, i + 1)); - } - else if ((isProperNoun(i-1, sentence) || isJoiner(sentence, i-1)) && isProperNoun(i-2, sentence)) { - spans.add(new WordSpan(i - 2, i + 1)); - } - } - else if (sentence.posTags[i].equals("CD") && isName(i-1, sentence, topWords) && isName(i-2, sentence, topWords)) { - spans.add(new WordSpan(i - 2, i + 1)); - } - } - - for (int i = 3; i < sentence.length(); i++) { - if (sentence.separators[i-1] == WordSeparator.COMMA) { i+=2; continue; } - if (sentence.separators[i-2] == WordSeparator.COMMA) { i++; continue; } - if (sentence.separators[i-3] == WordSeparator.COMMA) { continue; } - - if (isName(i, sentence, topWords) && - (isName(i-1, sentence, topWords) || isTopAdj(i-1, sentence, topWords)) && - (isName(i-2, sentence, topWords) || isTopAdj(i-2, sentence, topWords)) && - (isName(i-3, 
sentence, topWords) || isTopAdj(i-3, sentence, topWords))) { - spans.add(new WordSpan(i - 3, i + 1)); - } - else if (isProperNoun(i, sentence) && isProperNoun(i-3, sentence)) { - if (isProperNoun(i - 1, sentence) && isProperNoun(i - 2, sentence)) { - spans.add(new WordSpan(i-3, i+1)); - } - else if (isJoiner(sentence, i-1) && sentence.posTags[i-2].equals("DT")) { - spans.add(new WordSpan(i-3, i+1)); - } - else if ((isProperNoun(i-1, sentence) || isJoiner(sentence, i-1)) && (isProperNoun(i-2, sentence)|| isJoiner(sentence, i-2))) { - spans.add(new WordSpan(i-3, i + 1)); - } - } - - } - - var ret = spans.toArray(WordSpan[]::new); - sentence.keywords = new SoftReference<>(ret); - - return ret; - } - - public WordSpan[] getKeywordsFromSentenceStrict(DocumentSentence sentence, Set topWords, boolean reducePartials) { - List spans = new ArrayList<>(sentence.length()); - - if (!reducePartials) { - for (int i = 0; i < sentence.length(); i++) { - if (topWords.contains(sentence.stemmedWords[i])) - spans.add(new WordSpan(i, i + 1)); - } - } - - for (int i = 1; i < sentence.length(); i++) { - if (sentence.separators[i-1] == WordSeparator.COMMA) { continue; } - - if (topWords.contains(sentence.stemmedWords[i]) - && !sentence.words[i].endsWith("'s") - && topWords.contains(sentence.stemmedWords[i-1])) { - spans.add(new WordSpan(i-1, i + 1)); - } - } - for (int i = 2; i < sentence.length(); i++) { - if (sentence.separators[i-2] == WordSeparator.COMMA) { continue; } - if (sentence.separators[i-1] == WordSeparator.COMMA) { i++; continue; } - - if (topWords.contains(sentence.stemmedWords[i]) - && !sentence.words[i].endsWith("'s") - && (topWords.contains(sentence.stemmedWords[i-1]) || isJoiner(sentence, i-1)) - && topWords.contains(sentence.stemmedWords[i-2]) - ) { - spans.add(new WordSpan(i-2, i + 1)); - } - } - - for (int i = 3; i < sentence.length(); i++) { - if (sentence.separators[i-3] == WordSeparator.COMMA) { continue; } - if (sentence.separators[i-2] == WordSeparator.COMMA) { i++; continue; } - if (sentence.separators[i-1] == WordSeparator.COMMA) { i+=2; continue; } - if (!sentence.words[i-2].endsWith("'s")) { continue; } - if (!sentence.words[i-3].endsWith("'s")) { continue; } - - if (topWords.contains(sentence.stemmedWords[i]) - && !sentence.words[i].endsWith("'s") && topWords.contains(sentence.stemmedWords[i-3])) { - if (topWords.contains(sentence.stemmedWords[i-1]) && topWords.contains(sentence.stemmedWords[i-2])) { - spans.add(new WordSpan(i-3, i + 1)); - } - else if (topWords.contains(sentence.stemmedWords[i-1]) && isJoiner(sentence, i-2)) { - spans.add(new WordSpan(i-3, i + 1)); - } - else if (isJoiner(sentence, i-2) && sentence.posTags[i-1].equals("DT")) { - spans.add(new WordSpan(i-3, i + 1)); - } - else if (isJoiner(sentence, i-2) && isJoiner(sentence, i-1)) { - spans.add(new WordSpan(i-3, i + 1)); - } - } - } - - return spans.toArray(WordSpan[]::new); - } - private boolean isName(int i, DocumentSentence sentence, Set topWords) { if (!topWords.isEmpty()) { String posTag = sentence.posTags[i]; @@ -293,7 +194,6 @@ public class KeywordExtractor { String posTag = sentence.posTags[i]; -// if (posTag.startsWith("N") || posTag.startsWith("V") || posTag.startsWith("R") || posTag.startsWith("J")) return (posTag.startsWith("N") || "VBN".equals(posTag)) && !sentence.isStopWord(i); } diff --git a/marginalia_nu/src/main/java/nu/marginalia/util/language/processing/SentenceExtractor.java b/marginalia_nu/src/main/java/nu/marginalia/util/language/processing/SentenceExtractor.java index ea071bf3..08886928 
100644 --- a/marginalia_nu/src/main/java/nu/marginalia/util/language/processing/SentenceExtractor.java +++ b/marginalia_nu/src/main/java/nu/marginalia/util/language/processing/SentenceExtractor.java @@ -98,7 +98,6 @@ public class SentenceExtractor { } } - TObjectIntHashMap counts = calculateWordCounts(textSentences); return new DocumentLanguageData(textSentences, extractSentencesFromString(title.toLowerCase()), counts); diff --git a/marginalia_nu/src/main/java/nu/marginalia/util/language/processing/model/KeywordMetadata.java b/marginalia_nu/src/main/java/nu/marginalia/util/language/processing/model/KeywordMetadata.java index a8b093f6..1018b5cf 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/util/language/processing/model/KeywordMetadata.java +++ b/marginalia_nu/src/main/java/nu/marginalia/util/language/processing/model/KeywordMetadata.java @@ -13,23 +13,22 @@ public record KeywordMetadata(HashSet titleKeywords, HashSet namesKeywords, HashMap wordsTfIdf, HashMap positionMask, - EnumSet flagsTemplate, - int quality + EnumSet wordFlagsTemplate ) { private static final KeywordCounter.WordFrequencyData empty = new KeywordCounter.WordFrequencyData(0, 0); + private static final int TF_IDF_HIGH_LIMIT = 64; - public KeywordMetadata(double quality, EnumSet flags) { + public KeywordMetadata(EnumSet flags) { this(new HashSet<>(50), new HashSet<>(10), new HashSet<>(50), new HashMap<>(15_000), new HashMap<>(10_000), - flags, - (int)(-quality)); + flags); } - public KeywordMetadata(double quality) { - this(quality, EnumSet.noneOf(EdgePageWordFlags.class)); + public KeywordMetadata() { + this(EnumSet.noneOf(EdgePageWordFlags.class)); } public long forWord(EnumSet flagsTemplate, String stemmed) { @@ -48,11 +47,7 @@ public record KeywordMetadata(HashSet titleKeywords, int positions = positionMask.getOrDefault(stemmed, 0); - return new EdgePageWordMetadata(tfidf.tfIdfNormalized(), positions, quality, tfidf.count(), flags).encode(); - } - - public int quality() { - return -quality; + return new EdgePageWordMetadata(tfidf.tfIdfNormalized(), positions, tfidf.count(), flags).encode(); } } diff --git a/marginalia_nu/src/main/java/nu/marginalia/util/multimap/MultimapFileLong.java b/marginalia_nu/src/main/java/nu/marginalia/util/multimap/MultimapFileLong.java deleted file mode 100644 index a43e1694..00000000 --- a/marginalia_nu/src/main/java/nu/marginalia/util/multimap/MultimapFileLong.java +++ /dev/null @@ -1,859 +0,0 @@ -package nu.marginalia.util.multimap; - -import com.upserve.uppend.blobs.NativeIO; -import lombok.SneakyThrows; -import nu.marginalia.util.btree.BTreeQueryBuffer; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -import java.io.File; -import java.io.IOException; -import java.io.RandomAccessFile; -import java.nio.ByteBuffer; -import java.nio.LongBuffer; -import java.nio.MappedByteBuffer; -import java.nio.channels.FileChannel; -import java.nio.file.Files; -import java.nio.file.Path; -import java.util.ArrayList; - -import static java.nio.channels.FileChannel.MapMode.READ_ONLY; -import static java.nio.channels.FileChannel.MapMode.READ_WRITE; -import static nu.marginalia.util.FileSizeUtil.readableSize; - - -public class MultimapFileLong implements AutoCloseable, MultimapFileLongSlice { - - private final ArrayList buffers = new ArrayList<>(); - private final ArrayList mappedByteBuffers = new ArrayList<>(); - private final FileChannel.MapMode mode; - private final int bufferSize; - private final Logger logger = LoggerFactory.getLogger(getClass()); - - private final FileChannel channel; 
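The rewritten getWordPositions above replaces the old linear posCtr with a pos/line counter pair: the first sentences each get their own position bit, and later sentences share bits in progressively larger groups (2, then 4, then 9 sentences per bit) until the counter runs out at 64. The effect is that early occurrences are positioned precisely while the long tail of a document collapses into coarse buckets, keeping the whole mask in one machine word. A standalone sketch of that bucketing, using the same counter arithmetic as the hunk (the helper name is made up for illustration; note that the final cast to int discards bits 32 and up, just as in the patch):

```java
// Hypothetical helper mirroring the pos/line counters in getWordPositions:
// returns the position bit a word in the given sentence would contribute.
static int positionBitForSentence(int sentenceIdx) {
    int pos = 1, line = 0;
    for (int s = 0; s < sentenceIdx; s++) {
        if (pos < 4) pos++;                                           // 1 sentence per bit
        else if (pos < 8)  { if (++line >= 2) { pos++; line = 0; } }  // 2 per bit
        else if (pos < 24) { if (++line >= 4) { pos++; line = 0; } }  // 4 per bit
        else if (pos < 64) { if (++line > 8)  { pos++; line = 0; } }  // 9 per bit
        else return 0;  // document tail: contributes no position bit
    }
    return (int) ((1L << pos) & 0xFFFF_FFFFL);  // bits >= 32 truncate to zero
}
```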
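KeywordCounter's switch from Trove to fastutil above is mostly mechanical; the two maps differ mainly in how increments and iteration are spelled. A minimal side-by-side sketch of the API mapping used in the hunk:

```java
import it.unimi.dsi.fastutil.objects.Object2IntOpenHashMap;

class CounterMigrationDemo {
    public static void main(String[] args) {
        Object2IntOpenHashMap<String> counts = new Object2IntOpenHashMap<>(10_000, 0.7f);

        // Trove:  counts.adjustOrPutValue(key, 1, 1);
        counts.mergeInt("keyword", 1, Integer::sum);  // put 1, or add 1 to the old value

        // Trove:  counts.forEachEntry((key, cnt) -> { ...; return true; });
        // fastutil reuses java.util.Map.forEach, so there is no boolean
        // "keep iterating" return value to thread through.
        counts.forEach((key, cnt) -> System.out.println(key + "\t" + cnt));
    }
}
```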
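The MultimapFileLong being deleted here (listing continues below) is at heart a long-addressed array stitched together from several MappedByteBuffers, since a single Java mapping tops out at Integer.MAX_VALUE bytes. A minimal read-only sketch of that core idea, with hypothetical names and none of the growth, write, or madvise machinery:

```java
import java.io.IOException;
import java.nio.LongBuffer;
import java.nio.channels.FileChannel;
import java.nio.file.Path;
import java.nio.file.StandardOpenOption;
import java.util.ArrayList;

// Sketch: expose a file of longs as one flat long-indexed array by
// mapping it in fixed-size chunks (one mapping is capped at 2^31-1 bytes).
class LongFileView {
    private final ArrayList<LongBuffer> buffers = new ArrayList<>();
    private final int bufferSizeWords;

    LongFileView(Path file, int bufferSizeWords) throws IOException {
        this.bufferSizeWords = bufferSizeWords;
        try (FileChannel ch = FileChannel.open(file, StandardOpenOption.READ)) {
            long sizeWords = ch.size() / 8;
            for (long pos = 0; pos < sizeWords; pos += bufferSizeWords) {
                long words = Math.min(bufferSizeWords, sizeWords - pos);
                buffers.add(ch.map(FileChannel.MapMode.READ_ONLY, pos * 8, words * 8)
                              .asLongBuffer());
            }
        }
    }

    long get(long idx) {  // same index arithmetic as the deleted class
        return buffers.get((int) (idx / bufferSizeWords))
                      .get((int) (idx % bufferSizeWords));
    }
}
```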
- - private final long mapSize; - private final long fileLength; - private long mappedSize; - final static long WORD_SIZE = 8; - - private NativeIO.Advice defaultAdvice = null; - - public static MultimapFileLong forReading(Path file) throws IOException { - long fileSize = Files.size(file); - int bufferSize = getBufferSize(fileSize, false); - - return new MultimapFileLong(file.toFile(), READ_ONLY, Files.size(file), bufferSize); - } - - public static MultimapFileLong forOutput(Path file, long estimatedSize) throws IOException { - return new MultimapFileLong(file.toFile(), READ_WRITE, 0, getBufferSize(estimatedSize, true)); - } - - private static int getBufferSize(long totalSize, boolean write) { - int defaultBig = 2<<23; - if (totalSize > Integer.MAX_VALUE/WORD_SIZE) { - return defaultBig; - } - else if (write && totalSize < 8*1024*1024) { - return 8*1024*1024; - } - else { - return (int) Math.min(totalSize, defaultBig); - } - } - - - public MultimapFileLong(File file, - FileChannel.MapMode mode, - long mapSize, - int bufferSize) throws IOException { - - this(new RandomAccessFile(file, translateToRAFMode(mode)), mode, mapSize, bufferSize); - } - - private static String translateToRAFMode(FileChannel.MapMode mode) { - if (READ_ONLY.equals(mode)) { - return "r"; - } else if (READ_WRITE.equals(mode)) { - return "rw"; - } - return "rw"; - } - - - public MultimapFileLong(RandomAccessFile file, - FileChannel.MapMode mode, - long mapSizeBytes, - int bufferSizeWords) throws IOException { - this.mode = mode; - this.bufferSize = bufferSizeWords; - this.mapSize = mapSizeBytes; - this.fileLength = file.length(); - - channel = file.getChannel(); - mappedSize = 0; - - logger.trace("Creating multimap file size = {} / buffer size = {}, mode = {}", - readableSize(mapSizeBytes), readableSize(8L*bufferSizeWords), mode); - } - - public MultimapSearcherBase createSearcher() { - return new MultimapSearcherBase(this); - } - public MultimapSorter createSorter(Path tmpFile, int internalSortLimit, int minStepSize) { - return new MultimapSorter(this, tmpFile, internalSortLimit, minStepSize); - } - - @SneakyThrows - public void advice(NativeIO.Advice advice) { - this.defaultAdvice = advice; - for (var buffer : mappedByteBuffers) { - NativeIO.madvise(buffer, advice); - } - } - - @SneakyThrows - public void advice0(NativeIO.Advice advice) { - NativeIO.madvise(mappedByteBuffers.get(0), advice); - } - - @SneakyThrows - public void adviceRange(NativeIO.Advice advice, long startLongs, long lengthLongs) { - long endLongs = (startLongs+lengthLongs); - - if (endLongs >= mappedSize) - grow(endLongs); - - - int startIdx = (int)(startLongs / bufferSize); - int endIdx = (int)(endLongs / bufferSize); - - if (startIdx != endIdx) { - long offsetStart = (startLongs % bufferSize) * WORD_SIZE; - NativeIO.madviseRange(mappedByteBuffers.get(startIdx), advice, offsetStart, (int) (bufferSize * WORD_SIZE - offsetStart)); - for (int i = startIdx+1; i < endIdx; i++) { - NativeIO.madviseRange(mappedByteBuffers.get(i), advice, 0, (int)(bufferSize * WORD_SIZE)); - } - NativeIO.madviseRange(mappedByteBuffers.get(endIdx), advice, 0, (int)((endIdx % bufferSize) * WORD_SIZE)); - } - else { - var buff = mappedByteBuffers.get(startIdx); - NativeIO.madviseRange(buff, advice, (startLongs % bufferSize) * WORD_SIZE, (int) (lengthLongs * WORD_SIZE)); - } - } - - public void pokeRange(long offset, long length) { - for (long i = 0; i < length; i += 4096/8) { - get(offset + i); - } - } - - public void force() { - logger.trace("Forcing"); - - for 
(MappedByteBuffer buffer: mappedByteBuffers) { - buffer.force(); - } - } - - @SneakyThrows - public void grow(long posIdxRequired) { - if (posIdxRequired*WORD_SIZE > mapSize && mode == READ_ONLY) { - throw new IndexOutOfBoundsException(posIdxRequired + " (max " + mapSize + ")"); - } - logger.trace("Growing to encompass {}i/{}b", posIdxRequired, posIdxRequired*WORD_SIZE); - long start; - if (buffers.isEmpty()) { - start = 0; - } - else { - start = (long) buffers.size() * bufferSize; - } - for (long posIdx = start; posIdxRequired >= posIdx; posIdx += bufferSize) { - long posBytes = posIdx * WORD_SIZE; - long bzBytes; - if (mode == READ_ONLY) { - bzBytes = Math.min(WORD_SIZE*bufferSize, mapSize - posBytes); - } - else { - bzBytes = WORD_SIZE*bufferSize; - } - logger.trace("Allocating {}-{}", posBytes, posBytes+bzBytes); - - var buffer = channel.map(mode, posBytes, bzBytes); - - if (defaultAdvice != null) { - NativeIO.madvise(buffer, defaultAdvice); - } - - buffers.add(buffer.asLongBuffer()); - mappedByteBuffers.add(buffer); - - mappedSize += bzBytes/WORD_SIZE; - } - } - - @Override - public long size() { - return fileLength; - } - - @Override - public void put(long idx, long val) { - if (idx >= mappedSize) - grow(idx); - - try { - buffers.get((int)(idx / bufferSize)).put((int) (idx % bufferSize), val); - } - catch (IndexOutOfBoundsException ex) { - logger.error("Index out of bounds {} -> {}:{} cap {}", idx, buffers.get((int)(idx / bufferSize)), idx % bufferSize, - buffers.get((int)(idx / bufferSize)).capacity()); - throw new RuntimeException(ex); - } - } - - @Override - public long get(long idx) { - if (idx < 0) - throw new IllegalArgumentException("get("+idx+")"); - - if (idx >= mappedSize) - grow(idx); - - try { - return buffers.get((int)(idx / bufferSize)).get((int)(idx % bufferSize)); - } - catch (IndexOutOfBoundsException ex) { - logger.error("Index out of bounds {} -> {}:{} cap {}", idx, buffers.get((int)(idx / bufferSize)), idx % bufferSize, - buffers.get((int)(idx / bufferSize)).capacity()); - throw new RuntimeException(ex); - } - } - - - @Override - public void read(long[] vals, long idx) { - read(vals, vals.length, idx); - } - - @Override - public void read(long[] vals, int n, long idx) { - if (idx+n >= mappedSize) { - grow(idx+n); - } - - int iN = (int)((idx + n) / bufferSize); - - for (int i = 0; i < n; ) { - int i0 = (int)((idx + i) / bufferSize); - int bufferOffset = (int) ((idx+i) % bufferSize); - var buffer = buffers.get(i0); - - final int l; - - if (i0 < iN) l = bufferSize - bufferOffset; - else l = Math.min(n - i, bufferSize - bufferOffset); - - buffer.get(bufferOffset, vals, i, l); - i+=l; - - } - - } - - @Override - public void read(LongBuffer vals, long idx) { - int n = vals.limit() - vals.position(); - if (idx+n >= mappedSize) { - grow(idx+n); - } - int iN = (int)((idx + n) / bufferSize); - - for (int i = 0; i < n; ) { - int i0 = (int)((idx + i) / bufferSize); - - int bufferOffset = (int) ((idx+i) % bufferSize); - var buffer = buffers.get(i0); - - final int l; - - if (i0 < iN) l = bufferSize - bufferOffset; - else l = Math.min(n - i, bufferSize - bufferOffset); - - vals.put(vals.position() + i, buffer, bufferOffset, l); - i+=l; - } - - } - - - @Override - public void write(long[] vals, long idx) { - write(vals, vals.length, idx); - } - - @Override - public void write(long[] vals, int n, long idx) { - if (idx+n >= mappedSize) { - grow(idx+n); - } - - int iN = (int)((idx + n) / bufferSize); - - for (int i = 0; i < n; ) { - int i0 = (int)((idx + i) / bufferSize); - int 
bufferOffset = (int) ((idx+i) % bufferSize); - var buffer = buffers.get(i0); - - final int l; - - if (i0 < iN) l = bufferSize - bufferOffset; - else l = Math.min(n - i, bufferSize - bufferOffset); - - buffer.put(bufferOffset, vals, i, l); - i+=l; - - } - - } - - @Override - public void write(LongBuffer vals, long idx) { - int n = vals.limit() - vals.position(); - if (idx+n >= mappedSize) { - grow(idx+n); - } - int iN = (int)((idx + n) / bufferSize); - - for (int i = 0; i < n; ) { - int i0 = (int)((idx + i) / bufferSize); - - int bufferOffset = (int) ((idx+i) % bufferSize); - var buffer = buffers.get(i0); - - final int l; - - if (i0 < iN) l = bufferSize - bufferOffset; - else l = Math.min(n - i, bufferSize - bufferOffset); - - buffer.put(bufferOffset, vals, vals.position() + i, l); - i+=l; - } - - } - - - @Override - public void write(LongBuffer vals, int n, long idx) { - if (idx+n >= mappedSize) { - grow(idx+n); - } - int iN = (int)((idx + n) / bufferSize); - - for (int i = 0; i < n; ) { - int i0 = (int)((idx + i) / bufferSize); - - int bufferOffset = (int) ((idx+i) % bufferSize); - var buffer = buffers.get(i0); - - final int l; - - if (i0 < iN) l = bufferSize - bufferOffset; - else l = Math.min(n - i, bufferSize - bufferOffset); - - buffer.put(bufferOffset, vals, vals.position() + i, l); - i+=l; - } - - } - - @Override - public void swapn(int n, long idx1, long idx2) { - for (int i = 0; i < n; i++) - swap(idx1+i, idx2+i); - } - - private void swap(long idx1, long idx2) { - LongBuffer buff1 = buffers.get((int)(idx1 / bufferSize)); - final int o1 = (int) (idx1 % bufferSize); - - LongBuffer buff2 = buffers.get((int)(idx2 / bufferSize)); - final int o2 = (int) (idx2 % bufferSize); - - long tmp = buff1.get(o1); - buff1.put(o1, buff2.get(o2)); - buff2.put(o2, tmp); - } - - @Override - public void setRange(long idx, int n, long val) { - if (n == 0) return; - - if (idx+n >= mappedSize) { - grow(idx+n); - } - int iN = (int)((idx + n) / bufferSize); - - for (int i = 0; i < n; ) { - int i0 = (int)((idx + i) / bufferSize); - - int bufferOffset = (int) ((idx+i) % bufferSize); - var buffer = buffers.get(i0); - - final int l; - - if (i0 < iN) l = bufferSize - bufferOffset; - else l = Math.min(n - i, bufferSize - bufferOffset); - - for (int p = 0; p < l; p++) { - buffer.put(bufferOffset + p, val); - } - - i+=l; - } - } - - - @Override - public void transferFromFileChannel(FileChannel sourceChannel, long destOffset, long sourceStart, long sourceEnd) throws IOException { - - int length = (int)(sourceEnd - sourceStart); - - if (destOffset+length >= mappedSize) { - grow(destOffset+length); - } - - int i0 = (int)((destOffset) / bufferSize); - int iN = (int)((destOffset + length) / bufferSize); - - int numBuffers = iN - i0 + 1; - ByteBuffer[] buffers = new ByteBuffer[numBuffers]; - for (int i = 0; i < numBuffers; i++) { - buffers[i] = mappedByteBuffers.get(i0 + i); - buffers[i].clear(); - } - if (i0 != iN) { - int startBuf0 = (int) ((destOffset) % bufferSize) * 8; - int endBuf0 = buffers[0].capacity() - (int) (destOffset % bufferSize) * 8; - int endBufN = (int)((destOffset + length) % bufferSize)*8; - buffers[0] = buffers[0].slice(startBuf0, endBuf0); - buffers[numBuffers-1] = buffers[numBuffers-1].slice(0, endBufN); - } - else { - buffers[0] = buffers[0].slice((int) (destOffset % bufferSize) * 8, 8*length); - } - - sourceChannel.position(sourceStart*8); - - long twb = 0; - while (twb < length * 8L) { - long rb = sourceChannel.read(buffers, 0, buffers.length); - if (rb < 0) - throw new IOException(); - twb += 
rb; - } - - } - - @Override - public long binarySearchInternal(long key, long fromIndex, int step, long n, long mask) { - if (fromIndex + n*step >= mappedSize) - grow(fromIndex + n*step); - - long low = 0; - long high = n - 1; - - if (isSameBuffer(fromIndex, fromIndex+step*n)) { - int idx = (int)(fromIndex / bufferSize); - var buffer = buffers.get(idx); - - while (low <= high) { - long mid = (low + high) >>> 1; - long off = fromIndex + mid*step; - long midVal = buffer.get((int)(off % bufferSize)) & mask; - - if (midVal < key) - low = mid + 1; - else if (midVal > key) - high = mid - 1; - else - return fromIndex + mid*step; - } - } - else { - while (low <= high) { - long mid = (low + high) >>> 1; - long off = fromIndex + mid*step; - long midVal = buffers.get((int)(off / bufferSize)).get((int)(off % bufferSize)) & mask; - - if (midVal < key) - low = mid + 1; - else if (midVal > key) - high = mid - 1; - else - return fromIndex + mid*step; - } - } - - return -1L-(fromIndex + high*step); - } - - @Override - public long binarySearchInternal(long key, long fromIndex, long n, long mask) { - if (fromIndex + n >= mappedSize) - grow(fromIndex + n); - - long low = 0; - long high = n - 1; - - if (isSameBuffer(fromIndex, fromIndex+n)) { - int idx = (int)(fromIndex / bufferSize); - var buffer = buffers.get(idx); - - while (low <= high) { - long mid = (low + high) >>> 1; - long off = fromIndex + mid; - long midVal = buffer.get((int)(off % bufferSize)) & mask; - - if (midVal < key) - low = mid + 1; - else if (midVal > key) - high = mid - 1; - else - return fromIndex + mid; - } - } - else { - while (low <= high) { - long mid = (low + high) >>> 1; - long off = fromIndex + mid; - long midVal = buffers.get((int)(off / bufferSize)).get((int)(off % bufferSize)) & mask; - - if (midVal < key) - low = mid + 1; - else if (midVal > key) - high = mid - 1; - else - return fromIndex + mid; - } - } - - return -1L-(fromIndex + high); - } - - - - @Override - public long binarySearchInternal(long key, long fromIndex, long n) { - if (fromIndex + n >= mappedSize) - grow(fromIndex + n); - - long low = 0; - long high = n - 1; - - if (isSameBuffer(fromIndex, fromIndex+n)) { - int idx = (int)(fromIndex / bufferSize); - var buffer = buffers.get(idx); - - - while (low <= high) { - long mid = (low + high) >>> 1; - long off = fromIndex + mid; - long midVal = buffer.get((int)(off % bufferSize)); - - if (midVal < key) - low = mid + 1; - else if (midVal > key) - high = mid - 1; - else - return fromIndex + mid; - } - } - else { - while (low <= high) { - long mid = (low + high) >>> 1; - long off = fromIndex + mid; - long midVal = buffers.get((int)(off / bufferSize)).get((int)(off % bufferSize)); - - if (midVal < key) - low = mid + 1; - else if (midVal > key) - high = mid - 1; - else - return fromIndex + mid; - } - } - - return -1L-(fromIndex + high); - } - - - @Override - public long binarySearchUpperInternal(long key, long fromIndex, long n) { - if (fromIndex + n >= mappedSize) - grow(fromIndex + n); - - long low = 0; - long high = n - 1; - - if (isSameBuffer(fromIndex, fromIndex+n)) { - int idx = (int)(fromIndex / bufferSize); - var buffer = buffers.get(idx); - - - while (low <= high) { - long mid = (low + high) >>> 1; - long off = fromIndex + mid; - long midVal = buffer.get((int)(off % bufferSize)); - - if (midVal < key) - low = mid + 1; - else if (midVal > key) - high = mid - 1; - else - return fromIndex + mid; - } - } - else { - while (low <= high) { - long mid = (low + high) >>> 1; - long off = fromIndex + mid; - long midVal = 
buffers.get((int)(off / bufferSize)).get((int)(off % bufferSize)); - - if (midVal < key) - low = mid + 1; - else if (midVal > key) - high = mid - 1; - else - return fromIndex + mid; - } - } - - return fromIndex + low; - } - - private boolean isSameBuffer(long a, long b) { - return a / bufferSize == b/bufferSize; - } - - @Override - public long quickSortPartition(int wordSize, long low, long high) { - if (high >= mappedSize) - grow(high + wordSize - 1); - - if (isSameBuffer(low, high + wordSize - 1)) { - // Specialization that circumvents the need for expensive calls to - // MultimapFileLong.get() in the most common scenario - - return quickSortPartitionSameBuffer(wordSize, low, high); - } - else { - return quickSortPartitionDifferentBuffers(wordSize, low, high); - } - } - - @Override - public void insertionSort(int wordSize, long start, int n) { - if (start + n + wordSize - 1 >= mappedSize) - grow(start + n + wordSize - 1); - - if (n <= 1) { - return; - } - - if (isSameBuffer(start, start + (long)n*wordSize-1L)) { - final var buffer = buffers.get((int) (start / bufferSize)); - int off = (int) (start % bufferSize); - - for (int i = 1; i < n; i++) { - long key = buffer.get(off + i * wordSize); - - int j = i - 1; - while (j >= 0 && buffer.get(off + wordSize*j) > key) { - for (int w = 0; w < wordSize; w++) { - long tmp = buffer.get(off+wordSize*j+w); - buffer.put(off+wordSize*j+w, buffer.get(off+wordSize*(j+1)+w)); - buffer.put(off+wordSize*(j+1)+w, tmp); - } - j--; - } - buffer.put(off + (j+1) * wordSize, key); - } - } - else for (int i = 1; i < n; i++) { - long key = get(start + (long) i * wordSize); - - int j = i - 1; - while (j >= 0 && get(start + (long)wordSize*j) > key) { - swapn(wordSize, start + (long)wordSize*j, start + (long)wordSize*(j+1)); - j--; - } - put(start + (long) (j+1) * wordSize, key); - } - } - - - private long quickSortPartitionDifferentBuffers(int wordSize, long low, long high) { - - long pivotPoint = ((low + high) / (2L*wordSize)) * wordSize; - long pivot = get(pivotPoint); - - long i = low - wordSize; - long j = high + wordSize; - - for (;;) { - do { - i+=wordSize; - } while (get(i) < pivot); - - do { - j-=wordSize; - } - while (get(j) > pivot); - - if (i >= j) return j; - else swapn(wordSize, i, j); - } - } - - private long quickSortPartitionSameBuffer(int wordSize, long low, long high) { - - final var buffer = buffers.get((int) (low / bufferSize)); - - final long pivotPointLong = ((low + high) / (2L*wordSize)) * wordSize; - final int pivotPoint = (int) (pivotPointLong % bufferSize); - - final long pivot = buffer.get(pivotPoint); - - int j = (int) (high % bufferSize) + wordSize; - int i = (int) (low % bufferSize) - wordSize; - - long j0 = high + wordSize - j; - - for (;;) { - do { - i+=wordSize; - } while (buffer.get(i) < pivot); - - do { - j-=wordSize; - } - while (buffer.get(j) > pivot); - - if (i >= j) return j0 + j; - else { - for (int w = 0; w < wordSize; w++) { - long tmp = buffer.get(i+w); - buffer.put(i+w, buffer.get(j+w)); - buffer.put(j+w, tmp); - } - } - } - } - - - - public void retain(BTreeQueryBuffer buffer, long boundary, long searchStart, long numEntries, long mask, int stepSize) { - - final long end = searchStart + stepSize * numEntries; - if (end < mappedSize) { - grow(end); - } - - long bv = buffer.currentValue() & mask; - long av = get(searchStart) & mask; - long pos = searchStart; - - int bi = (int)(searchStart / bufferSize); - int bo = (int)(searchStart % bufferSize); - - LongBuffer data = buffers.get(bi); - - while (bv <= boundary && 
buffer.hasMore()) { - if (bv < av) { - if (!buffer.rejectAndAdvance()) break; - bv = buffer.currentValue() & mask; - continue; - } - else if (bv == av) { - if (!buffer.retainAndAdvance()) break; - bv = buffer.currentValue() & mask; - continue; - } - - pos += stepSize; - if (pos < end) { - bo += stepSize; - if (bo >= bufferSize) { - data = buffers.get(++bi); - bo = 0; - } - av = data.get(bo) & mask; - } - else { - break; - } - } - - } - - public void reject(BTreeQueryBuffer buffer, long boundary, long searchStart, long numEntries, long mask, int stepSize) { - - final long end = searchStart + stepSize * numEntries; - if (end < mappedSize) { - grow(end); - } - - long bv = buffer.currentValue() & mask; - long av = get(searchStart) & mask; - long pos = searchStart; - - int bi = (int)(searchStart / bufferSize); - int bo = (int)(searchStart % bufferSize); - - LongBuffer data = buffers.get(bi); - - while (bv <= boundary && buffer.hasMore()) { - if (bv < av) { - if (!buffer.retainAndAdvance()) break; - bv = buffer.currentValue() & mask; - continue; - } - else if (bv == av) { - if (!buffer.rejectAndAdvance()) break; - bv = buffer.currentValue() & mask; - continue; - } - - pos += stepSize; - if (pos < end) { - bo += stepSize; - if (bo >= bufferSize) { - data = buffers.get(++bi); - bo = 0; - } - av = data.get(bo) & mask; - } - else { - break; - } - } - - } - - @Override - public void close() throws IOException { - force(); - - mappedByteBuffers.clear(); - buffers.clear(); - - channel.close(); - - // I want to believe - System.runFinalization(); - System.gc(); - } -} diff --git a/marginalia_nu/src/main/java/nu/marginalia/util/multimap/MultimapFileLongOffsetSlice.java b/marginalia_nu/src/main/java/nu/marginalia/util/multimap/MultimapFileLongOffsetSlice.java deleted file mode 100644 index d7724d79..00000000 --- a/marginalia_nu/src/main/java/nu/marginalia/util/multimap/MultimapFileLongOffsetSlice.java +++ /dev/null @@ -1,120 +0,0 @@ -package nu.marginalia.util.multimap; - -import java.io.IOException; -import java.nio.LongBuffer; -import java.nio.channels.FileChannel; - -public class MultimapFileLongOffsetSlice implements MultimapFileLongSlice { - private final long off; - private final MultimapFileLongSlice map; - - public MultimapFileLongOffsetSlice(MultimapFileLongSlice map, long off) { - this.off = off; - this.map = map; - } - - @Override - public long size() { - return map.size() - off; - } - - @Override - public void put(long idx, long val) { - map.put(off+idx, val); - } - - @Override - public void setRange(long idx, int n, long val) { - map.setRange(off+idx, n, val); - } - - @Override - public long get(long idx) { - return map.get(off+idx); - } - - @Override - public void read(long[] vals, long idx) { - map.read(vals, idx+off); - } - - @Override - public void read(long[] vals, int n, long idx) { - map.read(vals, n, idx+off); - } - - @Override - public void read(LongBuffer vals, long idx) { map.read(vals, idx+off); } - - @Override - public void write(long[] vals, long idx) { - map.write(vals, idx+off); - } - - @Override - public void write(long[] vals, int n, long idx) { - map.write(vals, n, idx+off); - } - - @Override - public void write(LongBuffer vals, long idx) { - map.write(vals, idx+off); - } - - @Override - public void write(LongBuffer vals, int n, long idx) { - map.write(vals, n,idx+off); - } - - @Override - public void swapn(int n, long idx1, long idx2) { - map.swapn(n, idx1+off, idx2+off); - } - - - @Override - public void transferFromFileChannel(FileChannel sourceChannel, long destOffset, 
long sourceStart, long sourceEnd) - throws IOException { - map.transferFromFileChannel(sourceChannel, destOffset + off, sourceStart, sourceEnd); - } - - @Override - public MultimapFileLongSlice atOffset(long off) { - // If we don't override this, the default implementation would build a pyramid of - // MultimapFileLongSlice(MultimapFileLongSlice(MultimapFileLongSlice(MultimapFileLongSlice(MultimapFileLongSlice(...))) - // if this is called iteratively (e.g. to walk over a file) - - return new MultimapFileLongOffsetSlice(map, this.off + off); - } - - @Override - public long binarySearchInternal(long key, long fromIndex, int step, long n, long mask) { - throw new UnsupportedOperationException(); - } - - @Override - public long binarySearchInternal(long key, long fromIndex, long n, long mask) { - throw new UnsupportedOperationException(); - } - - @Override - public long binarySearchInternal(long key, long fromIndex, long n) { - throw new UnsupportedOperationException(); - } - - @Override - public long binarySearchUpperInternal(long key, long fromIndex, long n) { - throw new UnsupportedOperationException(); - - } - - @Override - public long quickSortPartition(int wordSize, long low, long highInclusive) { - return map.quickSortPartition(wordSize, low+off, highInclusive+off); - } - - @Override - public void insertionSort(int wordSize, long start, int n) { - map.insertionSort(wordSize, start+off, n); - } -} diff --git a/marginalia_nu/src/main/java/nu/marginalia/util/multimap/MultimapFileLongSlice.java b/marginalia_nu/src/main/java/nu/marginalia/util/multimap/MultimapFileLongSlice.java deleted file mode 100644 index 14f43169..00000000 --- a/marginalia_nu/src/main/java/nu/marginalia/util/multimap/MultimapFileLongSlice.java +++ /dev/null @@ -1,47 +0,0 @@ -package nu.marginalia.util.multimap; - -import java.io.IOException; -import java.nio.LongBuffer; -import java.nio.channels.FileChannel; - -public interface MultimapFileLongSlice { - long size(); - - void put(long idx, long val); - - void setRange(long idx, int n, long val); - - long get(long idx); - - void read(long[] vals, long idx); - - void read(long[] vals, int n, long idx); - - void read(LongBuffer vals, long idx); - - void write(long[] vals, long idx); - - void write(long[] vals, int n, long idx); - - void write(LongBuffer vals, long idx); - - void write(LongBuffer vals, int n, long idx); - - void swapn(int n, long idx1, long idx2); - - void transferFromFileChannel(FileChannel sourceChannel, long destOffset, long sourceStart, long sourceEnd) throws IOException; - - default MultimapFileLongSlice atOffset(long off) { - return new MultimapFileLongOffsetSlice(this, off); - } - long binarySearchInternal(long key, long fromIndex, int step, long n, long mask); - long binarySearchInternal(long key, long fromIndex, long n, long mask); - - long binarySearchInternal(long key, long fromIndex, long n); - - long binarySearchUpperInternal(long key, long fromIndex, long n); - - long quickSortPartition(int wordSize, long low, long highInclusive); - - void insertionSort(int wordSize, long start, int n); -} diff --git a/marginalia_nu/src/main/java/nu/marginalia/util/multimap/MultimapSearcher.java b/marginalia_nu/src/main/java/nu/marginalia/util/multimap/MultimapSearcher.java deleted file mode 100644 index cc7d5a13..00000000 --- a/marginalia_nu/src/main/java/nu/marginalia/util/multimap/MultimapSearcher.java +++ /dev/null @@ -1,80 +0,0 @@ -package nu.marginalia.util.multimap; - -public interface MultimapSearcher { - long binarySearchLower(long key, long 
fromIndex, long n); - long binarySearch(long key, long fromIndex, long n); - - static MultimapSearcher forContext(MultimapFileLongSlice slice, long mask, int stepSize) { - if (mask == ~0L && stepSize == 1) { - return new SimpleMultimapSearcher(new MultimapSearcherBase(slice)); - } - else if (stepSize == 1) { - return new MaskedMultimapSearcher(new MultimapSearcherBase(slice), mask); - } - else { - return new SteppingMaskedMultimapSearcher(new MultimapSearcherBase(slice), mask, stepSize); - } - } -} - -class SimpleMultimapSearcher implements MultimapSearcher { - private final MultimapSearcherBase base; - - SimpleMultimapSearcher(MultimapSearcherBase base) { - this.base = base; - } - - @Override - public long binarySearchLower(long key, long fromIndex, long n) { - return base.binarySearchLower(key, fromIndex, n); - } - - @Override - public long binarySearch(long key, long fromIndex, long n) { - return base.binarySearch(key, fromIndex, n); - } -} - - -class MaskedMultimapSearcher implements MultimapSearcher { - private final MultimapSearcherBase base; - private final long mask; - - MaskedMultimapSearcher(MultimapSearcherBase base, long mask) { - this.base = base; - this.mask = mask; - } - - @Override - public long binarySearchLower(long key, long fromIndex, long n) { - return base.binarySearchLower(key, fromIndex, n, mask); - } - - @Override - public long binarySearch(long key, long fromIndex, long n) { - return base.binarySearch(key, fromIndex, n, mask); - } -} - - -class SteppingMaskedMultimapSearcher implements MultimapSearcher { - private final MultimapSearcherBase base; - private final long mask; - private final int step; - - SteppingMaskedMultimapSearcher(MultimapSearcherBase base, long mask, int step) { - this.base = base; - this.mask = mask; - this.step = step; - } - - @Override - public long binarySearchLower(long key, long fromIndex, long n) { - return base.binarySearchLower(key, fromIndex, step, n, mask); - } - - @Override - public long binarySearch(long key, long fromIndex, long n) { - return base.binarySearch(key, fromIndex, step, n, mask); - } -} \ No newline at end of file diff --git a/marginalia_nu/src/main/java/nu/marginalia/util/multimap/MultimapSearcherBase.java b/marginalia_nu/src/main/java/nu/marginalia/util/multimap/MultimapSearcherBase.java deleted file mode 100644 index ed1665df..00000000 --- a/marginalia_nu/src/main/java/nu/marginalia/util/multimap/MultimapSearcherBase.java +++ /dev/null @@ -1,86 +0,0 @@ -package nu.marginalia.util.multimap; - -import lombok.experimental.Delegate; - -public class MultimapSearcherBase { - @Delegate - private final MultimapFileLongSlice mmf; - - public MultimapSearcherBase(MultimapFileLongSlice mmf) { - this.mmf = mmf; - } - - public boolean binarySearchTest(long key, long fromIndex, long n) { - - long low = 0; - long high = n - 1; - - while (low <= high) { - long mid = (low + high) >>> 1; - long midVal = get(fromIndex + mid); - - if (midVal < key) - low = mid + 1; - else if (midVal > key) - high = mid - 1; - else - return true; - } - return false; - } - - public long binarySearchLower(long key, long fromIndex, long n) { - return mmf.binarySearchUpperInternal(key, fromIndex, n); - } - - - public long binarySearchLower(long key, long fromIndex, long n, long mask) { - long low = 0; - long high = n - 1; - - while (low <= high) { - long mid = (low + high) >>> 1; - long midVal = get(fromIndex + mid) & mask; - - if (midVal < key) - low = mid + 1; - else if (midVal > key) - high = mid - 1; - else - return fromIndex + mid; - } - return 
fromIndex + low; - } - - - public long binarySearchLower(long key, long fromIndex, int step, long n, long mask) { - long low = 0; - long high = n - 1; - - while (low <= high) { - long mid = (low + high) >>> 1; - long midVal = get(fromIndex + mid*step) & mask; - - if (midVal < key) - low = mid + 1; - else if (midVal > key) - high = mid - 1; - else - return fromIndex + mid*step; - } - return fromIndex + low*step; - } - - public long binarySearch(long key, long fromIndex, long n) { - return mmf.binarySearchInternal(key, fromIndex, n); - } - - - public long binarySearch(long key, long fromIndex, long n, long mask) { - return mmf.binarySearchInternal(key, fromIndex, n, mask); - } - - public long binarySearch(long key, long fromIndex, int step, long n, long mask) { - return mmf.binarySearchInternal(key, fromIndex, step, n, mask); - } -} diff --git a/marginalia_nu/src/main/java/nu/marginalia/util/multimap/MultimapSorter.java b/marginalia_nu/src/main/java/nu/marginalia/util/multimap/MultimapSorter.java deleted file mode 100644 index abdeb52f..00000000 --- a/marginalia_nu/src/main/java/nu/marginalia/util/multimap/MultimapSorter.java +++ /dev/null @@ -1,151 +0,0 @@ -package nu.marginalia.util.multimap; - -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -import java.io.IOException; -import java.io.RandomAccessFile; -import java.nio.ByteBuffer; -import java.nio.LongBuffer; -import java.nio.channels.FileChannel; -import java.nio.file.Files; -import java.nio.file.Path; - -import static nu.marginalia.util.multimap.MultimapFileLong.WORD_SIZE; - -public class MultimapSorter { - private final Path tmpFileDir; - private final MultimapFileLongSlice multimapFileLong; - private final LongBuffer buffer; - private final int internalSortLimit; - private final int wordSize; - - private static final Logger logger = LoggerFactory.getLogger(MultimapSorter.class); - - public MultimapSorter(MultimapFileLongSlice multimapFileLong, Path tmpFileDir, int internalSortLimit, int wordSize) { - this.multimapFileLong = multimapFileLong; - this.tmpFileDir = tmpFileDir; - this.internalSortLimit = internalSortLimit; - this.wordSize = wordSize; - buffer = ByteBuffer.allocateDirect(internalSortLimit * wordSize * 8).asLongBuffer(); - } - - public void sortRange(long start, long end) throws IOException { - if (end - start < internalSortLimit) { - quickSortLH(start, end - wordSize); - } - else { - mergeSort(start, (int) (end - start)); - } - - if (MultimapSorter.class.desiredAssertionStatus()) { - for (long lp = start + wordSize; lp < end; lp += wordSize) { - if (multimapFileLong.get(lp - wordSize) > multimapFileLong.get(lp)) { - - logger.error("Sort contract breached [{}:{} ({}), ws={}, = end || bufferL < bufferR)) { - workBuffer.put(putPos, bufferL); - for (int s = 1; s < wordSize; s++) { - workBuffer.put(putPos + s, multimapFileLong.get(offset + idxL + s)); - } - idxL+= wordSize; - } - else { - workBuffer.put(putPos, bufferR); - for (int s = 1; s < wordSize; s++) { - workBuffer.put(putPos + s, multimapFileLong.get(offset + idxR + s)); - } - idxR+= wordSize; - } - } - } - - public void insertionSort(long start, int n) { - multimapFileLong.insertionSort(wordSize, start, n); - } - - private void swap(long a, long b) { - multimapFileLong.swapn(wordSize, a, b); - } - - public void quickSort(long start, long length) { - quickSortLH(start, start + length - wordSize); - - } - public void quickSortLH(long low, long highInclusive) { - - if (low >= 0 && highInclusive >= 0 && low < highInclusive) { - - if (highInclusive - low < 32) { 
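quickSortLH, whose body continues below, is a classic hybrid sort: ranges under 32 words fall back to insertion sort, larger ones are split by a Hoare-style partition (quickSortPartition above) and recursed on both sides. An in-memory analogue over a plain long[] with one word per record, for illustration only:

```java
// Hybrid quicksort mirroring the deleted quickSortLH, one long per record.
static void quickSortLH(long[] a, int low, int highInclusive) {
    if (low < 0 || highInclusive < 0 || low >= highInclusive)
        return;
    if (highInclusive - low < 32) {
        insertionSort(a, low, highInclusive);       // small range: cheap and cache-friendly
    } else {
        int p = hoarePartition(a, low, highInclusive);
        quickSortLH(a, low, p);                      // pivot index may land in either half
        quickSortLH(a, p + 1, highInclusive);
    }
}

static int hoarePartition(long[] a, int low, int high) {
    long pivot = a[(low + high) >>> 1];
    int i = low - 1, j = high + 1;
    for (;;) {
        do { i++; } while (a[i] < pivot);
        do { j--; } while (a[j] > pivot);
        if (i >= j) return j;
        long tmp = a[i]; a[i] = a[j]; a[j] = tmp;
    }
}

static void insertionSort(long[] a, int low, int highInclusive) {
    for (int i = low + 1; i <= highInclusive; i++) {
        long key = a[i];
        int j = i - 1;
        while (j >= low && a[j] > key) { a[j + 1] = a[j]; j--; }
        a[j + 1] = key;
    }
}
```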
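The binarySearchInternal methods above follow the Arrays.binarySearch convention for misses: instead of a bare -1, the unit-step variants return -1 minus the index of the last element below the key, so a caller can recover the insertion point without a second pass. A sketch of the decoding (slice here stands in for any of the deleted slice types):

```java
long r = slice.binarySearchInternal(key, fromIndex, n);
if (r >= 0) {
    // r is the absolute index of a match
} else {
    long insertionPoint = -r;  // == (index of last element < key) + 1,
                               // i.e. where 'key' would have to be inserted
}
```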
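retain() and reject() further up are merge joins between the sorted query buffer and a sorted key range in the file: whichever side holds the smaller value advances, and equality decides whether the buffer entry is kept. A minimal in-memory analogue of retain(); reject() is the same loop with the keep/drop branches swapped:

```java
import java.util.Arrays;

// Keep only the values of 'buffer' that also occur in 'sorted';
// both inputs are assumed sorted ascending, as in the deleted retain().
static long[] retain(long[] buffer, long[] sorted) {
    long[] out = new long[buffer.length];
    int n = 0, i = 0, j = 0;
    while (i < buffer.length && j < sorted.length) {
        if      (buffer[i] < sorted[j]) i++;          // miss: drop the buffer value
        else if (buffer[i] > sorted[j]) j++;          // advance the file side
        else out[n++] = buffer[i++];                  // hit: retain
    }
    return Arrays.copyOf(out, n);
}
```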
- multimapFileLong.insertionSort(wordSize, low, (int) ((wordSize + highInclusive - low) / wordSize)); - } - else { - long p = multimapFileLong.quickSortPartition(wordSize, low, highInclusive); - - quickSortLH(low, p); - quickSortLH(p + wordSize, highInclusive); - } - } - } - -} diff --git a/marginalia_nu/src/main/java/nu/marginalia/util/ranking/RankingAlgorithm.java b/marginalia_nu/src/main/java/nu/marginalia/util/ranking/RankingAlgorithm.java index 4d255087..ca6f7b62 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/util/ranking/RankingAlgorithm.java +++ b/marginalia_nu/src/main/java/nu/marginalia/util/ranking/RankingAlgorithm.java @@ -4,17 +4,18 @@ import gnu.trove.list.TIntList; import gnu.trove.list.array.TIntArrayList; import gnu.trove.map.hash.TIntIntHashMap; import gnu.trove.map.hash.TIntObjectHashMap; +import it.unimi.dsi.fastutil.ints.IntArrays; import it.unimi.dsi.fastutil.ints.IntComparator; -import nu.marginalia.wmsa.configuration.module.DatabaseModule; -import nu.marginalia.wmsa.edge.data.dao.task.EdgeDomainBlacklistImpl; +import org.roaringbitmap.RoaringBitmap; import org.slf4j.Logger; import org.slf4j.LoggerFactory; -import java.io.IOException; -import java.util.*; +import java.util.Arrays; +import java.util.Comparator; +import java.util.HashSet; +import java.util.Set; import java.util.function.IntToDoubleFunction; import java.util.stream.IntStream; -import it.unimi.dsi.fastutil.ints.IntArrays; public abstract class RankingAlgorithm { protected final TIntObjectHashMap domainsById = new TIntObjectHashMap<>(); @@ -154,7 +155,7 @@ public abstract class RankingAlgorithm { } - public TIntList pageRank(int resultCount) { + public RoaringBitmap pageRank(int resultCount) { RankVector rank = new RankVector(1.d / domainsById.size()); int iter_max = 100; @@ -176,7 +177,7 @@ public abstract class RankingAlgorithm { return rank.getRanking(resultCount); } - public TIntList pageRankWithPeripheralNodes(int resultCount) { + public RoaringBitmap pageRankWithPeripheralNodes(int resultCount) { RankVector rank = new RankVector(1.d / domainsById.size()); int iter_max = 100; @@ -303,7 +304,7 @@ public abstract class RankingAlgorithm { return list; } - public TIntList getRanking(int numResults) { + public RoaringBitmap getRanking(int numResults) { if (numResults < 0) { numResults = domainIdToIndex.size(); } @@ -311,7 +312,7 @@ public abstract class RankingAlgorithm { numResults = rank.length; } - TIntArrayList list = new TIntArrayList(numResults); + RoaringBitmap list = new RoaringBitmap(); int[] nodes = new int[rank.length]; Arrays.setAll(nodes, i->i); diff --git a/marginalia_nu/src/main/java/nu/marginalia/util/ranking/RankingDomainFetcher.java b/marginalia_nu/src/main/java/nu/marginalia/util/ranking/RankingDomainFetcher.java index da47434f..1c2e6849 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/util/ranking/RankingDomainFetcher.java +++ b/marginalia_nu/src/main/java/nu/marginalia/util/ranking/RankingDomainFetcher.java @@ -2,7 +2,7 @@ package nu.marginalia.util.ranking; import com.google.inject.Inject; import com.zaxxer.hikari.HikariDataSource; -import nu.marginalia.wmsa.edge.data.dao.task.EdgeDomainBlacklistImpl; +import nu.marginalia.wmsa.edge.dbcommon.EdgeDomainBlacklistImpl; import nu.marginalia.wmsa.edge.model.crawl.EdgeDomainIndexingState; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -87,9 +87,9 @@ public class RankingDomainFetcher { public void domainsByPattern(String pattern, IntConsumer idConsumer) { try (var conn = dataSource.getConnection(); - var stmt = 
conn.prepareStatement("SELECT ID FROM EC_DOMAIN WHERE DOMAIN_NAME LIKE ?")) { - stmt.setString(1, pattern); - var rsp = stmt.executeQuery(); + var stmt = conn.createStatement()) { + // This is sourced from a config file --v + var rsp = stmt.executeQuery("SELECT ID FROM EC_DOMAIN WHERE DOMAIN_NAME LIKE '" + pattern + "'"); while (rsp.next()) { idConsumer.accept(rsp.getInt(1)); } diff --git a/marginalia_nu/src/main/java/nu/marginalia/util/ranking/tool/PerusePageRankV2.java b/marginalia_nu/src/main/java/nu/marginalia/util/ranking/tool/PerusePageRankV2.java index b8b31c8c..89c1dfb9 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/util/ranking/tool/PerusePageRankV2.java +++ b/marginalia_nu/src/main/java/nu/marginalia/util/ranking/tool/PerusePageRankV2.java @@ -14,7 +14,7 @@ import nu.marginalia.util.ranking.RankingAlgorithm; import nu.marginalia.util.ranking.RankingDomainData; import nu.marginalia.util.ranking.RankingDomainFetcher; import nu.marginalia.wmsa.configuration.module.DatabaseModule; -import nu.marginalia.wmsa.edge.data.dao.task.EdgeDomainBlacklistImpl; +import nu.marginalia.wmsa.edge.dbcommon.EdgeDomainBlacklistImpl; import org.jetbrains.annotations.NotNull; import org.slf4j.Logger; import org.slf4j.LoggerFactory; diff --git a/marginalia_nu/src/main/java/nu/marginalia/util/ranking/tool/UpdateDomainRanksTool.java b/marginalia_nu/src/main/java/nu/marginalia/util/ranking/tool/UpdateDomainRanksTool.java index f80d307f..e251092f 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/util/ranking/tool/UpdateDomainRanksTool.java +++ b/marginalia_nu/src/main/java/nu/marginalia/util/ranking/tool/UpdateDomainRanksTool.java @@ -5,7 +5,7 @@ import lombok.SneakyThrows; import nu.marginalia.util.ranking.BuggyStandardPageRank; import nu.marginalia.util.ranking.RankingDomainFetcher; import nu.marginalia.wmsa.configuration.module.DatabaseModule; -import nu.marginalia.wmsa.edge.data.dao.task.EdgeDomainBlacklistImpl; +import nu.marginalia.wmsa.edge.dbcommon.EdgeDomainBlacklistImpl; import org.mariadb.jdbc.Driver; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -51,14 +51,14 @@ public class UpdateDomainRanksTool { rankMax = spr.size()*2; uploader.start(); - spr.pageRankWithPeripheralNodes(rankMax).forEach(i -> { + var rankData = spr.pageRankWithPeripheralNodes(rankMax); + for (int i : rankData) { try { uploadQueue.put(i); } catch (InterruptedException e) { e.printStackTrace(); } - return true; - }); + } long end = System.currentTimeMillis(); running = false; diff --git a/marginalia_nu/src/main/java/nu/marginalia/util/ranking/tool/UpdateDomainRanksTool2.java b/marginalia_nu/src/main/java/nu/marginalia/util/ranking/tool/UpdateDomainRanksTool2.java index 05159ba9..55f16a5a 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/util/ranking/tool/UpdateDomainRanksTool2.java +++ b/marginalia_nu/src/main/java/nu/marginalia/util/ranking/tool/UpdateDomainRanksTool2.java @@ -5,7 +5,7 @@ import lombok.SneakyThrows; import nu.marginalia.util.ranking.BetterReversePageRank; import nu.marginalia.util.ranking.RankingDomainFetcher; import nu.marginalia.wmsa.configuration.module.DatabaseModule; -import nu.marginalia.wmsa.edge.data.dao.task.EdgeDomainBlacklistImpl; +import nu.marginalia.wmsa.edge.dbcommon.EdgeDomainBlacklistImpl; import org.mariadb.jdbc.Driver; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -41,14 +41,14 @@ public class UpdateDomainRanksTool2 { rankMax = rpr.size(); uploader.start(); - rpr.pageRankWithPeripheralNodes(rankMax).forEach(i -> { + var rankData = 
rpr.pageRankWithPeripheralNodes(rankMax);
+        for (int i : rankData) {
             try {
                 uploadQueue.put(i);
             } catch (InterruptedException e) {
                 e.printStackTrace();
             }
-            return true;
-        });
+        }
 
         long end = System.currentTimeMillis();
         running = false;
diff --git a/marginalia_nu/src/main/java/nu/marginalia/util/tool/EdgeDomainLinkConsineSimilarityMain.java b/marginalia_nu/src/main/java/nu/marginalia/util/tool/EdgeDomainLinkConsineSimilarityMain.java
index 122587e6..56c9c65b 100644
--- a/marginalia_nu/src/main/java/nu/marginalia/util/tool/EdgeDomainLinkConsineSimilarityMain.java
+++ b/marginalia_nu/src/main/java/nu/marginalia/util/tool/EdgeDomainLinkConsineSimilarityMain.java
@@ -7,7 +7,7 @@ import gnu.trove.set.hash.TIntHashSet;
 import lombok.SneakyThrows;
 import nu.marginalia.util.AndCardIntSet;
 import nu.marginalia.wmsa.configuration.module.DatabaseModule;
-import nu.marginalia.wmsa.edge.data.dao.EdgeDataStoreDaoImpl;
+import nu.marginalia.wmsa.edge.dbcommon.EdgeDataStoreDaoImpl;
 import nu.marginalia.wmsa.edge.model.EdgeDomain;
 import nu.marginalia.wmsa.edge.model.id.EdgeId;
 import org.roaringbitmap.RoaringBitmap;
diff --git a/marginalia_nu/src/main/java/nu/marginalia/util/tool/EdgeWordWordConsineSimilarityMain.java b/marginalia_nu/src/main/java/nu/marginalia/util/tool/EdgeWordWordConsineSimilarityMain.java
new file mode 100644
index 00000000..8e71b26d
--- /dev/null
+++ b/marginalia_nu/src/main/java/nu/marginalia/util/tool/EdgeWordWordConsineSimilarityMain.java
@@ -0,0 +1,246 @@
+package nu.marginalia.util.tool;
+
+import it.unimi.dsi.fastutil.ints.IntOpenHashSet;
+import it.unimi.dsi.fastutil.ints.IntSet;
+import it.unimi.dsi.fastutil.objects.Object2IntOpenHashMap;
+import lombok.SneakyThrows;
+import nu.marginalia.util.AndCardIntSet;
+import org.roaringbitmap.RoaringBitmap;
+
+import java.io.IOException;
+import java.nio.charset.StandardCharsets;
+import java.nio.file.Files;
+import java.nio.file.Path;
+import java.util.*;
+import java.util.function.Consumer;
+import java.util.stream.Collectors;
+import java.util.stream.IntStream;
+
+import static nu.marginalia.util.AndCardIntSet.andCardinality;
+import static nu.marginalia.util.AndCardIntSet.weightedProduct;
+
+public class EdgeWordWordConsineSimilarityMain {
+    final Object2IntOpenHashMap<String> stringIds;
+    final AndCardIntSet[] dToSMap;
+    final float[] weights;
+    final boolean useWeights = false;
+
+    enum Direction {
+        S_TO_D,
+        D_TO_S
+    }
+
+    final Direction direction = Direction.D_TO_S;
+
+    public EdgeWordWordConsineSimilarityMain(Path dataFile) throws IOException {
+        System.out.println("String IDs");
+        stringIds = mapStringsToIds(dataFile);
+
+        System.out.println("DtoS Map");
+        dToSMap = constructDtoSMap(dataFile, stringIds);
+
+        System.out.println("Weights");
+
+        if (useWeights) {
+            weights = new float[stringIds.size()];
+            for (int i = 0; i < stringIds.size(); i++) {
+                weights[i] = getWeight(i);
+            }
+        }
+        else {
+            weights = null;
+        }
+
+        System.out.println("Ready");
+    }
+
+    private Object2IntOpenHashMap<String> mapStringsToIds(Path dataFile) throws IOException {
+        Object2IntOpenHashMap<String> stringIds = new Object2IntOpenHashMap<>(15_000_000);
+
+        try (var lines = Files.lines(dataFile, StandardCharsets.UTF_8)) {
+            lines.forEach(line -> {
+                int tab = line.indexOf('\t');
+                if (tab <= 0)
+                    return;
+
+                // direction doesn't matter here
+                String from = line.substring(0, tab);
+                String to = line.substring(tab + 1);
+
+                stringIds.putIfAbsent(from, stringIds.size());
+                stringIds.putIfAbsent(to, stringIds.size());
+            });
+        }
+        return stringIds;
+    }
+
+    private AndCardIntSet[] constructDtoSMap(Path dataFile, Object2IntOpenHashMap<String> stringIds) throws IOException {
+        Map<Integer, RoaringBitmap> tmpMap = new HashMap<>(15_000_000);
+
+        try (var lines = Files.lines(dataFile, StandardCharsets.UTF_8)) {
+            lines.forEach(line -> {
+                int tab = line.indexOf('\t');
+                if (tab <= 0) return;
+
+                String from, to;
+                if (direction == Direction.S_TO_D) {
+                    from = line.substring(0, tab);
+                    to = line.substring(tab + 1);
+                }
+                else {
+                    from = line.substring(tab + 1);
+                    to = line.substring(0, tab);
+                }
+
+                tmpMap.computeIfAbsent(stringIds.getInt(to), this::createBitmapWithSelf).add(stringIds.getInt(from));
+            });
+        }
+
+        AndCardIntSet[] dToSMap = new AndCardIntSet[stringIds.size()];
+        tmpMap.entrySet().stream()
+                .filter(e -> isEligible(e.getValue()))
+                .forEach(e -> dToSMap[e.getKey()] = AndCardIntSet.of(e.getValue()));
+
+        return dToSMap;
+    }
+
+    private boolean isEligible(RoaringBitmap value) {
+        int cardinality = value.getCardinality();
+
+        return cardinality > 50;
+    }
+
+    @SneakyThrows
+    public void tryDomains(String... word) {
+
+        System.out.println(Arrays.toString(word));
+
+        int[] domainIds = Arrays.stream(word).mapToInt(stringIds::getInt).toArray();
+
+        long start = System.currentTimeMillis();
+        findAdjacentDtoS(new IntOpenHashSet(domainIds), similarities -> {
+            Set<Integer> ids = similarities.similarities().stream().map(Similarity::id).collect(Collectors.toSet());
+
+            Map<Integer, String> reveseIds = new HashMap<>(similarities.similarities.size());
+
+            stringIds.forEach((str, id) -> {
+                if (ids.contains(id)) {
+                    reveseIds.put(id, str);
+                }
+            });
+
+            for (var similarity : similarities.similarities()) {
+                System.out.println(reveseIds.get(similarity.id) + "\t" + dToSMap[similarity.id].getCardinality() + "\t" + prettyPercent(similarity.value));
+            }
+        });
+
+        System.out.println(System.currentTimeMillis() - start);
+    }
+
+    private String prettyPercent(double val) {
+        return String.format("%2.2f%%", 100. * val);
+    }
+
+
+    public RoaringBitmap createBitmapWithSelf(int val) {
+        var bm = new RoaringBitmap();
+        bm.add(val);
+        return bm;
+    }
+
+    double cosineSimilarity(AndCardIntSet a, AndCardIntSet b) {
+        double andCardinality = andCardinality(a, b);
+        andCardinality /= Math.sqrt(a.getCardinality());
+        andCardinality /= Math.sqrt(b.getCardinality());
+        return andCardinality;
+    }
+
+    double expensiveCosineSimilarity(AndCardIntSet a, AndCardIntSet b) {
+        return weightedProduct(weights, a, b) / Math.sqrt(a.mulAndSum(weights) * b.mulAndSum(weights));
+    }
+
+    float getWeight(int i) {
+        var vector = dToSMap[i];
+
+        if (vector == null) return 1.0f;
+        return 1.0f / (float) Math.log(2+vector.getCardinality());
+    }
+
+    record Similarities(int id, List<Similarity> similarities) {};
+    record Similarity(int id, double value) {};
+
+    @SneakyThrows
+    private void findAdjacentDtoS(IntSet ids, Consumer<Similarities> andThen) {
+
+
+        AndCardIntSet[] vectors = ids.intStream().mapToObj(id -> dToSMap[id]).toArray(AndCardIntSet[]::new);
+        for (var vector : vectors) {
+            if (null == vector)
+                return;
+        }
+
+        var vector = Arrays.stream(vectors).reduce(AndCardIntSet::and).orElseThrow();
+
+        List<Similarity> similarities = IntStream.range(0, dToSMap.length).parallel().mapToObj(
+                id -> vectorSimilarity(ids, vector, id))
+                .filter(Objects::nonNull)
+                .sorted(Comparator.comparing(Similarity::value))
+                .toList();
+
+
+        andThen.accept(new Similarities(0, similarities));
+    }
+
+    double cardinalityLimit = 0.1;
+
+    private Similarity vectorSimilarity(IntSet ids, AndCardIntSet vector, int id) {
+
+        /* The minimum cardinality a vector can have so that
+         *
+         *  a (x) b
+         *  ------- < k   is given by k^2
+         *  |a||b|
+         *
+         */
+        final double cardMin = Math.min(2, cardinalityLimit * cardinalityLimit * vector.getCardinality());
+
+        if (ids.contains(id) || id >= dToSMap.length)
+            return null;
+
+        var otherVec = dToSMap[id];
+        if (otherVec == null || otherVec.getCardinality() < cardMin)
+            return null;
+
+        double similarity = cosineSimilarity(vector, otherVec);
+
+        if (similarity > 0.1) {
+            if (useWeights) {
+                var recalculated = expensiveCosineSimilarity(vector, otherVec);
+                if (recalculated > 0.1) {
+                    return new Similarity(id, recalculated);
+                }
+            }
+            else {
+                return new Similarity(id, similarity);
+            }
+        }
+
+        return null;
+    }
+
+    public static void main(String[] args) throws IOException {
+
+        var main = new EdgeWordWordConsineSimilarityMain(Path.of(args[0]));
+
+        for (;;) {
+            String line = System.console().readLine("Words> ");
+            if (line == null || line.isBlank()) {
+                break;
+            }
+
+            main.tryDomains(line.split("\\s+"));
+        }
+    }
+
+}
diff --git a/marginalia_nu/src/main/java/nu/marginalia/util/tool/WikipediaInternalLinkExtractorMain.java b/marginalia_nu/src/main/java/nu/marginalia/util/tool/WikipediaInternalLinkExtractorMain.java
new file mode 100644
index 00000000..f4f9b3dc
--- /dev/null
+++ b/marginalia_nu/src/main/java/nu/marginalia/util/tool/WikipediaInternalLinkExtractorMain.java
@@ -0,0 +1,43 @@
+package nu.marginalia.util.tool;
+
+import nu.marginalia.wmsa.edge.integration.wikipedia.WikipediaReader;
+import nu.marginalia.wmsa.edge.model.EdgeDomain;
+import org.jsoup.Jsoup;
+
+import java.util.HashSet;
+import java.util.Set;
+
+public class WikipediaInternalLinkExtractorMain {
+    public static void main(String... args) throws InterruptedException {
+        new WikipediaReader(args[0], new EdgeDomain("en.wikipedia.org"), wikipediaArticle -> {
+
+
+            var doc = Jsoup.parse(wikipediaArticle.body);
+            String path = wikipediaArticle.url.path.substring("/wiki/".length());
+
+            if (isIncluded(path)) {
+                Set<String> seen = new HashSet<>(100);
+
+                for (var atag : doc.getElementsByTag("a")) {
+                    String href = atag.attr("href");
+
+                    if (href.contains("#")) {
+                        href = href.substring(0, href.indexOf('#'));
+                    }
+
+                    if (isIncluded(href) && href.length() > 2 && seen.add(href)) {
+                        System.out.println(path + "\t" + href);
+                    }
+                }
+            }
+
+        }).join();
+    }
+
+    private static boolean isIncluded(String href) {
+        return !href.contains(":")
+                && !href.contains("/")
+                && !href.contains("%")
+                && !href.startsWith("#");
+    }
+}
diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/api/model/ApiSearchResult.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/api/model/ApiSearchResult.java
index a58bd1be..4c301ef1 100644
--- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/api/model/ApiSearchResult.java
+++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/api/model/ApiSearchResult.java
@@ -2,6 +2,7 @@ package nu.marginalia.wmsa.api.model;
 
 import lombok.AllArgsConstructor;
 import lombok.Getter;
+import nu.marginalia.wmsa.edge.index.model.EdgePageWordMetadata;
 import nu.marginalia.wmsa.edge.model.search.EdgeSearchResultKeywordScore;
 import nu.marginalia.wmsa.edge.model.search.EdgeUrlDetails;
 
@@ -33,12 +34,12 @@
         for (var entries : bySet.values()) {
             List<ApiSearchResultQueryDetails> lst = new ArrayList<>();
             for (var entry : entries) {
-                var metadata = entry.metadata();
+                var metadata = new EdgePageWordMetadata(entry.encodedWordMetadata());
                 if (metadata.isEmpty())
                     continue outer;
 
-                Set<String> flags = metadata.flags().stream().map(Object::toString).collect(Collectors.toSet());
-                lst.add(new ApiSearchResultQueryDetails(entry.keyword(), metadata.tfIdf(),metadata.count(), flags));
+                Set<String> flags = metadata.flagSet().stream().map(Object::toString).collect(Collectors.toSet());
+                lst.add(new ApiSearchResultQueryDetails(entry.keyword(), metadata.tfIdf(), metadata.count(), flags));
             }
             details.add(lst);
         }
diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/configuration/ServiceDescriptor.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/configuration/ServiceDescriptor.java
index 46aebfc7..8c472f01 100644
--- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/configuration/ServiceDescriptor.java
+++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/configuration/ServiceDescriptor.java
@@ -14,7 +14,6 @@ import nu.marginalia.wmsa.podcasts.PodcastScraperMain;
 import nu.marginalia.wmsa.renderer.RendererMain;
 import nu.marginalia.wmsa.resource_store.ResourceStoreMain;
 import nu.marginalia.wmsa.smhi.scraper.SmhiScraperMain;
-import org.apache.logging.log4j.core.lookup.MainMapLookup;
 
 import java.util.Map;
 import java.util.stream.Collectors;
 
@@ -78,7 +77,6 @@ }
     public static void main(String...
args) { - MainMapLookup.setMainArguments(args); Map functions = Stream.of( new ListCommand(), new StartCommand(), diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/assistant/screenshot/ScreenshotService.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/assistant/screenshot/ScreenshotService.java index 18778496..46324c1c 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/assistant/screenshot/ScreenshotService.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/assistant/screenshot/ScreenshotService.java @@ -4,7 +4,7 @@ import com.google.common.base.Strings; import com.google.inject.Inject; import com.zaxxer.hikari.HikariDataSource; import lombok.SneakyThrows; -import nu.marginalia.wmsa.edge.data.dao.EdgeDataStoreDao; +import nu.marginalia.wmsa.edge.dbcommon.EdgeDataStoreDao; import nu.marginalia.wmsa.edge.model.EdgeDomain; import nu.marginalia.wmsa.edge.model.id.EdgeId; import org.apache.commons.io.IOUtils; diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/ConversionLog.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/ConversionLog.java index f666474b..b05ec3ad 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/ConversionLog.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/ConversionLog.java @@ -6,6 +6,7 @@ import nu.marginalia.wmsa.edge.converting.interpreter.instruction.DocumentKeywor import nu.marginalia.wmsa.edge.converting.interpreter.instruction.DomainLink; import nu.marginalia.wmsa.edge.converting.interpreter.instruction.LoadProcessedDocument; import nu.marginalia.wmsa.edge.converting.interpreter.instruction.LoadProcessedDocumentWithError; +import nu.marginalia.wmsa.edge.index.model.EdgePageDocumentsMetadata; import nu.marginalia.wmsa.edge.model.EdgeDomain; import nu.marginalia.wmsa.edge.model.EdgeUrl; import nu.marginalia.wmsa.edge.model.crawl.EdgeDomainIndexingState; @@ -62,7 +63,7 @@ public class ConversionLog implements AutoCloseable, Interpreter { } @Override - public void loadKeywords(EdgeUrl url, DocumentKeywords[] words) {} + public void loadKeywords(EdgeUrl url, EdgePageDocumentsMetadata metadata, DocumentKeywords words) {} @Override public void loadDomainRedirect(DomainLink link) {} diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/ConverterModule.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/ConverterModule.java index 168b3f47..745452be 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/ConverterModule.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/ConverterModule.java @@ -27,7 +27,6 @@ public class ConverterModule extends AbstractModule { bind(Gson.class).toInstance(createGson()); bind(Double.class).annotatedWith(Names.named("min-document-quality")).toInstance(-15.); - bind(Double.class).annotatedWith(Names.named("min-avg-document-quality")).toInstance(-25.); bind(Integer.class).annotatedWith(Names.named("min-document-length")).toInstance(250); bind(Integer.class).annotatedWith(Names.named("max-title-length")).toInstance(128); bind(Integer.class).annotatedWith(Names.named("max-summary-length")).toInstance(255); diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/LoadInstructionWriter.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/LoadInstructionWriter.java index 90813636..b21fa138 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/LoadInstructionWriter.java 
+++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/LoadInstructionWriter.java @@ -8,6 +8,7 @@ import nu.marginalia.wmsa.edge.converting.interpreter.instruction.DocumentKeywor import nu.marginalia.wmsa.edge.converting.interpreter.instruction.DomainLink; import nu.marginalia.wmsa.edge.converting.interpreter.instruction.LoadProcessedDocument; import nu.marginalia.wmsa.edge.converting.interpreter.instruction.LoadProcessedDocumentWithError; +import nu.marginalia.wmsa.edge.index.model.EdgePageDocumentsMetadata; import nu.marginalia.wmsa.edge.model.EdgeDomain; import nu.marginalia.wmsa.edge.model.EdgeUrl; import nu.marginalia.wmsa.edge.model.crawl.EdgeDomainIndexingState; @@ -120,7 +121,8 @@ public class LoadInstructionWriter { } @Override - public void loadKeywords(EdgeUrl url, DocumentKeywords[] words) {} + public void loadKeywords(EdgeUrl url, EdgePageDocumentsMetadata metadata, DocumentKeywords words) { + } @Override public void loadDomainRedirect(DomainLink link) {} diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/ReindexTriggerMain.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/ReindexTriggerMain.java index d1c8db01..9d67ae36 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/ReindexTriggerMain.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/ReindexTriggerMain.java @@ -8,6 +8,8 @@ import okhttp3.Request; import okhttp3.RequestBody; import okio.BufferedSink; import org.jetbrains.annotations.Nullable; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; import java.io.IOException; import java.net.URL; @@ -15,9 +17,9 @@ import java.nio.charset.Charset; import java.sql.SQLException; import java.util.concurrent.TimeUnit; -import static nu.marginalia.wmsa.edge.index.EdgeIndexService.DYNAMIC_BUCKET_LENGTH; public class ReindexTriggerMain { + private static final Logger logger = LoggerFactory.getLogger(ReindexTriggerMain.class); public static void main(String... 
args) throws IOException, SQLException { var db = new DatabaseModule(); @@ -28,6 +30,7 @@ public class ReindexTriggerMain { .followRedirects(true) .build(); + logger.info("Updating statistics"); var updateStatistics = new UpdateDomainStatistics(db.provideConnection()); updateStatistics.run(); @@ -45,15 +48,10 @@ public class ReindexTriggerMain { } }; + logger.info("Repartitioning"); client.newCall(new Request.Builder().post(rb).url(new URL("http", args[0], ServiceDescriptor.EDGE_INDEX.port, "/ops/repartition")).build()).execute(); - - if (!Boolean.getBoolean("no-preconvert")) { - client.newCall(new Request.Builder().post(rb).url(new URL("http", args[0], ServiceDescriptor.EDGE_INDEX.port, "/ops/preconvert")).build()).execute(); - } - - for (int i = 0; i < DYNAMIC_BUCKET_LENGTH+1; i++) { - client.newCall(new Request.Builder().post(rb).url(new URL("http", args[0], ServiceDescriptor.EDGE_INDEX.port, "/ops/reindex/" + i)).build()).execute(); - } + logger.info("Reindexing"); + client.newCall(new Request.Builder().post(rb).url(new URL("http", args[0], ServiceDescriptor.EDGE_INDEX.port, "/ops/reindex")).build()).execute(); } diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/compiler/DocumentsCompiler.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/compiler/DocumentsCompiler.java index e5b18c6a..9f35a557 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/compiler/DocumentsCompiler.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/compiler/DocumentsCompiler.java @@ -7,8 +7,6 @@ import nu.marginalia.wmsa.edge.converting.interpreter.instruction.LoadProcessedD import nu.marginalia.wmsa.edge.converting.interpreter.instruction.LoadProcessedDocumentWithError; import nu.marginalia.wmsa.edge.converting.model.ProcessedDocument; import nu.marginalia.wmsa.edge.converting.processor.logic.HtmlFeature; -import nu.marginalia.wmsa.edge.index.model.IndexBlockType; -import nu.marginalia.wmsa.edge.model.crawl.EdgePageWords; import java.util.List; @@ -41,18 +39,8 @@ public class DocumentsCompiler { var words = doc.words; if (words != null) { - - var wordsArray = words.values().stream() - .filter(this::filterNonTransients) - .map(DocumentKeywords::new) - .toArray(DocumentKeywords[]::new); - - ret.add(new LoadKeywords(doc.url, wordsArray)); + ret.add(new LoadKeywords(doc.url, doc.details.metadata, new DocumentKeywords(words))); } } - private boolean filterNonTransients(EdgePageWords words) { - return words.block.type != IndexBlockType.TRANSIENT; - } - } diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/interpreter/Interpreter.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/interpreter/Interpreter.java index c0698dde..5e9c3e4d 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/interpreter/Interpreter.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/interpreter/Interpreter.java @@ -4,6 +4,7 @@ import nu.marginalia.wmsa.edge.converting.interpreter.instruction.DocumentKeywor import nu.marginalia.wmsa.edge.converting.interpreter.instruction.DomainLink; import nu.marginalia.wmsa.edge.converting.interpreter.instruction.LoadProcessedDocument; import nu.marginalia.wmsa.edge.converting.interpreter.instruction.LoadProcessedDocumentWithError; +import nu.marginalia.wmsa.edge.index.model.EdgePageDocumentsMetadata; import nu.marginalia.wmsa.edge.model.EdgeDomain; import nu.marginalia.wmsa.edge.model.EdgeUrl; import 
nu.marginalia.wmsa.edge.model.crawl.EdgeDomainIndexingState; @@ -18,7 +19,7 @@ public interface Interpreter { void loadProcessedDocument(LoadProcessedDocument loadProcessedDocument); void loadProcessedDocumentWithError(LoadProcessedDocumentWithError loadProcessedDocumentWithError); - void loadKeywords(EdgeUrl url, DocumentKeywords[] words); + void loadKeywords(EdgeUrl url, EdgePageDocumentsMetadata metadata, DocumentKeywords words); void loadDomainRedirect(DomainLink link); } diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/interpreter/instruction/DocumentKeywords.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/interpreter/instruction/DocumentKeywords.java index 27e16c5b..7faefb2e 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/interpreter/instruction/DocumentKeywords.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/interpreter/instruction/DocumentKeywords.java @@ -1,18 +1,16 @@ package nu.marginalia.wmsa.edge.converting.interpreter.instruction; import nu.marginalia.wmsa.edge.index.model.EdgePageWordMetadata; -import nu.marginalia.wmsa.edge.index.model.IndexBlock; import nu.marginalia.wmsa.edge.model.crawl.EdgePageWords; import java.util.Arrays; -public record DocumentKeywords(IndexBlock block, +public record DocumentKeywords( String[] keywords, long[] metadata) { public DocumentKeywords(EdgePageWords words) { - this(words.block, - words.words.toArray(String[]::new), + this(words.words.toArray(String[]::new), words.metadata.toArray()); } @@ -20,7 +18,7 @@ public record DocumentKeywords(IndexBlock block, public String toString() { StringBuilder sb = new StringBuilder(); sb.append(getClass().getSimpleName()); - sb.append('[').append(block).append(", "); + sb.append('['); for (int i = 0; i < keywords.length; i++) { sb.append("\n\t "); if (metadata[i] != 0) { @@ -42,6 +40,6 @@ public record DocumentKeywords(IndexBlock block, } public DocumentKeywords subList(int start, int end) { - return new DocumentKeywords(block, Arrays.copyOfRange(keywords, start, end), Arrays.copyOfRange(metadata, start, end)); + return new DocumentKeywords(Arrays.copyOfRange(keywords, start, end), Arrays.copyOfRange(metadata, start, end)); } } diff --git a/marginalia_nu/src/main/java/nu/marginalia/util/ListChunker.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/interpreter/instruction/KeywordListChunker.java similarity index 85% rename from marginalia_nu/src/main/java/nu/marginalia/util/ListChunker.java rename to marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/interpreter/instruction/KeywordListChunker.java index 0c99d7a1..1e30055f 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/util/ListChunker.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/interpreter/instruction/KeywordListChunker.java @@ -1,12 +1,10 @@ -package nu.marginalia.util; - -import nu.marginalia.wmsa.edge.converting.interpreter.instruction.DocumentKeywords; +package nu.marginalia.wmsa.edge.converting.interpreter.instruction; import java.util.ArrayList; import java.util.Collections; import java.util.List; -public class ListChunker { +public class KeywordListChunker { /** Chops data into a list of lists of max length size * diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/interpreter/instruction/LoadKeywords.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/interpreter/instruction/LoadKeywords.java index 7f12bf67..106f02b7 100644 --- 
a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/interpreter/instruction/LoadKeywords.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/interpreter/instruction/LoadKeywords.java @@ -3,20 +3,19 @@ package nu.marginalia.wmsa.edge.converting.interpreter.instruction; import nu.marginalia.wmsa.edge.converting.interpreter.Instruction; import nu.marginalia.wmsa.edge.converting.interpreter.InstructionTag; import nu.marginalia.wmsa.edge.converting.interpreter.Interpreter; +import nu.marginalia.wmsa.edge.index.model.EdgePageDocumentsMetadata; import nu.marginalia.wmsa.edge.model.EdgeUrl; -import java.util.Arrays; - -public record LoadKeywords(EdgeUrl url, DocumentKeywords... words) implements Instruction { +public record LoadKeywords(EdgeUrl url, EdgePageDocumentsMetadata metadata, DocumentKeywords words) implements Instruction { @Override public void apply(Interpreter interpreter) { - interpreter.loadKeywords(url, words); + interpreter.loadKeywords(url, metadata, words); } @Override public boolean isNoOp() { - return words.length == 0; + return false; } @Override @@ -26,7 +25,7 @@ public record LoadKeywords(EdgeUrl url, DocumentKeywords... words) implements In @Override public String toString() { - return getClass().getSimpleName()+"["+ Arrays.toString(words)+"]"; + return getClass().getSimpleName()+"["+ words+"]"; } } diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/loader/IndexLoadKeywords.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/loader/IndexLoadKeywords.java index ae118236..d12aa7ea 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/loader/IndexLoadKeywords.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/loader/IndexLoadKeywords.java @@ -5,6 +5,7 @@ import lombok.SneakyThrows; import nu.marginalia.wmsa.configuration.server.Context; import nu.marginalia.wmsa.edge.converting.interpreter.instruction.DocumentKeywords; import nu.marginalia.wmsa.edge.index.client.EdgeIndexWriterClient; +import nu.marginalia.wmsa.edge.index.model.EdgePageDocumentsMetadata; import nu.marginalia.wmsa.edge.model.EdgeUrl; import nu.marginalia.wmsa.edge.model.id.EdgeId; import org.slf4j.Logger; @@ -19,7 +20,7 @@ public class IndexLoadKeywords implements Runnable { private final LinkedBlockingQueue insertQueue = new LinkedBlockingQueue<>(32); private final EdgeIndexWriterClient client; - private record InsertTask(int urlId, int domainId, DocumentKeywords wordSet) {} + private record InsertTask(int urlId, int domainId, EdgePageDocumentsMetadata metadata, DocumentKeywords wordSet) {} private final Thread runThread; private volatile boolean canceled = false; @@ -38,7 +39,7 @@ public class IndexLoadKeywords implements Runnable { while (!canceled) { var data = insertQueue.poll(1, TimeUnit.SECONDS); if (data != null) { - client.putWords(Context.internal(), new EdgeId<>(data.domainId), new EdgeId<>(data.urlId), data.wordSet, index); + client.putWords(Context.internal(), new EdgeId<>(data.domainId), new EdgeId<>(data.urlId), data.metadata(), data.wordSet, index); } } } @@ -48,7 +49,7 @@ public class IndexLoadKeywords implements Runnable { runThread.join(); } - public void load(LoaderData loaderData, EdgeUrl url, DocumentKeywords[] words) throws InterruptedException { + public void load(LoaderData loaderData, EdgeUrl url, EdgePageDocumentsMetadata metadata, DocumentKeywords words) throws InterruptedException { int domainId = loaderData.getDomainId(url.domain); int urlId = 
loaderData.getUrlId(url); @@ -57,8 +58,6 @@ public class IndexLoadKeywords implements Runnable { return; } - for (var ws : words) { - insertQueue.put(new InsertTask(urlId, domainId, ws)); - } + insertQueue.put(new InsertTask(urlId, domainId, metadata, words)); } } diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/loader/Loader.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/loader/Loader.java index 50de0d2c..ba55ea10 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/loader/Loader.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/loader/Loader.java @@ -5,6 +5,7 @@ import nu.marginalia.wmsa.edge.converting.interpreter.instruction.DocumentKeywor import nu.marginalia.wmsa.edge.converting.interpreter.instruction.DomainLink; import nu.marginalia.wmsa.edge.converting.interpreter.instruction.LoadProcessedDocument; import nu.marginalia.wmsa.edge.converting.interpreter.instruction.LoadProcessedDocumentWithError; +import nu.marginalia.wmsa.edge.index.model.EdgePageDocumentsMetadata; import nu.marginalia.wmsa.edge.model.EdgeDomain; import nu.marginalia.wmsa.edge.model.EdgeUrl; import nu.marginalia.wmsa.edge.model.crawl.EdgeDomainIndexingState; @@ -108,8 +109,8 @@ public class Loader implements Interpreter { } @Override - public void loadKeywords(EdgeUrl url, DocumentKeywords[] words) { - logger.debug("loadKeywords(#{})", words.length); + public void loadKeywords(EdgeUrl url, EdgePageDocumentsMetadata metadata, DocumentKeywords words) { + logger.debug("loadKeywords()"); // This is a bit of a bandaid safeguard against a bug in // in the converter, shouldn't be necessary in the future @@ -124,7 +125,7 @@ public class Loader implements Interpreter { } try { - indexLoadKeywords.load(data, url, words); + indexLoadKeywords.load(data, url, metadata, words); } catch (InterruptedException e) { throw new RuntimeException(e); } diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/model/ProcessedDocument.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/model/ProcessedDocument.java index 67e0f0df..df3367cc 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/model/ProcessedDocument.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/model/ProcessedDocument.java @@ -2,7 +2,7 @@ package nu.marginalia.wmsa.edge.converting.model; import lombok.ToString; import nu.marginalia.wmsa.edge.model.EdgeUrl; -import nu.marginalia.wmsa.edge.model.crawl.EdgePageWordSet; +import nu.marginalia.wmsa.edge.model.crawl.EdgePageWords; import nu.marginalia.wmsa.edge.model.crawl.EdgeUrlState; import java.util.OptionalDouble; @@ -12,7 +12,7 @@ public class ProcessedDocument { public EdgeUrl url; public ProcessedDocumentDetails details; - public EdgePageWordSet words; + public EdgePageWords words; public EdgeUrlState state; public String stateReason; diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/model/ProcessedDocumentDetails.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/model/ProcessedDocumentDetails.java index 25afe126..29b2ecc3 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/model/ProcessedDocumentDetails.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/model/ProcessedDocumentDetails.java @@ -2,6 +2,7 @@ package nu.marginalia.wmsa.edge.converting.model; import lombok.ToString; import nu.marginalia.wmsa.edge.converting.processor.logic.HtmlFeature; 
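The EdgePageDocumentsMetadata value being threaded through the surrounding hunks condenses per-document signals (crawl depth, publication year byte, quality penalty, document flags) into a compact record. A minimal sketch of that kind of bit-packing, assuming hypothetical 8-bit fields; the real layout lives in EdgePageDocumentsMetadata, is not shown in this diff, and may differ:

import java.util.EnumSet;

// Sketch only: packs assumed 8-bit fields into one long, mirroring the general
// idea behind EdgePageDocumentsMetadata; field names and widths are illustrative.
final class DocumentMetadataSketch {
    enum Flag { SIMPLE }

    static long encode(int depth, int yearByte, int quality, EnumSet<Flag> flags) {
        long flagBits = flags.contains(Flag.SIMPLE) ? 1L : 0L;
        return ((long) (depth & 0xFF) << 24)
             | ((long) (yearByte & 0xFF) << 16)
             | ((long) (quality & 0xFF) << 8)
             | flagBits;
    }

    static int yearByte(long encoded) {
        return (int) (encoded >>> 16) & 0xFF;
    }
}

Note how the converter below passes (int) -ret.quality: quality scores are non-positive in this pipeline (min-document-quality is -15 above), so negating yields a small positive magnitude that fits such a narrow field.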
+import nu.marginalia.wmsa.edge.index.model.EdgePageDocumentsMetadata; import nu.marginalia.wmsa.edge.model.EdgeUrl; import nu.marginalia.wmsa.edge.model.crawl.EdgeHtmlStandard; @@ -27,4 +28,6 @@ public class ProcessedDocumentDetails { public List linksInternal; public List linksExternal; public List feedLinks; + + public EdgePageDocumentsMetadata metadata; } diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/DocumentProcessor.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/DocumentProcessor.java index 7c953074..339389fe 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/DocumentProcessor.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/DocumentProcessor.java @@ -3,6 +3,8 @@ package nu.marginalia.wmsa.edge.converting.processor; import com.google.common.hash.HashCode; import com.google.inject.Inject; import com.google.inject.name.Named; +import nu.marginalia.util.gregex.GuardedRegex; +import nu.marginalia.util.gregex.GuardedRegexFactory; import nu.marginalia.util.language.LanguageFilter; import nu.marginalia.util.language.processing.DocumentKeywordExtractor; import nu.marginalia.util.language.processing.SentenceExtractor; @@ -18,11 +20,12 @@ import nu.marginalia.wmsa.edge.converting.processor.logic.pubdate.PubDateSniffer import nu.marginalia.wmsa.edge.crawling.model.CrawledDocument; import nu.marginalia.wmsa.edge.crawling.model.CrawledDomain; import nu.marginalia.wmsa.edge.crawling.model.CrawlerDocumentStatus; -import nu.marginalia.wmsa.edge.index.model.IndexBlock; +import nu.marginalia.wmsa.edge.index.model.EdgePageDocumentFlags; +import nu.marginalia.wmsa.edge.index.model.EdgePageDocumentsMetadata; import nu.marginalia.wmsa.edge.model.EdgeDomain; import nu.marginalia.wmsa.edge.model.EdgeUrl; import nu.marginalia.wmsa.edge.model.crawl.EdgeHtmlStandard; -import nu.marginalia.wmsa.edge.model.crawl.EdgePageWordSet; +import nu.marginalia.wmsa.edge.model.crawl.EdgePageWords; import nu.marginalia.wmsa.edge.model.crawl.EdgeUrlState; import org.jsoup.Jsoup; import org.jsoup.nodes.Document; @@ -128,10 +131,10 @@ public class DocumentProcessor { ret.url = getDocumentUrl(crawledDocument); ret.state = crawlerStatusToUrlState(crawledDocument.crawlerStatus, crawledDocument.httpStatus); - var detailsWithWordsLinks = createDetails(crawledDomain, crawledDocument); + var detailsWithWords = createDetails(crawledDomain, crawledDocument); - ret.details = detailsWithWordsLinks.details(); - ret.words = detailsWithWordsLinks.words(); + ret.details = detailsWithWords.details(); + ret.words = detailsWithWords.words(); } @@ -212,12 +215,11 @@ public class DocumentProcessor { ret.quality = documentValuator.getQuality(crawledDocument, ret.standard, doc, dld); ret.hashCode = HashCode.fromString(crawledDocument.documentBodyHash).asLong(); - - KeywordMetadata keywordMetadata = new KeywordMetadata(ret.quality); + KeywordMetadata keywordMetadata = new KeywordMetadata(); PubDate pubDate; - EdgePageWordSet words; - if (shouldDoSimpleProcessing(url, ret)) { + EdgePageWords words; + if (shouldDoSimpleProcessing(url, dld, ret)) { /* Some documents we'll index, but only superficially. This is a compromise to allow them to be discoverable, without having them show up without specific queries. This also saves a lot of processing power. 
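The GuardedRegexFactory calls introduced in the hunks below replace hand-rolled "cheap substring test, then regex" pairs such as the mastodon feed check. A minimal sketch of the guarded-regex idea, assuming a plain Predicate-based design (the actual nu.marginalia.util.gregex API is not shown in this diff):

import java.util.function.Predicate;
import java.util.regex.Pattern;

// Sketch: only run the compiled regex when a cheap string test passes first.
final class GuardedRegexSketch {
    static Predicate<String> startsWith(String prefix, String regex) {
        Pattern p = Pattern.compile(regex);
        return s -> s.startsWith(prefix) && p.matcher(s).find();
    }

    public static void main(String[] args) {
        var mastodonFeed = startsWith("/@", "^/@[^/]+/?$");
        System.out.println(mastodonFeed.test("/@user/"));    // true
        System.out.println(mastodonFeed.test("/blog/post")); // false, regex never runs
    }
}

The guard matters because String.startsWith is a few branch instructions while even a compiled Pattern has per-call overhead; the comment removed below cites roughly a 50x difference.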
@@ -227,6 +229,9 @@ public class DocumentProcessor { ret.description = ""; pubDate = pubDateSniffer.getPubDate(crawledDocument.headers, url, doc, ret.standard, false); + + ret.metadata = new EdgePageDocumentsMetadata(url.depth(), pubDate.yearByte(), 0, (int) -ret.quality, EnumSet.of(EdgePageDocumentFlags.Simple)); + } else { ret.features = featureExtractor.getFeatures(crawledDomain, doc, dld); @@ -234,6 +239,8 @@ public class DocumentProcessor { ret.description = getDescription(doc); pubDate = pubDateSniffer.getPubDate(crawledDocument.headers, url, doc, ret.standard, true); + + ret.metadata = new EdgePageDocumentsMetadata(url.depth(), pubDate.yearByte(), 0, (int) -ret.quality, EnumSet.noneOf(EdgePageDocumentFlags.class)); } addMetaWords(ret, url, pubDate, crawledDomain, words); @@ -247,11 +254,15 @@ public class DocumentProcessor { return new DetailsWithWords(ret, words); } - private boolean shouldDoSimpleProcessing(EdgeUrl url, ProcessedDocumentDetails ret) { + private static final GuardedRegex mastodonFeedRegex = GuardedRegexFactory.startsWith("/@", "^/@[^/]+/?$"); + + private boolean shouldDoSimpleProcessing(EdgeUrl url, DocumentLanguageData dld, ProcessedDocumentDetails ret) { if (ret.quality < minDocumentQuality) { return true; } - + if (dld.totalNumWords() < minDocumentLength) { + return true; + } // These pages shouldn't be publicly accessible if ("phpinfo()".equals(ret.title)) { return true; @@ -261,9 +272,7 @@ public class DocumentProcessor { // we don't want to index them because they change so rapidly; subdirectories are // fine though // - // The first startsWith criteria is a performance optimization, even with a compiled - // pattern it is something like 50x faster - if (url.path.startsWith("/@") && url.path.matches("^/@[^/]+/?$")) { + if (mastodonFeedRegex.test(url.path)) { return true; } @@ -274,7 +283,7 @@ public class DocumentProcessor { return false; } - private void addMetaWords(ProcessedDocumentDetails ret, EdgeUrl url, PubDate pubDate, CrawledDomain domain, EdgePageWordSet words) { + private void addMetaWords(ProcessedDocumentDetails ret, EdgeUrl url, PubDate pubDate, CrawledDomain domain, EdgePageWords words) { List tagWords = new ArrayList<>(); var edgeDomain = url.domain; @@ -285,6 +294,8 @@ public class DocumentProcessor { tagWords.add("site:" + edgeDomain.domain.toLowerCase()); } + tagWords.add("tld:" + edgeDomain.getTld()); + tagWords.add("proto:"+url.proto.toLowerCase()); tagWords.add("js:" + Boolean.toString(ret.features.contains(HtmlFeature.JS)).toLowerCase()); @@ -301,10 +312,10 @@ public class DocumentProcessor { tagWords.add("pub:" + pubDate.dateIso8601()); } - words.appendWithNoMeta(IndexBlock.Meta, tagWords); + words.addAllSyntheticTerms(tagWords); } - private void getLinks(EdgeUrl baseUrl, ProcessedDocumentDetails ret, Document doc, EdgePageWordSet words) { + private void getLinks(EdgeUrl baseUrl, ProcessedDocumentDetails ret, Document doc, EdgePageWords words) { final LinkProcessor lp = new LinkProcessor(ret, baseUrl); @@ -339,17 +350,17 @@ public class DocumentProcessor { createFileLinkKeywords(words, lp, domain); } - private void createLinkKeywords(EdgePageWordSet words, LinkProcessor lp) { + private void createLinkKeywords(EdgePageWords words, LinkProcessor lp) { final Set linkTerms = new HashSet<>(); for (var fd : lp.getForeignDomains()) { linkTerms.add("links:"+fd.toString().toLowerCase()); linkTerms.add("links:"+fd.getDomain().toLowerCase()); } - words.appendWithNoMeta(IndexBlock.Meta, linkTerms); + words.addAllSyntheticTerms(linkTerms); } - 
private void createFileLinkKeywords(EdgePageWordSet words, LinkProcessor lp, EdgeDomain domain) { + private void createFileLinkKeywords(EdgePageWords words, LinkProcessor lp, EdgeDomain domain) { Set fileKeywords = new HashSet<>(100); for (var link : lp.getNonIndexableUrls()) { @@ -361,7 +372,7 @@ public class DocumentProcessor { } - words.appendWithNoMeta(IndexBlock.Artifacts, fileKeywords); + words.addAllSyntheticTerms(fileKeywords); } private void synthesizeFilenameKeyword(Set fileKeywords, EdgeUrl link) { @@ -383,10 +394,6 @@ public class DocumentProcessor { } private void checkDocumentLanguage(DocumentLanguageData dld) throws DisqualifiedException { - if (dld.totalNumWords() < minDocumentLength) { - throw new DisqualifiedException(DisqualificationReason.LENGTH); - } - double languageAgreement = languageFilter.dictionaryAgreement(dld); if (languageAgreement < 0.1) { throw new DisqualifiedException(DisqualificationReason.LANGUAGE); @@ -411,6 +418,6 @@ public class DocumentProcessor { } private record DetailsWithWords(ProcessedDocumentDetails details, - EdgePageWordSet words) {} + EdgePageWords words) {} } diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/DomainProcessor.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/DomainProcessor.java index 4e04e3fa..d9ff7ef1 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/DomainProcessor.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/DomainProcessor.java @@ -2,44 +2,34 @@ package nu.marginalia.wmsa.edge.converting.processor; import com.google.common.base.Strings; import com.google.inject.Inject; -import com.google.inject.name.Named; -import nu.marginalia.wmsa.edge.converting.model.DisqualifiedException; +import nu.marginalia.util.StringPool; import nu.marginalia.wmsa.edge.converting.model.ProcessedDomain; -import nu.marginalia.wmsa.edge.converting.processor.logic.CommonKeywordExtractor; import nu.marginalia.wmsa.edge.converting.processor.logic.InternalLinkGraph; import nu.marginalia.wmsa.edge.crawling.model.CrawledDocument; import nu.marginalia.wmsa.edge.crawling.model.CrawledDomain; import nu.marginalia.wmsa.edge.crawling.model.CrawlerDomainStatus; -import nu.marginalia.wmsa.edge.index.model.EdgePageWordFlags; -import nu.marginalia.wmsa.edge.index.model.IndexBlock; -import nu.marginalia.wmsa.edge.index.model.IndexBlockType; import nu.marginalia.wmsa.edge.model.EdgeDomain; import nu.marginalia.wmsa.edge.model.EdgeUrl; import nu.marginalia.wmsa.edge.model.crawl.EdgeDomainIndexingState; import java.util.*; -import java.util.stream.Collectors; import static nu.marginalia.wmsa.edge.crawling.model.CrawlerDocumentStatus.BAD_CANONICAL; public class DomainProcessor { - private static final CommonKeywordExtractor commonKeywordExtractor = new CommonKeywordExtractor(); - private final DocumentProcessor documentProcessor; - private final Double minAvgDocumentQuality; - - + private final SiteWords siteWords; @Inject public DomainProcessor(DocumentProcessor documentProcessor, - @Named("min-avg-document-quality") Double minAvgDocumentQuality - ) { + SiteWords siteWords) { this.documentProcessor = documentProcessor; - this.minAvgDocumentQuality = minAvgDocumentQuality; + this.siteWords = siteWords; } public ProcessedDomain process(CrawledDomain crawledDomain) { var ret = new ProcessedDomain(); + ret.domain = new EdgeDomain(crawledDomain.domain); ret.ip = crawledDomain.ip; @@ -52,35 +42,37 @@ public class DomainProcessor { 
fixBadCanonicalTags(crawledDomain.doc); - InternalLinkGraph internalLinkGraph = new InternalLinkGraph(); + StringPool stringPool = new StringPool(1000 + 100 * crawledDomain.doc.size()); - DocumentDisqualifier disqualifier = new DocumentDisqualifier(); for (var doc : crawledDomain.doc) { - if (disqualifier.isQualified()) { - var processedDoc = documentProcessor.process(doc, crawledDomain); + var processedDoc = documentProcessor.process(doc, crawledDomain); - if (processedDoc.url != null) { - ret.documents.add(processedDoc); + if (processedDoc.words != null) { + // The word data is extremely redundant, and may encompass something like + // 5,000,000 words per domain (and multiple domains are processed at the same time). - internalLinkGraph.accept(processedDoc); - - processedDoc.quality().ifPresent(disqualifier::offer); - } - else if ("LANGUAGE".equals(processedDoc.stateReason)) { - disqualifier.offer(-100); - } + processedDoc.words.internalize(stringPool::internalize); } - else { // Short-circuit processing if quality is too low - var stub = documentProcessor.makeDisqualifiedStub(doc); - stub.stateReason = DisqualifiedException.DisqualificationReason.SHORT_CIRCUIT.toString(); - if (stub.url != null) { - ret.documents.add(stub); - } + + if (processedDoc.url != null) { + ret.documents.add(processedDoc); } + } - flagCommonSiteWords(ret); - flagAdjacentSiteWords(internalLinkGraph, ret); + stringPool.flush(); + + InternalLinkGraph internalLinkGraph = new InternalLinkGraph(); + + ret.documents.forEach(internalLinkGraph::accept); + ret.documents.forEach(doc -> { + if (doc.details != null && doc.details.metadata != null) { + doc.details.metadata = doc.details.metadata.withSize(internalLinkGraph.numKnownUrls()); + } + }); + + siteWords.flagCommonSiteWords(ret); + siteWords.flagAdjacentWords(internalLinkGraph, ret); } else { @@ -92,70 +84,6 @@ public class DomainProcessor { return ret; } - private void flagCommonSiteWords(ProcessedDomain processedDomain) { - Set commonSiteWords = new HashSet<>(10); - - commonSiteWords.addAll(commonKeywordExtractor.getCommonSiteWords(processedDomain, IndexBlock.Tfidf_High, IndexBlock.Subjects)); - commonSiteWords.addAll(commonKeywordExtractor.getCommonSiteWords(processedDomain, IndexBlock.Title)); - - if (commonSiteWords.isEmpty()) { - return; - } - - for (var doc : processedDomain.documents) { - if (doc.words != null) { - for (var block : IndexBlock.values()) { - if (block.type == IndexBlockType.PAGE_DATA) { - doc.words.get(block).setFlagOnMetadataForWords(EdgePageWordFlags.Site, commonSiteWords); - } - } - } - } - } - - private void flagAdjacentSiteWords(InternalLinkGraph internalLinkGraph, ProcessedDomain processedDomain) { - var invertedGraph = internalLinkGraph.trimAndInvert(); - - Map> linkedKeywords = new HashMap<>(100); - - invertedGraph.forEach((url, linkingUrls) -> { - Map keywords = new HashMap<>(100); - - for (var linkingUrl : linkingUrls) { - for (var keyword : internalLinkGraph.getKeywords(linkingUrl)) { - keywords.merge(keyword, 1, Integer::sum); - } - } - - var words = keywords.entrySet().stream() - .filter(e -> e.getValue() > 3) - .map(Map.Entry::getKey) - .filter(internalLinkGraph.getCandidateKeywords(url)::contains) - .collect(Collectors.toSet()); - if (!words.isEmpty()) { - linkedKeywords.put(url, words); - } - }); - - for (var doc : processedDomain.documents) { - if (doc.words == null) - continue; - - final Set keywords = linkedKeywords.get(doc.url); - if (keywords == null) - continue; - - for (var block : IndexBlock.values()) { - if (block.type == 
IndexBlockType.PAGE_DATA) { - doc.words.get(block).setFlagOnMetadataForWords(EdgePageWordFlags.SiteAdjacent, keywords); - } - } - } - - - } - - private void fixBadCanonicalTags(List docs) { Map> seenCanonicals = new HashMap<>(); Set seenUrls = new HashSet<>(); @@ -164,7 +92,8 @@ public class DomainProcessor { // this removes such links from consideration for (var document : docs) { - if (!Strings.isNullOrEmpty(document.canonicalUrl) && !Objects.equals(document.canonicalUrl, document.url)) { + if (!Strings.isNullOrEmpty(document.canonicalUrl) + && !Objects.equals(document.canonicalUrl, document.url)) { seenCanonicals.computeIfAbsent(document.canonicalUrl, url -> new HashSet<>()).add(document.documentBodyHash); } seenUrls.add(document.url); @@ -201,7 +130,9 @@ public class DomainProcessor { Optional cUrl = EdgeUrl.parse(document.canonicalUrl); Optional dUrl = EdgeUrl.parse(document.url); - if (cUrl.isPresent() && dUrl.isPresent() && !Objects.equals(cUrl.get().domain, dUrl.get().domain)) { + if (cUrl.isPresent() && dUrl.isPresent() + && !Objects.equals(cUrl.get().domain, dUrl.get().domain)) + { document.canonicalUrl = document.url; } } @@ -216,20 +147,4 @@ public class DomainProcessor { }; } - class DocumentDisqualifier { - int count; - int goodCount; - - void offer(double quality) { - count++; - if (quality > minAvgDocumentQuality) { - goodCount++; - } - } - - boolean isQualified() { - return true; -// return count < 25 || goodCount*10 >= count; - } - } } diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/SiteWords.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/SiteWords.java new file mode 100644 index 00000000..b5a5191f --- /dev/null +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/SiteWords.java @@ -0,0 +1,85 @@ +package nu.marginalia.wmsa.edge.converting.processor; + +import it.unimi.dsi.fastutil.objects.Object2IntOpenHashMap; +import nu.marginalia.wmsa.edge.converting.model.ProcessedDocument; +import nu.marginalia.wmsa.edge.converting.model.ProcessedDomain; +import nu.marginalia.wmsa.edge.converting.processor.logic.CommonKeywordExtractor; +import nu.marginalia.wmsa.edge.converting.processor.logic.InternalLinkGraph; +import nu.marginalia.wmsa.edge.index.model.EdgePageWordFlags; +import nu.marginalia.wmsa.edge.model.EdgeUrl; + +import javax.inject.Singleton; +import java.util.HashMap; +import java.util.HashSet; +import java.util.Map; +import java.util.Set; +import java.util.stream.Collectors; + +@Singleton +public class SiteWords { + + private static final CommonKeywordExtractor commonKeywordExtractor = new CommonKeywordExtractor(); + + public void flagAdjacentWords(InternalLinkGraph internalLinkGraph, ProcessedDomain processedDomain) { + Map> linkedKeywords = getAdjacentWords(internalLinkGraph); + + for (var doc : processedDomain.documents) { + applyKeywordsToDoc(doc, EdgePageWordFlags.SiteAdjacent, linkedKeywords.get(doc.url)); + } + + } + + public void flagCommonSiteWords(ProcessedDomain processedDomain) { + Set commonSiteWords = new HashSet<>(10); + + commonSiteWords.addAll(commonKeywordExtractor.getCommonSiteWords(processedDomain, + EdgePageWordFlags.Subjects, + EdgePageWordFlags.TfIdfHigh)); + + commonSiteWords.addAll(commonKeywordExtractor.getCommonSiteWords(processedDomain, + EdgePageWordFlags.Title)); + + if (commonSiteWords.isEmpty()) { + return; + } + + for (var doc : processedDomain.documents) { + applyKeywordsToDoc(doc, EdgePageWordFlags.Site, commonSiteWords); + } + } + + private 
Map<EdgeUrl, Set<String>> getAdjacentWords(InternalLinkGraph internalLinkGraph) {
+
+        final Map<EdgeUrl, Set<EdgeUrl>> invertedGraph = internalLinkGraph.trimAndInvert();
+        final Map<EdgeUrl, Set<String>> linkedKeywords = new HashMap<>(100);
+
+        invertedGraph.forEach((url, linkingUrls) -> {
+            Object2IntOpenHashMap<String> keywords = new Object2IntOpenHashMap<>(100);
+
+            for (var linkingUrl : linkingUrls) {
+                for (var keyword : internalLinkGraph.getKeywords(linkingUrl)) {
+                    keywords.mergeInt(keyword, 1, Integer::sum);
+                }
+            }
+
+            var words = keywords.object2IntEntrySet().stream()
+                    .filter(e -> e.getIntValue() > 3)
+                    .map(Map.Entry::getKey)
+                    .filter(internalLinkGraph.getCandidateKeywords(url)::contains)
+                    .collect(Collectors.toSet());
+            if (!words.isEmpty()) {
+                linkedKeywords.put(url, words);
+            }
+        });
+
+        return linkedKeywords;
+    }
+
+    private void applyKeywordsToDoc(ProcessedDocument doc, EdgePageWordFlags flag, Set<String> words) {
+        if (doc.words != null && words != null) {
+            doc.words.setFlagOnMetadataForWords(flag, words);
+        }
+    }
+
+
+}
diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/logic/CommonKeywordExtractor.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/logic/CommonKeywordExtractor.java
index 7628e09a..bcd2d505 100644
--- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/logic/CommonKeywordExtractor.java
+++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/logic/CommonKeywordExtractor.java
@@ -2,7 +2,7 @@ package nu.marginalia.wmsa.edge.converting.processor.logic;
 
 import ca.rmen.porterstemmer.PorterStemmer;
 import nu.marginalia.wmsa.edge.converting.model.ProcessedDomain;
-import nu.marginalia.wmsa.edge.index.model.IndexBlock;
+import nu.marginalia.wmsa.edge.index.model.EdgePageWordFlags;
 
 import java.util.*;
 
@@ -16,7 +16,7 @@ public class CommonKeywordExtractor {
 
     private static final int MAX_SITE_KEYWORDS_TO_EXTRACT = 5;
 
-    public List<String> getCommonSiteWords(ProcessedDomain ret, IndexBlock... sourceBlocks) {
+    public List<String> getCommonSiteWords(ProcessedDomain ret, EdgePageWordFlags...
flags) { if (ret.documents.size() < MIN_REQUIRED_DOCUMENTS) return Collections.emptyList(); @@ -27,21 +27,20 @@ public class CommonKeywordExtractor { final Map> stemmedToNonstemmedVariants = new HashMap<>(ret.documents.size()*10); int qualifiedDocCount = 0; + long wordFlags = Arrays.stream(flags).mapToInt(EdgePageWordFlags::asBit).reduce(0, (a,b) -> a|b); for (var doc : ret.documents) { if (doc.words == null) continue; qualifiedDocCount++; - for (var block : sourceBlocks) { - for (var word : doc.words.get(block).words) { - String wordStemmed = wordToStemmedMemoized.computeIfAbsent(word, ps::stemWord); + for (var word : doc.words.getWordsWithAnyFlag(wordFlags)) { + String wordStemmed = wordToStemmedMemoized.computeIfAbsent(word, ps::stemWord); - // Count by negative values to sort by Map.Entry.comparingByValue() in reverse - topStemmedKeywordCount.merge(wordStemmed, -1, Integer::sum); + // Count by negative values to sort by Map.Entry.comparingByValue() in reverse + topStemmedKeywordCount.merge(wordStemmed, -1, Integer::sum); - stemmedToNonstemmedVariants.computeIfAbsent(wordStemmed, w -> new HashSet<>()).add(word); - } + stemmedToNonstemmedVariants.computeIfAbsent(wordStemmed, w -> new HashSet<>()).add(word); } } diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/logic/InternalLinkGraph.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/logic/InternalLinkGraph.java index abb2b619..6b0cd10f 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/logic/InternalLinkGraph.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/logic/InternalLinkGraph.java @@ -1,7 +1,7 @@ package nu.marginalia.wmsa.edge.converting.processor.logic; import nu.marginalia.wmsa.edge.converting.model.ProcessedDocument; -import nu.marginalia.wmsa.edge.index.model.IndexBlock; +import nu.marginalia.wmsa.edge.index.model.EdgePageWordFlags; import nu.marginalia.wmsa.edge.model.EdgeUrl; import java.util.*; @@ -12,21 +12,20 @@ public class InternalLinkGraph { private final Map> topKeywordsByUrl = new HashMap<>(1000); private final Map> candidateKeywordsByUrl = new HashMap<>(1000); + private final Set knownUrls = new HashSet<>(10_000); + public void accept(ProcessedDocument doc) { if (doc.details == null || doc.details.linksInternal == null) return; goodUrls.add(doc.url); internalLinkGraph.put(doc.url, new HashSet<>(doc.details.linksInternal)); + knownUrls.addAll(doc.details.linksInternal); - Set topKeywords = new HashSet<>(doc.words.get(IndexBlock.Tfidf_High).words); - topKeywords.addAll(doc.words.get(IndexBlock.Subjects).words); - topKeywordsByUrl.put(doc.url, topKeywords); + List topKeywords = doc.words.getWordsWithAnyFlag(EdgePageWordFlags.TfIdfHigh.asBit() | EdgePageWordFlags.Subjects.asBit()); - Set candidateKeywords = new HashSet<>(topKeywords); - candidateKeywords.addAll(doc.words.get(IndexBlock.Tfidf_High).words); - candidateKeywords.addAll(doc.words.get(IndexBlock.Subjects).words); - candidateKeywordsByUrl.put(doc.url, candidateKeywords); + topKeywordsByUrl.put(doc.url, new HashSet<>(topKeywords)); + candidateKeywordsByUrl.put(doc.url, new HashSet<>(topKeywords)); } public Map> trimAndInvert() { @@ -45,6 +44,10 @@ public class InternalLinkGraph { return inverted; } + public int numKnownUrls() { + return knownUrls.size(); + } + public Set getKeywords(EdgeUrl url) { return topKeywordsByUrl.getOrDefault(url, Collections.emptySet()); } diff --git 
a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/logic/pubdate/PubDate.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/logic/pubdate/PubDate.java index ed4f0f63..7913e710 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/logic/pubdate/PubDate.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/logic/pubdate/PubDate.java @@ -8,7 +8,7 @@ public record PubDate(String dateIso8601, int year) { // First year we'll believe something can have been published on the web // cut off at 1995 to reduce false positive error rate; number of bona fide // documents from these years are so few almost all hits are wrong - + public static final int MIN_YEAR = 1995; // Last year we'll believe something can be published in @@ -23,6 +23,7 @@ public record PubDate(String dateIso8601, int year) { this(date.format(DateTimeFormatter.ISO_DATE), date.getYear()); } + public boolean isEmpty() { return year == Integer.MIN_VALUE; } @@ -43,4 +44,18 @@ public record PubDate(String dateIso8601, int year) { public boolean hasYear() { return isValidYear(this.year); } + + private static final int ENCODING_OFFSET = MIN_YEAR + 1; + + public int yearByte() { + if (hasYear()) { + return year - ENCODING_OFFSET; + } + else return 0; + } + + public static int fromYearByte(int yearByte) { + return yearByte + ENCODING_OFFSET; + } + } diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/crawling/CrawlJobExtractorMain.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/crawling/CrawlJobExtractorMain.java index 5a86fdd8..2c3a373b 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/crawling/CrawlJobExtractorMain.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/crawling/CrawlJobExtractorMain.java @@ -8,7 +8,7 @@ import com.zaxxer.hikari.HikariDataSource; import nu.marginalia.wmsa.client.GsonFactory; import nu.marginalia.wmsa.configuration.module.DatabaseModule; import nu.marginalia.wmsa.edge.crawling.model.CrawlingSpecification; -import nu.marginalia.wmsa.edge.data.dao.task.EdgeDomainBlacklistImpl; +import nu.marginalia.wmsa.edge.dbcommon.EdgeDomainBlacklistImpl; import nu.marginalia.wmsa.edge.model.EdgeDomain; import org.mariadb.jdbc.Driver; diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/crawling/CrawlJobExtractorPageRankMain.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/crawling/CrawlJobExtractorPageRankMain.java deleted file mode 100644 index ac90ac12..00000000 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/crawling/CrawlJobExtractorPageRankMain.java +++ /dev/null @@ -1,204 +0,0 @@ -package nu.marginalia.wmsa.edge.crawling; - -import com.github.luben.zstd.ZstdOutputStream; -import com.google.common.hash.HashFunction; -import com.google.common.hash.Hashing; -import com.google.gson.Gson; -import com.zaxxer.hikari.HikariDataSource; -import nu.marginalia.util.ranking.BetterReversePageRank; -import nu.marginalia.util.ranking.RankingDomainFetcher; -import nu.marginalia.wmsa.client.GsonFactory; -import nu.marginalia.wmsa.configuration.module.DatabaseModule; -import nu.marginalia.wmsa.edge.crawling.model.CrawlingSpecification; -import nu.marginalia.wmsa.edge.data.dao.task.EdgeDomainBlacklistImpl; -import nu.marginalia.wmsa.edge.model.EdgeDomain; -import nu.marginalia.wmsa.edge.model.id.EdgeId; -import org.mariadb.jdbc.Driver; - -import java.io.BufferedOutputStream; -import java.io.FileOutputStream; -import java.io.IOException; -import 
java.io.PrintWriter; -import java.nio.file.Path; -import java.sql.Connection; -import java.sql.ResultSet; -import java.sql.SQLException; -import java.util.ArrayList; - -public class CrawlJobExtractorPageRankMain { - - private static final String specificDomainSql = - """ - SELECT ID - FROM EC_DOMAIN - WHERE DOMAIN_NAME=? - """; - private static final String specificDomainSqlFromId = - """ - SELECT LOWER(DOMAIN_NAME) - FROM EC_DOMAIN - WHERE ID=? - """; - - private static final String urlsSql = - """ - SELECT URL - FROM EC_URL_VIEW - WHERE DOMAIN_ID=? - ORDER BY - VISITED DESC, - DATA_HASH IS NOT NULL DESC, - ID - LIMIT 25000 - """; - - private static final String visitedUrlsSql = - """ - SELECT COUNT(*) - FROM EC_URL - WHERE DOMAIN_ID=? - AND VISITED - ; - """; - private static final int MIN_VISIT_COUNT = 100; - private static final int MAX_VISIT_COUNT = 5000; - - private final EdgeDomainBlacklistImpl blacklist; - - private final Connection conn; - private final HashFunction hasher = Hashing.murmur3_128(0); - - public static void main(String... args) throws SQLException, IOException { - Driver driver = new Driver(); - var outFile = Path.of(args[0]); - - Gson gson = GsonFactory.get(); - - var ds = new DatabaseModule().provideConnection(); - var domains = new RankingDomainFetcher(ds, new EdgeDomainBlacklistImpl(ds)); - var rpr = new BetterReversePageRank(domains, "bikobatanari.art", "sadgrl.online", "wiki.xxiivv.com", "%neocities.org"); - rpr.setMaxKnownUrls(750); - - var targetDomainIds = rpr.pageRankWithPeripheralNodes(rpr.size()); - - try (var out = new PrintWriter(new ZstdOutputStream(new BufferedOutputStream(new FileOutputStream(outFile.toFile()))))) { - final var extractor = new CrawlJobExtractorPageRankMain(new DatabaseModule().provideConnection()); - - targetDomainIds.forEach(i -> { - out.println(gson.toJson(extractor.extractDomain(new EdgeId<>(i)))); - return true; - }); - } - } - - private record DomainWithId(String domainName, int id) {} - - public CrawlJobExtractorPageRankMain(HikariDataSource ds) throws SQLException { - blacklist = new EdgeDomainBlacklistImpl(ds); - conn = ds.getConnection(); - } - - public CrawlingSpecification extractDomain(EdgeId domainId) { - CrawlingSpecification spec = new CrawlingSpecification(); - - String domainName = ""; - try (var domainQuery = conn.prepareStatement(specificDomainSqlFromId); - var urlQuery = conn.prepareStatement(urlsSql)) - { - domainQuery.setInt(1, domainId.id()); - ResultSet rsp = domainQuery.executeQuery(); - domainName = rsp.next() ? 
rsp.getString(1) : ""; - - spec.domain = domainName; - spec.id = createId(new EdgeDomain(domainName)); - spec.urls = new ArrayList<>(1000); - - spec.crawlDepth = getCrawlDepth(new DomainWithId(domainName, domainId.id())); - - urlQuery.setString(1, domainName.toString()); - urlQuery.setInt(2, domainId.id()); - urlQuery.setFetchSize(1000); - rsp = urlQuery.executeQuery(); - - while (rsp.next()) { - spec.urls.add(rsp.getString(1)); - } - - } catch (SQLException e) { - e.printStackTrace(); - } - - if (spec.urls.isEmpty()) { - spec.urls.add("https://"+domainName+"/"); - } - - return spec; - } - public CrawlingSpecification extractDomain(EdgeDomain domain) { - CrawlingSpecification spec = new CrawlingSpecification(); - spec.domain = domain.toString(); - spec.id = createId(domain); - spec.urls = new ArrayList<>(1000); - - - try (var domainQuery = conn.prepareStatement(specificDomainSql); - var urlQuery = conn.prepareStatement(urlsSql)) - { - domainQuery.setString(1, domain.toString()); - ResultSet rsp = domainQuery.executeQuery(); - int domainId = rsp.next() ? rsp.getInt(1) : -1; - - spec.crawlDepth = getCrawlDepth(new DomainWithId(domain.toString(), domainId)); - - urlQuery.setString(1, domain.toString()); - urlQuery.setInt(2, domainId); - urlQuery.setFetchSize(1000); - rsp = urlQuery.executeQuery(); - - while (rsp.next()) { - spec.urls.add(rsp.getString(1)); - } - - } catch (SQLException e) { - e.printStackTrace(); - } - - if (spec.urls.isEmpty()) { - spec.urls.add("https://"+domain+"/"); - } - - return spec; - } - - private String createId(EdgeDomain domain) { - return hasher.hashUnencodedChars(domain.toString()).toString(); - } - - private int getCrawlDepth(DomainWithId domainWithId) { - try (var domainQuery = conn.prepareStatement(visitedUrlsSql)) { - domainQuery.setInt(1, domainWithId.id); - var rsp = domainQuery.executeQuery(); - if (rsp.next()) { - return calculateCrawlDepthFromVisitedCount(rsp.getInt(1)); - } - } catch (SQLException e) { - e.printStackTrace(); - } - - return MIN_VISIT_COUNT; - } - - private int calculateCrawlDepthFromVisitedCount(int count) { - count = count + 100 + count / 4; - - if (count < MIN_VISIT_COUNT) { - count = MIN_VISIT_COUNT; - } - - if (count > MAX_VISIT_COUNT) { - count = MAX_VISIT_COUNT; - } - - return count; - } -} diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/crawling/blocklist/InetAddressCache.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/crawling/blocklist/InetAddressCache.java index 17e90c04..0bef701d 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/crawling/blocklist/InetAddressCache.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/crawling/blocklist/InetAddressCache.java @@ -12,9 +12,7 @@ public class InetAddressCache { private static final Cache cache = CacheBuilder.newBuilder().maximumSize(10_000_000).expireAfterAccess(1, TimeUnit.HOURS).build(); public static InetAddress getAddress(EdgeDomain domain) throws Throwable { try { - return cache.get(domain, ()->{ - return InetAddress.getByName(domain.getAddress()); - }); + return cache.get(domain, ()-> InetAddress.getByName(domain.getAddress())); } catch (ExecutionException ex) { throw ex.getCause(); diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/crawling/blocklist/UrlBlocklist.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/crawling/blocklist/UrlBlocklist.java index 00089eb4..c8bfbb11 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/crawling/blocklist/UrlBlocklist.java +++ 
diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/crawling/blocklist/UrlBlocklist.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/crawling/blocklist/UrlBlocklist.java
index 00089eb4..c8bfbb11 100644
--- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/crawling/blocklist/UrlBlocklist.java
+++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/crawling/blocklist/UrlBlocklist.java
@@ -1,27 +1,16 @@
 package nu.marginalia.wmsa.edge.crawling.blocklist;
 
+import nu.marginalia.util.gregex.GuardedRegexFactory;
 import nu.marginalia.wmsa.edge.model.EdgeUrl;
 
 import java.util.ArrayList;
 import java.util.List;
 import java.util.Set;
 import java.util.function.Predicate;
-import java.util.regex.Pattern;
 
 public class UrlBlocklist {
     private final List<Predicate<String>> patterns = new ArrayList<>();
 
-    private record UrlPatternContains(String contains, Pattern pattern) implements Predicate<String> {
-        public boolean test(String s) {
-            return s.contains(contains) && pattern.matcher(s).find();
-        }
-    }
-    private record UrlPatternMinLength(int minLength, Pattern pattern) implements Predicate<String> {
-        public boolean test(String s) {
-            return s.length() >= minLength && pattern.matcher(s).find();
-        }
-    }
-
     // domains that have a lot of links but we know we don't want to crawl
     private final Set<String> badDomains = Set.of("t.co", "facebook.com",
             "instagram.com", "youtube.com",
@@ -35,17 +24,17 @@ public class UrlBlocklist {
         patterns.add(s -> s.contains("-download-free"));
 
         // long base64-strings in URLs are typically git hashes or the like, rarely worth crawling
-        patterns.add(new UrlPatternMinLength(48, Pattern.compile(".*/[^/]*[a-f0-9]{32,}(/|$)")));
+        patterns.add(GuardedRegexFactory.minLength(48, ".*/[^/]*[a-f0-9]{32,}(/|$)"));
 
         // link farms &c
-        patterns.add(new UrlPatternContains("/download", Pattern.compile("/download(-([A-Za-z]+|[0-9]+)){4,}\\.(htm|html|php)$")));
-        patterns.add(new UrlPatternContains("/permalink", Pattern.compile("/permalink/[a-z]+(-([A-Za-z]+|[0-9]+)){3,}\\.(htm|html|php)$")));
-        patterns.add(new UrlPatternContains("/webrx", Pattern.compile("webrx3.*/[A-Za-z]+(-([A-Za-z]+|[0-9]+)){3,}((-[0-9]+)?/|\\.(php|htm|html))$")));
-        patterns.add(new UrlPatternContains("/lib", Pattern.compile("lib.*/[A-Za-z]+(-([A-Za-z]+|[0-9]+)){3,}((-[0-9]+)?/|\\.(php|htm|html))$")));
-        patterns.add(new UrlPatternContains("/pdf", Pattern.compile("pdf.*/[A-Za-z]+(-([A-Za-z]+|[0-9]+)){3,}((-[0-9]+)?/|\\.(php|htm|html))$")));
-        patterns.add(new UrlPatternContains("/book", Pattern.compile("book.*/[A-Za-z]+(-([A-Za-z]+|[0-9]+)){3,}((-[0-9]+)?/|\\.(php|htm|html))$")));
-        patterns.add(new UrlPatternContains("/720p", Pattern.compile("720p.*/[A-Za-z]+(-([A-Za-z]+|[0-9]+)){3,}((-[0-9]+)?/|\\.(php|htm|html))$")));
-        patterns.add(new UrlPatternContains("/node", Pattern.compile("/node/.*/[a-z]+(-[a-z0-9]+)+.htm$")));
+        patterns.add(GuardedRegexFactory.contains("/download", "/download(-([A-Za-z]+|[0-9]+)){4,}\\.(htm|html|php)$"));
+        patterns.add(GuardedRegexFactory.contains("/permalink", "/permalink/[a-z]+(-([A-Za-z]+|[0-9]+)){3,}\\.(htm|html|php)$"));
+        patterns.add(GuardedRegexFactory.contains("webrx", "webrx3.*/[A-Za-z]+(-([A-Za-z]+|[0-9]+)){3,}((-[0-9]+)?/|\\.(php|htm|html))$"));
+        patterns.add(GuardedRegexFactory.contains("lib", "lib.*/[A-Za-z]+(-([A-Za-z]+|[0-9]+)){3,}((-[0-9]+)?/|\\.(php|htm|html))$"));
+        patterns.add(GuardedRegexFactory.contains("pdf", "pdf.*/[A-Za-z]+(-([A-Za-z]+|[0-9]+)){3,}((-[0-9]+)?/|\\.(php|htm|html))$"));
+        patterns.add(GuardedRegexFactory.contains("book", "book.*/[A-Za-z]+(-([A-Za-z]+|[0-9]+)){3,}((-[0-9]+)?/|\\.(php|htm|html))$"));
+        patterns.add(GuardedRegexFactory.contains("/720p", "720p.*/[A-Za-z]+(-([A-Za-z]+|[0-9]+)){3,}((-[0-9]+)?/|\\.(php|htm|html))$"));
+        patterns.add(GuardedRegexFactory.contains("/node","/node/.*/[a-z]+(-[a-z0-9]+)+.htm$"));
     }
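The UrlBlocklist change above replaces the hand-rolled UrlPatternContains/UrlPatternMinLength records with GuardedRegexFactory, but the technique is the same in both versions: guard an expensive regex behind a cheap precondition so that most URLs never reach the matcher at all. A sketch of what such a factory plausibly does, assuming it wraps Pattern the way the removed records did (the internals are inferred, not taken from GuardedRegexFactory itself):

import java.util.function.Predicate;
import java.util.regex.Pattern;

final class GuardedRegexSketch {
    // Run the regex only when the cheap substring test passes first
    static Predicate<String> contains(String guard, String regex) {
        Pattern p = Pattern.compile(regex);
        return s -> s.contains(guard) && p.matcher(s).find();
    }

    // Run the regex only on strings long enough to possibly match
    static Predicate<String> minLength(int minLength, String regex) {
        Pattern p = Pattern.compile(regex);
        return s -> s.length() >= minLength && p.matcher(s).find();
    }
}

On a hot path like URL filtering, the String.contains or length guard short-circuits the vast majority of candidates before any regex backtracking can happen.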
diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/dating/DatingService.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/dating/DatingService.java
index 2cebe7dc..67753527 100644
--- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/dating/DatingService.java
+++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/dating/DatingService.java
@@ -7,8 +7,8 @@ import nu.marginalia.wmsa.configuration.server.Initialization;
 import nu.marginalia.wmsa.configuration.server.MetricsServer;
 import nu.marginalia.wmsa.configuration.server.Service;
 import nu.marginalia.wmsa.edge.assistant.screenshot.ScreenshotService;
-import nu.marginalia.wmsa.edge.data.dao.EdgeDataStoreDao;
-import nu.marginalia.wmsa.edge.data.dao.task.EdgeDomainBlacklist;
+import nu.marginalia.wmsa.edge.dbcommon.EdgeDataStoreDao;
+import nu.marginalia.wmsa.edge.dbcommon.EdgeDomainBlacklist;
 import nu.marginalia.wmsa.edge.model.id.EdgeId;
 import nu.marginalia.wmsa.edge.search.model.BrowseResult;
 import nu.marginalia.wmsa.renderer.mustache.MustacheRenderer;
diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/dating/DatingSessionObject.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/dating/DatingSessionObject.java
index 61d9789c..39eed0a6 100644
--- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/dating/DatingSessionObject.java
+++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/dating/DatingSessionObject.java
@@ -1,7 +1,7 @@
 package nu.marginalia.wmsa.edge.dating;
 
-import nu.marginalia.wmsa.edge.data.dao.EdgeDataStoreDao;
-import nu.marginalia.wmsa.edge.data.dao.task.EdgeDomainBlacklist;
+import nu.marginalia.wmsa.edge.dbcommon.EdgeDataStoreDao;
+import nu.marginalia.wmsa.edge.dbcommon.EdgeDomainBlacklist;
 import nu.marginalia.wmsa.edge.model.EdgeDomain;
 import nu.marginalia.wmsa.edge.model.id.EdgeId;
 import nu.marginalia.wmsa.edge.search.model.BrowseResult;
diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/data/dao/EdgeDataStoreDao.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/dbcommon/EdgeDataStoreDao.java
similarity index 91%
rename from marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/data/dao/EdgeDataStoreDao.java
rename to marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/dbcommon/EdgeDataStoreDao.java
index f2530c9f..47053837 100644
--- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/data/dao/EdgeDataStoreDao.java
+++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/dbcommon/EdgeDataStoreDao.java
@@ -1,7 +1,6 @@
-package nu.marginalia.wmsa.edge.data.dao;
+package nu.marginalia.wmsa.edge.dbcommon;
 
 import com.google.inject.ImplementedBy;
-import nu.marginalia.wmsa.edge.data.dao.task.EdgeDomainBlacklist;
 import nu.marginalia.wmsa.edge.model.EdgeDomain;
 import nu.marginalia.wmsa.edge.model.EdgeUrl;
 import nu.marginalia.wmsa.edge.model.id.EdgeId;
diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/data/dao/EdgeDataStoreDaoImpl.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/dbcommon/EdgeDataStoreDaoImpl.java
similarity index 99%
rename from marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/data/dao/EdgeDataStoreDaoImpl.java
rename to marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/dbcommon/EdgeDataStoreDaoImpl.java
index 7c0c681b..1fdf93ef 100644
--- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/data/dao/EdgeDataStoreDaoImpl.java
+++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/dbcommon/EdgeDataStoreDaoImpl.java
@@ -1,4 +1,4 @@
-package nu.marginalia.wmsa.edge.data.dao;
+package nu.marginalia.wmsa.edge.dbcommon;
 
 import com.google.common.base.Strings;
 import com.google.common.cache.Cache;
@@ -7,7 +7,6 @@ import com.google.common.util.concurrent.UncheckedExecutionException;
 import com.google.inject.Inject;
 import com.zaxxer.hikari.HikariDataSource;
 import lombok.SneakyThrows;
-import nu.marginalia.wmsa.edge.data.dao.task.EdgeDomainBlacklist;
 import nu.marginalia.wmsa.edge.model.EdgeDomain;
 import nu.marginalia.wmsa.edge.model.EdgeUrl;
 import nu.marginalia.wmsa.edge.model.crawl.EdgeDomainIndexingState;
@@ -134,6 +133,7 @@ public class EdgeDataStoreDaoImpl implements EdgeDataStoreDao {
 
     }
 
+
     public List<BrowseResult> getDomainNeighborsAdjacentCosine(EdgeId<EdgeDomain> domainId, EdgeDomainBlacklist blacklist, int count) {
         List<BrowseResult> domains = new ArrayList<>(count);
 
diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/data/dao/task/EdgeDomainBlacklist.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/dbcommon/EdgeDomainBlacklist.java
similarity index 91%
rename from marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/data/dao/task/EdgeDomainBlacklist.java
rename to marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/dbcommon/EdgeDomainBlacklist.java
index da995195..5fa6f193 100644
--- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/data/dao/task/EdgeDomainBlacklist.java
+++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/dbcommon/EdgeDomainBlacklist.java
@@ -1,4 +1,4 @@
-package nu.marginalia.wmsa.edge.data.dao.task;
+package nu.marginalia.wmsa.edge.dbcommon;
 
 import com.google.inject.ImplementedBy;
 import gnu.trove.set.hash.TIntHashSet;
diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/data/dao/task/EdgeDomainBlacklistImpl.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/dbcommon/EdgeDomainBlacklistImpl.java
similarity index 97%
rename from marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/data/dao/task/EdgeDomainBlacklistImpl.java
rename to marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/dbcommon/EdgeDomainBlacklistImpl.java
index 2e744ed8..13d7080f 100644
--- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/data/dao/task/EdgeDomainBlacklistImpl.java
+++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/dbcommon/EdgeDomainBlacklistImpl.java
@@ -1,4 +1,4 @@
-package nu.marginalia.wmsa.edge.data.dao.task;
+package nu.marginalia.wmsa.edge.dbcommon;
 
 import com.google.inject.Inject;
 import com.google.inject.Singleton;
diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/EdgeIndexBucket.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/EdgeIndexBucket.java
deleted file mode 100644
index 3633f307..00000000
--- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/EdgeIndexBucket.java
+++ /dev/null
@@ -1,166 +0,0 @@
-package nu.marginalia.wmsa.edge.index;
-
-import nu.marginalia.wmsa.edge.index.journal.SearchIndexJournalWriter;
-import nu.marginalia.wmsa.edge.index.model.IndexBlock;
-import nu.marginalia.wmsa.edge.index.reader.SearchIndexReader;
-import nu.marginalia.wmsa.edge.index.svc.query.IndexQuery;
-import nu.marginalia.wmsa.edge.index.svc.query.IndexQueryFactory;
-import nu.marginalia.wmsa.edge.index.svc.query.IndexQueryParams;
-import nu.marginalia.wmsa.edge.index.svc.query.ResultDomainDeduplicator;
-import nu.marginalia.wmsa.edge.index.svc.query.types.filter.QueryFilterStepFromPredicate;
-import nu.marginalia.wmsa.edge.index.svc.query.types.filter.QueryRankLimitingFilter;
-import org.jetbrains.annotations.NotNull;
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
-
-import java.util.Collections;
-import java.util.concurrent.locks.Lock;
-import java.util.concurrent.locks.ReadWriteLock;
-import java.util.concurrent.locks.ReentrantReadWriteLock;
-import java.util.function.LongPredicate;
-
-public class EdgeIndexBucket {
-
-    private final Logger logger = LoggerFactory.getLogger(getClass());
-
-    private volatile SearchIndexReader indexReader;
-
-    private final ReadWriteLock indexReplacementLock = new ReentrantReadWriteLock();
-
-    @NotNull
-    private final IndexServicesFactory servicesFactory;
-    private final EdgeIndexControl indexControl;
-    private final SearchIndexJournalWriter writer;
-
-    private final int id;
-
-    public EdgeIndexBucket(@NotNull IndexServicesFactory servicesFactory, EdgeIndexControl indexControl, int id) {
-        this.servicesFactory = servicesFactory;
-        this.indexControl = indexControl;
-        this.id = id;
-
-        writer = servicesFactory.getIndexWriter(0);
-    }
-
-    public void init() {
-        Lock lock = indexReplacementLock.writeLock();
-        try {
-            lock.lock();
-            logger.info("Initializing bucket {}", id);
-
-            if (indexReader == null) {
-                indexReader = servicesFactory.getIndexReader(id);
-            }
-
-        }
-        catch (Exception ex) {
-            logger.error("Uncaught exception", ex);
-        }
-        finally {
-            lock.unlock();
-        }
-    }
-
-    public void preconvert() {
-
-        writer.forceWrite();
-        writer.flushWords();
-
-        servicesFactory.getIndexPreconverter();
-
-        System.runFinalization();
-        System.gc();
-
-    }
-    public void switchIndex() {
-
-        indexControl.regenerateIndex(id);
-
-        Lock lock = indexReplacementLock.writeLock();
-        try {
-            lock.lock();
-
-            indexControl.switchIndexFiles(id);
-
-            if (indexReader != null) {
-                indexReader.close();
-            }
-
-            indexReader = servicesFactory.getIndexReader(id);
-
-        }
-        catch (Exception ex) {
-            logger.error("Uncaught exception", ex);
-        }
-        finally {
-            lock.unlock();
-        }
-    }
-
-
-    public boolean isAvailable() {
-        return indexReader != null;
-    }
-
-    public IndexQuery getQuery(LongPredicate filter, IndexQueryParams params) {
-
-        if (null == indexReader) {
-            logger.warn("Index reader not neady {}", params.block());
-            return new IndexQuery(Collections.emptyList());
-        }
-
-        final int[] orderedIncludes = params.searchTerms()
-                .sortedDistinctIncludes((a, b) -> compareKeywords(params.block(), a, b));
-
-        IndexQueryFactory.IndexQueryBuilder query = createQueryBuilder(orderedIncludes[0], params);
-
-        if (query == null) {
-            return new IndexQuery(Collections.emptyList());
-        }
-
-        query.addInclusionFilter(new QueryFilterStepFromPredicate(filter));
-        if (params.rankLimit() != null) {
-            query.addInclusionFilter(new QueryRankLimitingFilter(params.rankLimit()));
-        }
-
-        for (int i = 1; i < orderedIncludes.length; i++) {
-            query = query.also(orderedIncludes[i]);
-        }
-
-        for (int term : params.searchTerms().excludes()) {
-            query = query.not(term);
-        }
-
-        return query.build();
-    }
-
-    private IndexQueryFactory.IndexQueryBuilder createQueryBuilder(int firstKeyword, IndexQueryParams params) {
-
-        if (params.targetDomains() != null && !params.targetDomains().isEmpty()) {
-            return indexReader.findWordForDomainList(params.block(), params.targetDomains(), firstKeyword);
-        }
-        return indexReader.findWord(params.block(), params.qualityLimit(), firstKeyword);
-
-    }
-
-    private int compareKeywords(IndexBlock block, int a, int b) {
-        return Long.compare(
-                indexReader.numHits(block, a),
-                indexReader.numHits(block, b)
-        );
-    }
-
-
-    public IndexQuery getDomainQuery(int wordId, ResultDomainDeduplicator localFilter) {
-        var query = indexReader.findDomain(wordId);
-
-        query.addInclusionFilter(new QueryFilterStepFromPredicate(localFilter::filterRawValue));
-
-        return query;
-    }
-
-    /** Replaces the values of ids with their associated metadata, or 0L if absent */
-    public long[] getMetadata(IndexBlock block, int termId, long[] ids) {
-        return indexReader.getMetadata(block, termId, ids);
-    }
-}
diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/EdgeIndexControl.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/EdgeIndexControl.java
index 577553b6..980d0d32 100644
--- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/EdgeIndexControl.java
+++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/EdgeIndexControl.java
@@ -2,8 +2,6 @@ package nu.marginalia.wmsa.edge.index;
 
 import com.google.inject.Inject;
-import nu.marginalia.wmsa.edge.index.conversion.ConversionUnnecessaryException;
-import nu.marginalia.wmsa.edge.index.model.IndexBlock;
 
 import java.io.IOException;
 
@@ -17,27 +15,13 @@ public class EdgeIndexControl {
         this.servicesFactory = servicesFactory;
     }
 
-    public void regenerateIndex(int id) {
-        for (IndexBlock block : IndexBlock.values()) {
-            try {
-                servicesFactory.convertIndex(id, block);
+    public void regenerateIndex() throws IOException {
+        servicesFactory.convertIndex();
 
-                System.runFinalization();
-                System.gc();
-            }
-            catch (ConversionUnnecessaryException unnecessary) {
-                // swallow quietly
-            }
-            catch (IOException e) {
-                e.printStackTrace();
-            }
-        }
-
-        System.runFinalization();
         System.gc();
     }
 
-    public void switchIndexFiles(int id) throws Exception {
-        servicesFactory.switchFilesJob(id).call();
+    public void switchIndexFiles() throws Exception {
+        servicesFactory.switchFilesJob().call();
     }
 }
diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/EdgeIndexMain.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/EdgeIndexMain.java
index 65dc5bf7..65dde030 100644
--- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/EdgeIndexMain.java
+++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/EdgeIndexMain.java
@@ -9,8 +9,6 @@ import nu.marginalia.wmsa.configuration.module.ConfigurationModule;
 import nu.marginalia.wmsa.configuration.module.DatabaseModule;
 import nu.marginalia.wmsa.configuration.server.Initialization;
 
-import java.io.IOException;
-
 public class EdgeIndexMain extends MainClass {
     private final EdgeIndexService service;
 
@@ -23,7 +21,7 @@ public class EdgeIndexMain extends MainClass {
         init(ServiceDescriptor.EDGE_INDEX, args);
 
         Injector injector = Guice.createInjector(
-                new EdgeTablesModule(),
+                new EdgeIndexTablesModule(),
                 new EdgeIndexModule(),
                 new DatabaseModule(),
                 new ConfigurationModule()
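The EdgeIndexBucket deleted above guarded index-reader replacement behind a ReentrantReadWriteLock so that no query ever observes a half-switched index; the slimmed-down EdgeIndexControl keeps the same regenerate-then-switch flow, just without per-bucket ids. A minimal sketch of that swap pattern (Reader is a stand-in interface, not a repository type):

import java.util.concurrent.locks.ReadWriteLock;
import java.util.concurrent.locks.ReentrantReadWriteLock;

class IndexSwapSketch {
    interface Reader { void close(); }

    private volatile Reader current;
    private final ReadWriteLock lock = new ReentrantReadWriteLock();

    // Queries take the read lock; the write lock excludes them for the
    // brief moment the reader reference is exchanged and the old one closed.
    void swap(Reader next) {
        lock.writeLock().lock();
        try {
            Reader old = current;
            current = next;
            if (old != null) old.close();
        }
        finally {
            lock.writeLock().unlock();
        }
    }
}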
diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/EdgeIndexService.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/EdgeIndexService.java
index 3249241d..6cb5ba36 100644
--- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/EdgeIndexService.java
+++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/EdgeIndexService.java
@@ -8,7 +8,7 @@ import nu.marginalia.wmsa.client.GsonFactory;
 import nu.marginalia.wmsa.configuration.server.Initialization;
 import nu.marginalia.wmsa.configuration.server.MetricsServer;
 import nu.marginalia.wmsa.configuration.server.Service;
-import nu.marginalia.wmsa.edge.index.reader.SearchIndexes;
+import nu.marginalia.wmsa.edge.index.postings.SearchIndexControl;
 import nu.marginalia.wmsa.edge.index.svc.EdgeIndexDomainQueryService;
 import nu.marginalia.wmsa.edge.index.svc.EdgeIndexLexiconService;
 import nu.marginalia.wmsa.edge.index.svc.EdgeIndexOpsService;
@@ -26,9 +26,7 @@ public class EdgeIndexService extends Service {
     @NotNull
     private final Initialization init;
 
-    private final SearchIndexes indexes;
-
-    public static final int DYNAMIC_BUCKET_LENGTH = 7;
+    private final SearchIndexControl indexes;
 
 
     @Inject
@@ -36,7 +34,7 @@ public class EdgeIndexService extends Service {
                             @Named("service-port") Integer port,
                             Initialization init,
                             MetricsServer metricsServer,
-                            SearchIndexes indexes,
+                            SearchIndexControl indexes,
 
                             EdgeIndexOpsService opsService,
                             EdgeIndexLexiconService lexiconService,
@@ -59,8 +57,7 @@ public class EdgeIndexService extends Service {
         Spark.get("/dictionary/*", lexiconService::getWordId, gson::toJson);
 
         Spark.post("/ops/repartition", opsService::repartitionEndpoint);
-        Spark.post("/ops/preconvert", opsService::preconvertEndpoint);
-        Spark.post("/ops/reindex/:id", opsService::reindexEndpoint);
+        Spark.post("/ops/reindex", opsService::reindexEndpoint);
 
         get("/is-blocked", this::isBlocked, gson::toJson);
 
@@ -76,11 +73,9 @@ public class EdgeIndexService extends Service {
         if (!initialized) {
             init.waitReady();
             initialized = true;
+            indexes.initialize(init);
         }
-        else {
-            return;
-        }
-        indexes.initialize(init);
+
 
     }
diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/EdgeTablesModule.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/EdgeIndexTablesModule.java
similarity index 95%
rename from marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/EdgeTablesModule.java
rename to marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/EdgeIndexTablesModule.java
index 4650b15b..93014e4c 100644
--- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/EdgeTablesModule.java
+++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/EdgeIndexTablesModule.java
@@ -6,7 +6,7 @@ import nu.marginalia.wmsa.configuration.WmsaHome;
 
 import java.nio.file.Path;
 
-public class EdgeTablesModule extends AbstractModule {
+public class EdgeIndexTablesModule extends AbstractModule {
 
     public void configure() {
         bind(Path.class).annotatedWith(Names.named("partition-root-slow")).toInstance(WmsaHome.getDisk("index-write"));
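The EdgeIndexService hunk above also untangles the one-time initialization guard: the old code reached indexes.initialize(init) via an else-return detour, while the new shape inlines the call into the guard so the run-at-most-once logic reads straight down. Reduced to its essence, the guard is this idiom (a sketch; any thread-safety is provided by how the service invokes it, not shown in the hunk):

class InitGuardSketch {
    private volatile boolean initialized;

    void ensureInitialized(Runnable waitReady, Runnable initialize) {
        if (!initialized) {
            waitReady.run();     // block until dependencies are up
            initialized = true;
            initialize.run();    // runs at most once
        }
    }
}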
diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/IndexServicesFactory.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/IndexServicesFactory.java
index 0b516ba4..199418de 100644
--- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/IndexServicesFactory.java
+++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/IndexServicesFactory.java
@@ -4,19 +4,22 @@ import com.google.inject.Inject;
 import com.google.inject.Singleton;
 import com.google.inject.name.Named;
 import lombok.SneakyThrows;
+import nu.marginalia.util.array.LongArray;
 import nu.marginalia.util.dict.DictionaryHashMap;
-import nu.marginalia.wmsa.edge.data.dao.task.EdgeDomainBlacklist;
-import nu.marginalia.wmsa.edge.index.conversion.ConversionUnnecessaryException;
-import nu.marginalia.wmsa.edge.index.conversion.SearchIndexConverter;
-import nu.marginalia.wmsa.edge.index.conversion.SearchIndexPartitioner;
-import nu.marginalia.wmsa.edge.index.conversion.SearchIndexPreconverter;
-import nu.marginalia.wmsa.edge.index.journal.SearchIndexJournalWriterImpl;
+import nu.marginalia.wmsa.edge.dbcommon.EdgeDomainBlacklist;
 import nu.marginalia.wmsa.edge.index.lexicon.KeywordLexicon;
 import nu.marginalia.wmsa.edge.index.lexicon.KeywordLexiconReadOnlyView;
 import nu.marginalia.wmsa.edge.index.lexicon.journal.KeywordLexiconJournal;
-import nu.marginalia.wmsa.edge.index.model.IndexBlock;
-import nu.marginalia.wmsa.edge.index.reader.SearchIndex;
-import nu.marginalia.wmsa.edge.index.reader.SearchIndexReader;
+import nu.marginalia.wmsa.edge.index.postings.SearchIndex;
+import nu.marginalia.wmsa.edge.index.postings.SearchIndexReader;
+import nu.marginalia.wmsa.edge.index.postings.forward.ForwardIndexConverter;
+import nu.marginalia.wmsa.edge.index.postings.forward.ForwardIndexReader;
+import nu.marginalia.wmsa.edge.index.postings.journal.reader.SearchIndexJournalReaderSingleFile;
+import nu.marginalia.wmsa.edge.index.postings.journal.writer.SearchIndexJournalWriterImpl;
+import nu.marginalia.wmsa.edge.index.postings.reverse.ReverseIndexConverter;
+import nu.marginalia.wmsa.edge.index.postings.reverse.ReverseIndexPrioReader;
+import nu.marginalia.wmsa.edge.index.postings.reverse.ReverseIndexPriorityParameters;
+import nu.marginalia.wmsa.edge.index.postings.reverse.ReverseIndexReader;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
@@ -25,13 +28,8 @@ import java.io.IOException;
 import java.nio.file.Files;
 import java.nio.file.Path;
 import java.nio.file.StandardCopyOption;
-import java.util.EnumMap;
-import java.util.HashMap;
-import java.util.Map;
 import java.util.concurrent.Callable;
 
-import static nu.marginalia.wmsa.edge.index.EdgeIndexService.DYNAMIC_BUCKET_LENGTH;
-
 @Singleton
 public class IndexServicesFactory {
     private final Path tmpFileDir;
@@ -41,44 +39,55 @@ public class IndexServicesFactory {
 
     private final PartitionedDataFile writerIndexFile;
     private final RootDataFile keywordLexiconFile;
-    private final DoublePartitionedDataFile preconverterOutputFile;
-    private final DoublePartitionedDataFile indexReadWordsFile;
-    private final DoublePartitionedDataFile indexReadUrlsFile;
-    private final DoublePartitionedDataFile indexWriteWordsFile;
-    private final DoublePartitionedDataFile indexWriteUrlsFile;
+
+    private final PartitionedDataFile fwdIndexDocId;
+    private final PartitionedDataFile fwdIndexDocData;
+    private final PartitionedDataFile revIndexDoc;
+    private final PartitionedDataFile revIndexWords;
+
+    private final PartitionedDataFile revPrioIndexDoc;
+    private final PartitionedDataFile revPrioIndexWords;
+
 
     private volatile static KeywordLexicon keywordLexicon;
     private final Long dictionaryHashMapSize;
-    private final SearchIndexPartitioner partitioner;
+    private final Path searchSetsBase;
+
+    int LIVE_PART = 0;
+
+    int NEXT_PART = 1;
+
 
     @Inject
     public IndexServicesFactory(
            @Named("tmp-file-dir") Path tmpFileDir,
            @Named("partition-root-slow") Path partitionRootSlow,
-           @Named("partition-root-slow-tmp") Path partitionRootSlowTmp,
            @Named("partition-root-fast") Path partitionRootFast,
-           @Named("edge-writer-page-index-file") String writerIndexFile,
-           @Named("edge-writer-dictionary-file") String keywordLexiconFile,
-           @Named("edge-index-read-words-file") String indexReadWordsFile,
-           @Named("edge-index-read-urls-file") String indexReadUrlsFile,
-           @Named("edge-index-write-words-file") String indexWriteWordsFile,
-           @Named("edge-index-write-urls-file") String indexWriteUrlsFile,
            @Named("edge-dictionary-hash-map-size") Long dictionaryHashMapSize,
-           EdgeDomainBlacklist domainBlacklist,
-           SearchIndexPartitioner partitioner
-           ) {
+           EdgeDomainBlacklist domainBlacklist
+           ) throws IOException {
 
         this.tmpFileDir = tmpFileDir;
         this.dictionaryHashMapSize = dictionaryHashMapSize;
         this.domainBlacklist = domainBlacklist;
 
-        this.writerIndexFile = new PartitionedDataFile(partitionRootSlow, writerIndexFile);
-        this.keywordLexiconFile = new RootDataFile(partitionRootSlow, keywordLexiconFile);
-        this.indexReadWordsFile = new DoublePartitionedDataFile(partitionRootFast, indexReadWordsFile);
-        this.indexReadUrlsFile = new DoublePartitionedDataFile(partitionRootFast, indexReadUrlsFile);
-        this.indexWriteWordsFile = new DoublePartitionedDataFile(partitionRootFast, indexWriteWordsFile);
-        this.indexWriteUrlsFile = new DoublePartitionedDataFile(partitionRootFast, indexWriteUrlsFile);
-        this.preconverterOutputFile = new DoublePartitionedDataFile(partitionRootSlowTmp, "preconverted.dat");
-        this.partitioner = partitioner;
+        this.writerIndexFile = new PartitionedDataFile(partitionRootSlow, "page-index.dat");
+        this.keywordLexiconFile = new RootDataFile(partitionRootSlow, "dictionary.dat");
+
+        fwdIndexDocId = new PartitionedDataFile(partitionRootFast, "fwd-doc-id.dat");
+        fwdIndexDocData = new PartitionedDataFile(partitionRootFast, "fwd-doc-data.dat");
+
+        revIndexDoc = new PartitionedDataFile(partitionRootFast, "rev-doc.dat");
+        revIndexWords = new PartitionedDataFile(partitionRootFast, "rev-words.dat");
+
+        revPrioIndexDoc = new PartitionedDataFile(partitionRootFast, "rev-prio-doc.dat");
+        revPrioIndexWords = new PartitionedDataFile(partitionRootFast, "rev-prio-words.dat");
+
+        searchSetsBase = partitionRootSlow.resolve("search-sets");
+        if (!Files.isDirectory(searchSetsBase)) {
+            Files.createDirectory(searchSetsBase);
+        }
+    }
+
+    public Path getSearchSetsBase() {
+        return searchSetsBase;
     }
 
     public SearchIndexJournalWriterImpl getIndexWriter(int idx) {
@@ -89,8 +98,7 @@ public class IndexServicesFactory {
     public KeywordLexicon getKeywordLexicon() {
         if (keywordLexicon == null) {
             final var journal = new KeywordLexiconJournal(keywordLexiconFile.get());
-            keywordLexicon = new KeywordLexicon(journal,
-                    new DictionaryHashMap(dictionaryHashMapSize));
+            keywordLexicon = new KeywordLexicon(journal, new DictionaryHashMap(dictionaryHashMapSize));
         }
         return keywordLexicon;
     }
@@ -101,85 +109,105 @@ public class IndexServicesFactory {
 
     }
 
-    public void convertIndex(int id, IndexBlock block) throws ConversionUnnecessaryException, IOException {
-        var converter = new SearchIndexConverter(block, id, tmpFileDir,
-                preconverterOutputFile.get(id, block),
-                indexWriteWordsFile.get(id, block),
-                indexWriteUrlsFile.get(id, block),
-                partitioner,
-                domainBlacklist
-        );
+    public void convertIndex() throws IOException {
+        convertForwardIndex();
+        convertFullReverseIndex();
+        convertPriorityReverseIndex();
+
+
+    }
+
+    private void convertFullReverseIndex() throws IOException {
+
+        logger.info("Converting full reverse index");
+
+        var longArray = LongArray.mmapRead(writerIndexFile.get(0).toPath());
+        var journalReader = new SearchIndexJournalReaderSingleFile(longArray);
+        var converter = new ReverseIndexConverter(tmpFileDir,
+                journalReader,
+                revIndexWords.get(NEXT_PART).toPath(),
+                revIndexDoc.get(NEXT_PART).toPath());
 
         converter.convert();
     }
 
-    @SneakyThrows
-    public SearchIndexPreconverter getIndexPreconverter() {
-        Map<SearchIndexPreconverter.Shard, File> shards = new HashMap<>();
+    private void convertPriorityReverseIndex() throws IOException {
 
-        for (int index = 0; index < (DYNAMIC_BUCKET_LENGTH + 1); index++) {
-            for (IndexBlock block : IndexBlock.values()) {
-                shards.put(new SearchIndexPreconverter.Shard(index, block.ordinal()), getPreconverterOutputFile(index, block));
-            }
-        }
+        logger.info("Converting priority reverse index");
 
-        return new SearchIndexPreconverter(writerIndexFile.get(0),
-                shards,
-                partitioner,
-                domainBlacklist
+
+        var longArray = LongArray.mmapRead(writerIndexFile.get(0).toPath());
+
+        var journalReader = new SearchIndexJournalReaderSingleFile(longArray, null, ReverseIndexPriorityParameters::filterPriorityRecord);
+
+        var converter = new ReverseIndexConverter(tmpFileDir,
+                journalReader,
+                revPrioIndexWords.get(NEXT_PART).toPath(),
+                revPrioIndexDoc.get(NEXT_PART).toPath());
+
+        converter.convert();
+    }
+
+    private void convertForwardIndex() throws IOException {
+        logger.info("Converting forward index data");
+
+        new ForwardIndexConverter(tmpFileDir,
+                writerIndexFile.get(0),
+                fwdIndexDocId.get(NEXT_PART).toPath(),
+                fwdIndexDocData.get(NEXT_PART).toPath())
+                .convert();
+    }
+
+
+    public ReverseIndexReader getReverseIndexReader() throws IOException {
+        return new ReverseIndexReader(
+                revIndexWords.get(LIVE_PART).toPath(),
+                revIndexDoc.get(LIVE_PART).toPath());
+    }
+    public ReverseIndexPrioReader getReverseIndexPrioReader() throws IOException {
+        return new ReverseIndexPrioReader(
+                revPrioIndexWords.get(LIVE_PART).toPath(),
+                revPrioIndexDoc.get(LIVE_PART).toPath());
+    }
+    public ForwardIndexReader getForwardIndexReader() throws IOException {
+        return new ForwardIndexReader(
+                fwdIndexDocId.get(LIVE_PART).toPath(),
+                fwdIndexDocData.get(LIVE_PART).toPath()
         );
     }
 
-    private File getPreconverterOutputFile(int index, IndexBlock block) {
-        return preconverterOutputFile.get(index, block);
-    }
-
-    @SneakyThrows
-    public SearchIndexReader getIndexReader(int id) {
-        EnumMap<IndexBlock, SearchIndex> indexMap = new EnumMap<>(IndexBlock.class);
-        for (IndexBlock block : IndexBlock.values()) {
-            try {
-                indexMap.put(block, createSearchIndex(id, block));
-            }
-            catch (Exception ex) {
-                logger.error("Could not create index {}-{} ({})", id, block, ex.getMessage());
-            }
-        }
-        return new SearchIndexReader(indexMap);
-    }
-
-    private SearchIndex createSearchIndex(int bucketId, IndexBlock block) {
-        try {
-            return new SearchIndex("IndexReader"+bucketId+":"+ block.name(),
-                    indexReadUrlsFile.get(bucketId, block),
-                    indexReadWordsFile.get(bucketId, block));
-        } catch (IOException e) {
-            throw new RuntimeException(e);
-        }
-    }
-
-    public Callable<Boolean> switchFilesJob(int id) {
+    public Callable<Boolean> switchFilesJob() {
         return () -> {
-            for (var block : IndexBlock.values()) {
-                if (Files.exists(indexWriteWordsFile.get(id, block).toPath()) &&
-                    Files.exists(indexWriteUrlsFile.get(id, block).toPath())) {
-                    Files.move(
-                            indexWriteWordsFile.get(id, block).toPath(),
-                            indexReadWordsFile.get(id, block).toPath(),
-                            StandardCopyOption.REPLACE_EXISTING);
-                    Files.move(
-                            indexWriteUrlsFile.get(id, block).toPath(),
-                            indexReadUrlsFile.get(id, block).toPath(),
-                            StandardCopyOption.REPLACE_EXISTING);
-                }
-            }
+            switchFile(revIndexDoc.get(NEXT_PART).toPath(), revIndexDoc.get(LIVE_PART).toPath());
+            switchFile(revIndexWords.get(NEXT_PART).toPath(), revIndexWords.get(LIVE_PART).toPath());
+
+            switchFile(revPrioIndexDoc.get(NEXT_PART).toPath(), revPrioIndexDoc.get(LIVE_PART).toPath());
+            switchFile(revPrioIndexWords.get(NEXT_PART).toPath(), revPrioIndexWords.get(LIVE_PART).toPath());
+
+            switchFile(fwdIndexDocId.get(NEXT_PART).toPath(), fwdIndexDocId.get(LIVE_PART).toPath());
+            switchFile(fwdIndexDocData.get(NEXT_PART).toPath(), fwdIndexDocData.get(LIVE_PART).toPath());
 
             return true;
         };
     }
 
-    public EdgeIndexBucket createIndexBucket(int id) {
-        return new EdgeIndexBucket(this, new EdgeIndexControl(this), id);
+    public void switchFile(Path from, Path to) throws IOException {
+        if (Files.exists(from)) {
+            Files.move(from, to, StandardCopyOption.REPLACE_EXISTING);
+        }
+    }
+
+    public SearchIndex createIndexBucket() {
+        return new SearchIndex(this, new EdgeIndexControl(this));
+    }
+
+    public SearchIndexReader getSearchIndexReader() throws IOException {
+        return new SearchIndexReader(
+                getForwardIndexReader(),
+                getReverseIndexReader(),
+                getReverseIndexPrioReader()
+        );
+    }
 }
 
@@ -214,29 +242,5 @@ class PartitionedDataFile {
         }
         return partitionDir.resolve(pattern).toFile();
     }
-}
-
-class DoublePartitionedDataFile {
-    private final Path partition;
-    private final String pattern;
-
-    DoublePartitionedDataFile(Path partition, String pattern) {
-        this.partition = partition;
-        this.pattern = pattern;
-    }
-
-    public File get(Object id, Object id2) {
-        Path partitionDir = partition.resolve(id.toString());
-
-        if (!partitionDir.toFile().exists()) {
-            partitionDir.toFile().mkdir();
-        }
-        partitionDir = partitionDir.resolve(id2.toString());
-        if (!partitionDir.toFile().exists()) {
-            partitionDir.toFile().mkdir();
-        }
-
-        return partitionDir.resolve(pattern).toFile();
-    }
 }
\ No newline at end of file
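IndexServicesFactory above now writes every converted file into a NEXT_PART partition and promotes it with switchFilesJob(), which moves each finished file over its LIVE_PART counterpart. The core move is just this (a sketch of the same switchFile shape shown in the hunk):

import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.StandardCopyOption;

final class PartitionSwitchSketch {
    // Promote a freshly converted file over the live one; on the same
    // filesystem, Files.move with REPLACE_EXISTING is usually a cheap rename.
    static void switchFile(Path next, Path live) throws IOException {
        if (Files.exists(next)) {
            Files.move(next, live, StandardCopyOption.REPLACE_EXISTING);
        }
    }
}

Converting into the next partition and only promoting at the end means a failed or interrupted conversion leaves the live index untouched.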
diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/client/EdgeIndexClient.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/client/EdgeIndexClient.java
index 157e3fb8..32094fd9 100644
--- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/client/EdgeIndexClient.java
+++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/client/EdgeIndexClient.java
@@ -8,6 +8,7 @@ import nu.marginalia.wmsa.client.AbstractDynamicClient;
 import nu.marginalia.wmsa.configuration.ServiceDescriptor;
 import nu.marginalia.wmsa.configuration.server.Context;
 import nu.marginalia.wmsa.edge.converting.interpreter.instruction.DocumentKeywords;
+import nu.marginalia.wmsa.edge.index.model.EdgePageDocumentsMetadata;
 import nu.marginalia.wmsa.edge.model.EdgeDomain;
 import nu.marginalia.wmsa.edge.model.EdgeUrl;
 import nu.marginalia.wmsa.edge.model.id.EdgeId;
@@ -34,6 +35,7 @@ public class EdgeIndexClient extends AbstractDynamicClient implements EdgeIndexW
 
     @Override
     public void putWords(Context ctx, EdgeId<EdgeDomain> domain, EdgeId<EdgeUrl> url,
+                         EdgePageDocumentsMetadata metadata,
                          DocumentKeywords wordSet, int writer
                          ) {
 
@@ -42,10 +44,10 @@ public class EdgeIndexClient extends AbstractDynamicClient implements EdgeIndexW
                 IndexPutKeywordsReq.newBuilder()
                         .setDomain(domain.id())
                         .setUrl(url.id())
+                        .setMetadata(metadata.encode())
                         .setIndex(writer);
 
         var wordSetBuilder = IndexPutKeywordsReq.WordSet.newBuilder();
-        wordSetBuilder.setIndex(wordSet.block().ordinal());
         wordSetBuilder.addAllWords(List.of(wordSet.keywords()));
 
         for (var meta : wordSet.metadata()) {
             wordSetBuilder.addMeta(meta);
diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/client/EdgeIndexLocalService.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/client/EdgeIndexLocalService.java
index 43ec70f7..00e518e3 100644
--- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/client/EdgeIndexLocalService.java
+++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/client/EdgeIndexLocalService.java
@@ -3,15 +3,16 @@ package nu.marginalia.wmsa.edge.index.client;
 import com.google.inject.Inject;
 import com.google.inject.Singleton;
 import com.google.inject.name.Named;
-import nu.marginalia.util.ListChunker;
 import nu.marginalia.util.dict.DictionaryHashMap;
 import nu.marginalia.wmsa.configuration.server.Context;
 import nu.marginalia.wmsa.edge.converting.interpreter.instruction.DocumentKeywords;
-import nu.marginalia.wmsa.edge.index.journal.SearchIndexJournalWriterImpl;
-import nu.marginalia.wmsa.edge.index.journal.model.SearchIndexJournalEntry;
-import nu.marginalia.wmsa.edge.index.journal.model.SearchIndexJournalEntryHeader;
+import nu.marginalia.wmsa.edge.converting.interpreter.instruction.KeywordListChunker;
 import nu.marginalia.wmsa.edge.index.lexicon.KeywordLexicon;
 import nu.marginalia.wmsa.edge.index.lexicon.journal.KeywordLexiconJournal;
+import nu.marginalia.wmsa.edge.index.model.EdgePageDocumentsMetadata;
+import nu.marginalia.wmsa.edge.index.postings.journal.model.SearchIndexJournalEntry;
+import nu.marginalia.wmsa.edge.index.postings.journal.model.SearchIndexJournalEntryHeader;
+import nu.marginalia.wmsa.edge.index.postings.journal.writer.SearchIndexJournalWriterImpl;
 import nu.marginalia.wmsa.edge.model.EdgeDomain;
 import nu.marginalia.wmsa.edge.model.EdgeUrl;
 import nu.marginalia.wmsa.edge.model.id.EdgeId;
@@ -43,6 +44,7 @@ public class EdgeIndexLocalService implements EdgeIndexWriterClient {
     }
 
     public void putWords(Context ctx, EdgeId<EdgeDomain> domain, EdgeId<EdgeUrl> url,
+                         EdgePageDocumentsMetadata metadata,
                          DocumentKeywords wordSet, int writer) {
 
         if (wordSet.keywords().length == 0) return;
@@ -52,10 +54,10 @@ public class EdgeIndexLocalService implements EdgeIndexWriterClient {
             return;
         }
 
-        for (var chunk : ListChunker.chopList(wordSet, SearchIndexJournalEntry.MAX_LENGTH)) {
+        for (var chunk : KeywordListChunker.chopList(wordSet, SearchIndexJournalEntry.MAX_LENGTH)) {
 
             var entry = new SearchIndexJournalEntry(getOrInsertWordIds(chunk.keywords(), chunk.metadata()));
-            var header = new SearchIndexJournalEntryHeader(domain, url, wordSet.block());
+            var header = new SearchIndexJournalEntryHeader(domain, url, metadata.encode());
 
             indexWriter.put(header, entry);
         }
diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/client/EdgeIndexWriterClient.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/client/EdgeIndexWriterClient.java
index 81623535..ff405e7a 100644
--- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/client/EdgeIndexWriterClient.java
+++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/client/EdgeIndexWriterClient.java
@@ -2,12 +2,13 @@ package nu.marginalia.wmsa.edge.index.client;
 
 import nu.marginalia.wmsa.configuration.server.Context;
 import nu.marginalia.wmsa.edge.converting.interpreter.instruction.DocumentKeywords;
+import nu.marginalia.wmsa.edge.index.model.EdgePageDocumentsMetadata;
 import nu.marginalia.wmsa.edge.model.EdgeDomain;
 import nu.marginalia.wmsa.edge.model.EdgeUrl;
 import nu.marginalia.wmsa.edge.model.id.EdgeId;
 
 public interface EdgeIndexWriterClient extends AutoCloseable {
 
-    void putWords(Context ctx, EdgeId<EdgeDomain> domain, EdgeId<EdgeUrl> url,
+    void putWords(Context ctx, EdgeId<EdgeDomain> domain, EdgeId<EdgeUrl> url, EdgePageDocumentsMetadata metadata,
                   DocumentKeywords wordSets, int writer);
 
 }
diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/conversion/ConversionUnnecessaryException.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/conversion/ConversionUnnecessaryException.java
deleted file mode 100644
index 2242f476..00000000
--- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/conversion/ConversionUnnecessaryException.java
+++ /dev/null
@@ -1,10 +0,0 @@
-package nu.marginalia.wmsa.edge.index.conversion;
-
-public class ConversionUnnecessaryException extends Exception {
-    public ConversionUnnecessaryException() {
-
-    }
-
-    @Override
-    public StackTraceElement[] getStackTrace() { return new StackTraceElement[0]; }
-}
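With the index client changes above, putWords now carries an EdgePageDocumentsMetadata whose encode() packs the document metadata into a single long that rides along in the journal entry header, replacing the old per-block index. A hypothetical call site against the new interface (the types are the repository's own; the variable names and the writer index 0 are illustrative):

// Sketch only: assumes Context, EdgeId, EdgePageDocumentsMetadata and
// DocumentKeywords from the repository are in scope.
void index(EdgeIndexWriterClient client, Context ctx,
           EdgeId<EdgeDomain> domain, EdgeId<EdgeUrl> url,
           EdgePageDocumentsMetadata metadata, DocumentKeywords keywords) {
    client.putWords(ctx, domain, url, metadata, keywords, 0);
}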
a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/conversion/SearchEngineRanking.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/conversion/SearchEngineRanking.java deleted file mode 100644 index 220a9708..00000000 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/conversion/SearchEngineRanking.java +++ /dev/null @@ -1,56 +0,0 @@ -package nu.marginalia.wmsa.edge.index.conversion; - -import gnu.trove.list.TIntList; -import gnu.trove.map.hash.TIntIntHashMap; -import gnu.trove.set.hash.TIntHashSet; - -import static nu.marginalia.wmsa.edge.index.EdgeIndexService.DYNAMIC_BUCKET_LENGTH; - -public class SearchEngineRanking { - - private final TIntIntHashMap domainToId - = new TIntIntHashMap(1_000_000, 0.5f, -1, Integer.MAX_VALUE); - - private final TIntHashSet[] domainToBucket = new TIntHashSet[DYNAMIC_BUCKET_LENGTH+1]; - - private final int offset; - private final double[] limits; - - public SearchEngineRanking(int offset, TIntList domains, double... limits) { - this.offset = offset; - this.limits = limits; - - for (int i = offset; i < offset+limits.length; i++) { - domainToBucket[i] = new TIntHashSet(100, 0.5f, DYNAMIC_BUCKET_LENGTH); - } - - for (int i = 0; i < domains.size(); i++) { - double relPortion = i / (double) domains.size(); - - for (int limit = 0; limit < limits.length; limit++) { - if (relPortion < limits[limit]) { - domainToBucket[limit+offset].add(domains.get(i)); - break; - } - } - - domainToId.put(domains.get(i), i); - } - } - - public boolean ownsBucket(int bucketId) { - return bucketId >= offset && bucketId < offset + limits.length; - } - - public boolean hasBucket(int bucket, int domain) { - var set = domainToBucket[bucket]; - if (set == null) { - return false; - } - return set.contains(domain); - } - - public int translateId(int id) { - return domainToId.get(id); - } -} diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/conversion/SearchIndexConverter.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/conversion/SearchIndexConverter.java deleted file mode 100644 index d78ef51a..00000000 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/conversion/SearchIndexConverter.java +++ /dev/null @@ -1,223 +0,0 @@ -package nu.marginalia.wmsa.edge.index.conversion; - -import nu.marginalia.util.RandomWriteFunnel; -import nu.marginalia.util.btree.BTreeWriter; -import nu.marginalia.util.btree.model.BTreeContext; -import nu.marginalia.util.multimap.MultimapFileLong; -import nu.marginalia.wmsa.edge.data.dao.task.EdgeDomainBlacklist; -import nu.marginalia.wmsa.edge.index.conversion.words.WordIndexOffsetsTable; -import nu.marginalia.wmsa.edge.index.conversion.words.WordsTableWriter; -import nu.marginalia.wmsa.edge.index.journal.SearchIndexJournalReader; -import nu.marginalia.wmsa.edge.index.journal.model.SearchIndexJournalEntry; -import nu.marginalia.wmsa.edge.index.model.IndexBlock; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -import java.io.File; -import java.io.IOException; -import java.io.RandomAccessFile; -import java.nio.channels.FileChannel; -import java.nio.file.Files; -import java.nio.file.Path; - -public class SearchIndexConverter { - public static final int ENTRY_URL_OFFSET = 0; - public static final int ENTRY_METADATA_OFFSET = 1; - public static final int ENTRY_SIZE = 2; - - public static final BTreeContext urlsBTreeContext = new BTreeContext(5, ENTRY_SIZE, ~0, 8); - - private final long[] tmpWordsBuffer = SearchIndexJournalReader.createAdequateTempBuffer(); - - private final 
Path tmpFileDir; - - private final Logger logger = LoggerFactory.getLogger(getClass()); - private final IndexBlock block; - private final int bucketId; - - private final File inputFile; - private final File outputFileWords; - private final File outputFileUrls; - - private final SearchIndexPartitioner partitioner; - private final EdgeDomainBlacklist blacklist; - - private final static int internalSortLimit = - Boolean.getBoolean("small-ram") ? 1024*1024 : 1024*1024*256; - - public SearchIndexConverter(IndexBlock block, - int bucketId, - Path tmpFileDir, - File inputFile, - File outputFileWords, - File outputFileUrls, - SearchIndexPartitioner partitioner, - EdgeDomainBlacklist blacklist) - { - this.block = block; - this.bucketId = bucketId; - this.tmpFileDir = tmpFileDir; - this.inputFile = inputFile; - this.outputFileWords = outputFileWords; - this.outputFileUrls = outputFileUrls; - this.partitioner = partitioner; - this.blacklist = blacklist; - } - - public void convert() throws IOException { - Files.deleteIfExists(outputFileWords.toPath()); - Files.deleteIfExists(outputFileUrls.toPath()); - - SearchIndexJournalReader journalReader = new SearchIndexJournalReader(MultimapFileLong.forReading(inputFile.toPath())); - - if (journalReader.fileHeader.fileSize() <= SearchIndexJournalReader.FILE_HEADER_SIZE_BYTES) { - return; - } - - logger.info("Converting {} ({}) {} {}", block.ordinal(), block, inputFile, journalReader.fileHeader); - - var lock = partitioner.getReadLock(); - try { - lock.lock(); - - var tmpUrlsFile = Files.createTempFile(tmpFileDir, "urls-sorted", ".dat"); - - logger.info("Creating word index table {} for block {}", outputFileWords, block.ordinal()); - WordIndexOffsetsTable wordIndexTable = createWordIndexTable(journalReader, outputFileWords); - - logger.info("Creating word urls table {} for block {}", outputFileUrls, block.ordinal()); - createUrlTable(journalReader, tmpUrlsFile, wordIndexTable); - - Files.delete(tmpUrlsFile); - } - catch (IOException ex) { - logger.error("Failed to convert", ex); - throw ex; - } - finally { - lock.unlock(); - } - } - - private WordIndexOffsetsTable createWordIndexTable(SearchIndexJournalReader journalReader, - File outputFileWords) throws IOException - { - final int topWord = (int) journalReader.fileHeader.wordCount(); - - WordsTableWriter wordsTableWriter = new WordsTableWriter(topWord); - - for (var entry : journalReader) { - if (!isRelevantEntry(entry)) { - continue; - } - - final SearchIndexJournalEntry entryData = entry.readEntryUsingBuffer(tmpWordsBuffer); - - for (var record : entryData) { - int wordId = record.wordId(); - if (wordId < 0 || wordId >= topWord) { - logger.warn("Bad word {}", record); - } - wordsTableWriter.acceptWord(wordId); - } - } - - wordsTableWriter.write(outputFileWords); - - return wordsTableWriter.getTable(); - } - - private void createUrlTable(SearchIndexJournalReader journalReader, - Path tmpUrlsFile, - WordIndexOffsetsTable wordOffsetsTable) throws IOException - { - long numberOfWordsTotal = 0; - for (var entry : journalReader) { - if (isRelevantEntry(entry)) - numberOfWordsTotal += entry.wordCount(); - } - - try (RandomAccessFile urlsTmpFileRAF = new RandomAccessFile(tmpUrlsFile.toFile(), "rw"); - FileChannel urlsTmpFileChannel = urlsTmpFileRAF.getChannel()) { - - try (RandomWriteFunnel rwf = new RandomWriteFunnel(tmpFileDir, ENTRY_SIZE * numberOfWordsTotal, 10_000_000)) { - int[] wordWriteOffset = new int[wordOffsetsTable.length()]; - - for (var entry : journalReader) { - if (!isRelevantEntry(entry)) continue; 
- - var entryData = entry.readEntryUsingBuffer(tmpWordsBuffer); - - for (var record : entryData) { - int wordId = record.wordId(); - long metadata = record.metadata(); - - if (wordId >= wordWriteOffset.length) { - logger.warn("Overflowing wordId {}", wordId); - continue; - } - - if (wordId < 0) { - logger.warn("Negative wordId {}", wordId); - } - - final long urlInternal = translateUrl(entry.docId()); - - long offset; - if (wordId > 0) offset = wordOffsetsTable.get(wordId - 1) + wordWriteOffset[wordId]; - else offset = wordWriteOffset[wordId]; - - rwf.put(offset + ENTRY_URL_OFFSET, urlInternal); - rwf.put(offset + ENTRY_METADATA_OFFSET, metadata); - - wordWriteOffset[wordId] += ENTRY_SIZE; - } - } - - rwf.write(urlsTmpFileChannel); - } - - urlsTmpFileChannel.force(false); - - try (var urlsTmpFileMap = MultimapFileLong.forOutput(tmpUrlsFile, numberOfWordsTotal)) { - if (wordOffsetsTable.length() > 0) { - var urlTmpFileSorter = urlsTmpFileMap.createSorter(tmpFileDir, internalSortLimit, ENTRY_SIZE); - - wordOffsetsTable.forEachRange(urlTmpFileSorter::sortRange); - - urlsTmpFileMap.force(); - } else { - logger.warn("urls table empty -- nothing to sort"); - } - } - - try (var urlsFileMap = MultimapFileLong.forOutput(outputFileUrls.toPath(), numberOfWordsTotal)) { - var writer = new BTreeWriter(urlsFileMap, urlsBTreeContext); - - wordOffsetsTable.foldRanges((accumulatorIdx, start, length) -> { - // Note: The return value is accumulated into accumulatorIdx! - - return writer.write(accumulatorIdx, length/ENTRY_SIZE, - slice -> slice.transferFromFileChannel(urlsTmpFileChannel, 0, start, start + length)); - }); - - } catch (Exception e) { - logger.error("Error while writing BTree", e); - } - - } - } - - private long translateUrl(long url) { - int domainId = partitioner.translateId(bucketId, (int) (url >>> 32)); - return ((long)domainId << 32) | (url & 0xFFFFFFFFL); - } - - private boolean isRelevantEntry(SearchIndexJournalReader.JournalEntry entry) { - return block.equals(entry.header.block()) - && !blacklist.isBlacklisted(entry.domainId()) - && partitioner.filterUnsafe(entry.domainId(), bucketId); - } - -} - diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/conversion/SearchIndexPartitioner.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/conversion/SearchIndexPartitioner.java deleted file mode 100644 index 8d7f19eb..00000000 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/conversion/SearchIndexPartitioner.java +++ /dev/null @@ -1,167 +0,0 @@ -package nu.marginalia.wmsa.edge.index.conversion; - -import com.google.inject.Inject; -import com.google.inject.Singleton; -import gnu.trove.set.hash.TIntHashSet; -import lombok.SneakyThrows; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -import java.util.concurrent.locks.Lock; -import java.util.concurrent.locks.ReadWriteLock; -import java.util.concurrent.locks.ReentrantReadWriteLock; - -import static nu.marginalia.wmsa.edge.index.EdgeIndexService.DYNAMIC_BUCKET_LENGTH; - -@Singleton -public class SearchIndexPartitioner { - - private final Logger logger = LoggerFactory.getLogger(getClass()); - private final PartitionSet partitionSet; - - private SearchEngineRanking retroRanking = null; - private SearchEngineRanking smallWebRanking = null; - private SearchEngineRanking standardRanking = null; - private SearchEngineRanking specialDomainRanking = null; - private SearchEngineRanking academiaRanking = null; - - private volatile TIntHashSet goodUrls; - - private final SearchIndexDao dao; - private 
final ReadWriteLock rwl = new ReentrantReadWriteLock(); - - @Inject - public SearchIndexPartitioner(SearchIndexDao dao) { - this.dao = dao; - - if (null == dao) { - partitionSet = this::yesFilter; - } - else { - partitionSet = this::byPartitionTable; - } - } - - public boolean isBusy() { - var readLock = rwl.readLock(); - try { - return !readLock.tryLock(); - } - finally { - readLock.unlock(); - } - } - - public void reloadPartitions() { - if (dao == null) { - logger.info("No dao = no partition table"); - return; - } - - logger.info("Fetching URLs"); - - if (goodUrls != null) { - goodUrls.clear(); - } - goodUrls = dao.goodUrls(); - - logger.info("Fetching domains"); - - var retroDomains = dao.getRetroDomains(); - var smallWebDomains = dao.getSmallWebDomains(); - var academiaDomains = dao.getAcademiaDomains(); - var standardDomains = dao.getStandardDomains(); - var specialDomains = dao.getSpecialDomains(); - - logger.info("Got {} retro domains", retroDomains.size()); - logger.info("Got {} small domains", smallWebDomains.size()); - logger.info("Got {} academia domains", academiaDomains.size()); - logger.info("Got {} standard domains", standardDomains.size()); - logger.info("Got {} special domains", specialDomains.size()); - - var lock = rwl.writeLock(); - try { - lock.lock(); - retroRanking = new SearchEngineRanking(0, retroDomains, 0.2, 1); - smallWebRanking = new SearchEngineRanking(2, smallWebDomains, 0.15); - academiaRanking = new SearchEngineRanking(3, academiaDomains, 1); - standardRanking = new SearchEngineRanking(4, standardDomains, 0.2, 1); - specialDomainRanking = new SearchEngineRanking(6, specialDomains, 1); - logger.info("Finished building partitions table"); - } - finally { - lock.unlock(); - } - } - - public boolean isGoodUrl(int urlId) { - if (goodUrls == null) - return true; - return goodUrls.contains(urlId); - } - - private boolean yesFilter(int domainId, int bucketId) { - return true; - } - private boolean byPartitionTable(int domainId, int bucketId) { - if (retroRanking.hasBucket(bucketId, domainId)) - return true; - if (smallWebRanking.hasBucket(bucketId, domainId)) - return true; - if (academiaRanking.hasBucket(bucketId, domainId)) - return true; - if (specialDomainRanking.hasBucket(bucketId, domainId)) - return true; - - if (standardRanking.hasBucket(bucketId, domainId)) - return true; - - return DYNAMIC_BUCKET_LENGTH == bucketId; - } - - @SneakyThrows - public Lock getReadLock() { - return rwl.readLock(); - } - public boolean filterUnsafe(int domainId, int bucketId) { - return partitionSet.test(domainId, bucketId); - } - - @Deprecated - public boolean filter(int domainId, int bucketId) { - var lock = rwl.readLock(); - try { - lock.lock(); - return partitionSet.test(domainId, bucketId); - } - finally { - lock.unlock(); - } - } - - public int translateId(int bucketId, int id) { - if (retroRanking != null && retroRanking.ownsBucket(bucketId)) { - return retroRanking.translateId(id); - } - if (smallWebRanking != null && smallWebRanking.ownsBucket(bucketId)) { - return smallWebRanking.translateId(id); - } - if (academiaRanking != null && academiaRanking.ownsBucket(bucketId)) { - return academiaRanking.translateId(id); - } - if (specialDomainRanking != null && specialDomainRanking.ownsBucket(bucketId)) { - return specialDomainRanking.translateId(id); - } - - // standard gets passed traight through - if (standardRanking != null && standardRanking.ownsBucket(bucketId)) { - return id; - } - - return id; - } - - interface PartitionSet { - boolean test(int domainId, int 
bucketId); - } -} diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/conversion/SearchIndexPreconverter.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/conversion/SearchIndexPreconverter.java deleted file mode 100644 index 4b16a817..00000000 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/conversion/SearchIndexPreconverter.java +++ /dev/null @@ -1,117 +0,0 @@ -package nu.marginalia.wmsa.edge.index.conversion; - -import com.google.inject.Inject; -import gnu.trove.set.hash.TIntHashSet; -import lombok.SneakyThrows; -import nu.marginalia.util.multimap.MultimapFileLong; -import nu.marginalia.wmsa.edge.data.dao.task.EdgeDomainBlacklist; -import nu.marginalia.wmsa.edge.index.journal.SearchIndexJournalReader; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -import java.io.File; -import java.io.IOException; -import java.io.RandomAccessFile; -import java.nio.ByteBuffer; -import java.nio.channels.FileChannel; -import java.nio.file.Files; -import java.util.Map; -import java.util.Objects; - -public class SearchIndexPreconverter { - private final Logger logger = LoggerFactory.getLogger(getClass()); - - public record Shard(int bucket, int block) {} - - @SneakyThrows - @Inject - public SearchIndexPreconverter(File inputFile, - Map outputFiles, - SearchIndexPartitioner partitioner, - EdgeDomainBlacklist blacklist) - { - TIntHashSet spamDomains = blacklist.getSpamDomains(); - logger.info("Preconverting {}", inputFile); - - for (File f : outputFiles.values()) { - if (f.exists()) { - Files.deleteIfExists(Objects.requireNonNull(f).toPath()); - } - } - - SearchIndexJournalReader indexJournalReader = new SearchIndexJournalReader(MultimapFileLong.forReading(inputFile.toPath())); - - final long wordCountOriginal = indexJournalReader.fileHeader.wordCount(); - - logger.info("{}", indexJournalReader.fileHeader); - - ShardOutput[] outputs = outputFiles.entrySet().stream() - .map(entry -> ShardOutput.fromFile(entry.getKey(), entry.getValue())) - .toArray(ShardOutput[]::new); - - var lock = partitioner.getReadLock(); - try { - lock.lock(); - ByteBuffer buffer = ByteBuffer.allocateDirect(65536); - for (var entry : indexJournalReader) { - if (!partitioner.isGoodUrl(entry.urlId()) - || spamDomains.contains(entry.domainId())) { - continue; - } - - buffer.clear(); - entry.copyToBuffer(buffer); - - for (ShardOutput output : outputs) { - if (output.shouldWrite(partitioner, entry)) { - buffer.flip(); - - output.write(buffer); - } - } - } - } - finally { - lock.unlock(); - } - logger.info("Finalizing preconversion"); - - for (ShardOutput output : outputs) { - output.finish(wordCountOriginal); - } - } - - private record ShardOutput(Shard shard, RandomAccessFile raf, FileChannel fc) { - public static ShardOutput fromFile(Shard s, File f) { - try { - var v = new RandomAccessFile(f, "rw"); - v.seek(SearchIndexJournalReader.FILE_HEADER_SIZE_BYTES); - return new ShardOutput(s, v, v.getChannel()); - } catch (IOException e) { - throw new RuntimeException(e); - } - } - - public boolean shouldWrite(SearchIndexPartitioner partitioner, SearchIndexJournalReader.JournalEntry entry) { - return shard.block == entry.header.block().ordinal() - && partitioner.filterUnsafe(entry.domainId(), shard.bucket); - } - - public void finish(long wordCountOriginal) throws IOException { - long pos = raf.getFilePointer(); - raf.seek(0); - raf.writeLong(pos); - raf.writeLong(wordCountOriginal); - fc.force(true); - fc.close(); - raf.close(); - } - - public void write(ByteBuffer buffer) throws 
IOException { - while (buffer.position() < buffer.limit()) - fc.write(buffer); - } - }; - -} - diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/conversion/words/WordIndexLengthsTable.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/conversion/words/WordIndexLengthsTable.java deleted file mode 100644 index 464e9388..00000000 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/conversion/words/WordIndexLengthsTable.java +++ /dev/null @@ -1,10 +0,0 @@ -package nu.marginalia.wmsa.edge.index.conversion.words; - -public class WordIndexLengthsTable { - final long[] table; - - public WordIndexLengthsTable(int size) { - this.table = new long[size]; - } - public void increment(int idx) { table[idx]++; } -} diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/conversion/words/WordIndexOffsetsTable.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/conversion/words/WordIndexOffsetsTable.java deleted file mode 100644 index 7a601a4f..00000000 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/conversion/words/WordIndexOffsetsTable.java +++ /dev/null @@ -1,67 +0,0 @@ -package nu.marginalia.wmsa.edge.index.conversion.words; - -import java.io.IOException; - -public class WordIndexOffsetsTable { - final long[] table; - public final int numberOfUsedWords; - - public WordIndexOffsetsTable(long[] table, int numberOfUsedWords) { - - this.table = table; - this.numberOfUsedWords = numberOfUsedWords; - } - - public int length() { - return table.length; - } - - public void forEachRange(OffsetTableEntryConsumer o) throws IOException { - if (table[0] > 0) { - o.accept(0, (int) table[0]); - } - - for (int i = 1; i < table.length; i++) { - long start = table[i-1]; - long end = table[i]; - - if (start != end) { - o.accept(start, end); - } - } - } - - /** - * Fold over each span in the file, left to right, accumulating the return value - */ - public long foldRanges(OffsetTableEntryFoldConsumer o) throws IOException { - long total = 0; - - if (table[0] > 0) { - total = o.accept(total,0, (int) table[0]); - } - - for (int i = 1; i < table.length; i++) { - long start = table[i-1]; - int length = (int) (table[i] - start); - - if (length != 0) { - total += o.accept(total, start, length); - } - } - - return total; - } - - public long get(int i) { - return table[i]; - } - - public interface OffsetTableEntryConsumer { - void accept(long start, long end) throws IOException; - } - - public interface OffsetTableEntryFoldConsumer { - long accept(long accumulator, long start, int length) throws IOException; - } -} diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/conversion/words/WordIndexTables.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/conversion/words/WordIndexTables.java deleted file mode 100644 index 2056948b..00000000 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/conversion/words/WordIndexTables.java +++ /dev/null @@ -1,56 +0,0 @@ -package nu.marginalia.wmsa.edge.index.conversion.words; - -/** Contains a stateful table of word index offsets, initially in lengths mode - * where the table contains how many postings exist for each word; then in offsets - * mode, where the lengths are converted into the necessary offsets for each block - * of document data. - * - * Caveat! This uses the same underlying array to conserve space. 
- * - */ -public class WordIndexTables { - private WordIndexLengthsTable lengthsTable; - private WordIndexOffsetsTable offsetsTable; - - private boolean converted = false; - - public WordIndexTables(int size) { - lengthsTable = new WordIndexLengthsTable(size); - } - - public WordIndexLengthsTable lengths() { - if (converted) throw new IllegalStateException("Table has been converted"); - - return lengthsTable; - } - - public WordIndexOffsetsTable offsets() { - if (!converted) throw new IllegalStateException("Table has not been converted"); - - return offsetsTable; - } - - public void convert() { - if (converted) throw new IllegalStateException("Table has been converted"); - - // Go from lengths to offsets, i.e. - // BEFORE: 1, 2, 1, 3, 0, 2 - // AFTER: 1, 3, 4, 7, 7, 9 - - long[] table = lengthsTable.table; - int numberOfUsedWords = 0; - - if (table[0] != 0) numberOfUsedWords = 1; - - for (int i = 1; i < table.length; i++) { - if (table[i] != 0) { - numberOfUsedWords++; - } - table[i] += table[i-1]; - } - - lengthsTable = null; - offsetsTable = new WordIndexOffsetsTable(table, numberOfUsedWords); - converted = true; - } -} diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/conversion/words/WordsTableWriter.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/conversion/words/WordsTableWriter.java deleted file mode 100644 index d3e54d19..00000000 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/conversion/words/WordsTableWriter.java +++ /dev/null @@ -1,79 +0,0 @@ -package nu.marginalia.wmsa.edge.index.conversion.words; - -import nu.marginalia.util.btree.BTreeWriter; -import nu.marginalia.util.btree.model.BTreeContext; -import nu.marginalia.util.multimap.MultimapFileLong; -import nu.marginalia.util.multimap.MultimapFileLongSlice; -import nu.marginalia.wmsa.edge.index.reader.IndexWordsTable; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -import java.io.File; -import java.io.IOException; - -import static nu.marginalia.wmsa.edge.index.conversion.SearchIndexConverter.ENTRY_SIZE; -import static nu.marginalia.wmsa.edge.index.conversion.SearchIndexConverter.urlsBTreeContext; - -public class WordsTableWriter { - private final WordIndexTables table; - private final Logger logger = LoggerFactory.getLogger(getClass()); - - public static final BTreeContext wordsBTreeContext = new BTreeContext(7, 2, 0x0000_0000_FFFF_FFFFL, 8); - - public WordsTableWriter(int length) { - table = new WordIndexTables(length); - } - - public void acceptWord(int wordId) { - for (int i = 0; i < ENTRY_SIZE; i++) { - table.lengths().increment(wordId); - } - } - - public WordIndexOffsetsTable getTable() { - return table.offsets(); - } - - public void write(File file) throws IOException { - table.convert(); - - logger.info("Writing table - {} max", table.offsets().numberOfUsedWords); - - final int tableSize = table.offsets().numberOfUsedWords; - - try (var mmf = MultimapFileLong.forOutput(file.toPath(), tableSize/8L)) { - mmf.put(0, IndexWordsTable.Strategy.BTREE.ordinal()); - long offset = 1; - - var writer = new BTreeWriter(mmf, wordsBTreeContext); - - writer.write(offset, tableSize, this::writeBTreeDataBlock); - } - } - - private void writeBTreeDataBlock(MultimapFileLongSlice mapSlice) { - long urlFileOffset = 0; - int idx = 0; - - var offsetTable = table.offsets().table; - - if (offsetTable[0] != 0) { - int length = (int) offsetTable[0]; - mapSlice.put(idx++, (long)length<<32); - mapSlice.put(idx++, 0); - - urlFileOffset += (urlsBTreeContext.calculateSize(length / 
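The BEFORE/AFTER comment in `convert()` above is the whole trick: the lengths table becomes an offsets table via an in-place running sum over the same backing array. A minimal standalone sketch of that conversion, using a plain `long[]` rather than the deleted classes:

```java
/** In-place lengths -> offsets conversion, as in WordIndexTables.convert(). */
public class PrefixSumDemo {
    static int convert(long[] table) {
        int usedWords = table[0] != 0 ? 1 : 0;
        for (int i = 1; i < table.length; i++) {
            if (table[i] != 0) usedWords++;
            table[i] += table[i - 1];          // running sum, same backing array
        }
        return usedWords;                      // words with at least one posting
    }

    public static void main(String[] args) {
        long[] t = { 1, 2, 1, 3, 0, 2 };
        int used = convert(t);                 // t is now { 1, 3, 4, 7, 7, 9 }, used == 5
        // Word i's postings occupy [t[i-1], t[i]); the empty range (7, 7) means
        // word 4 had no postings, which is exactly what forEachRange() skips.
    }
}
```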
ENTRY_SIZE)); - } - - for (int i = 1; i < offsetTable.length; i++) { - final int length = (int)(offsetTable[i] - offsetTable[i-1]); - - if (length > 0) { - mapSlice.put(idx++, (long)length << 32 | i); - mapSlice.put(idx++, urlFileOffset); - - urlFileOffset += (urlsBTreeContext.calculateSize(length / ENTRY_SIZE)); - } - } - } -} diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/journal/SearchIndexJournalReader.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/journal/SearchIndexJournalReader.java deleted file mode 100644 index 65448755..00000000 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/journal/SearchIndexJournalReader.java +++ /dev/null @@ -1,132 +0,0 @@ -package nu.marginalia.wmsa.edge.index.journal; - -import com.upserve.uppend.blobs.NativeIO; -import nu.marginalia.util.multimap.MultimapFileLong; -import nu.marginalia.util.multimap.MultimapFileLongSlice; -import nu.marginalia.wmsa.edge.index.journal.model.SearchIndexJournalEntry; -import nu.marginalia.wmsa.edge.index.journal.model.SearchIndexJournalEntryHeader; -import nu.marginalia.wmsa.edge.index.journal.model.SearchIndexJournalFileHeader; -import nu.marginalia.wmsa.edge.index.model.IndexBlock; -import org.jetbrains.annotations.NotNull; - -import java.nio.ByteBuffer; -import java.util.Iterator; - -import static nu.marginalia.wmsa.edge.index.journal.model.SearchIndexJournalEntry.ENTRY_SIZE; -import static nu.marginalia.wmsa.edge.index.journal.model.SearchIndexJournalEntry.MAX_LENGTH; -import static nu.marginalia.wmsa.edge.index.journal.model.SearchIndexJournalEntryHeader.HEADER_SIZE_LONGS; - -public class SearchIndexJournalReader implements Iterable { - public static final long FILE_HEADER_SIZE_LONGS = 2; - public static final long FILE_HEADER_SIZE_BYTES = 8*FILE_HEADER_SIZE_LONGS; - - public final SearchIndexJournalFileHeader fileHeader; - - private final MultimapFileLongSlice map; - private final long committedSize; - - public static long[] createAdequateTempBuffer() { - return new long[MAX_LENGTH*ENTRY_SIZE]; - } - - public SearchIndexJournalReader(MultimapFileLong map) { - fileHeader = new SearchIndexJournalFileHeader(map.get(0), map.get(1)); - committedSize = map.get(0) / 8 - FILE_HEADER_SIZE_LONGS; - - map.advice(NativeIO.Advice.Sequential); - - this.map = map.atOffset(FILE_HEADER_SIZE_LONGS); - } - - @NotNull - @Override - public Iterator iterator() { - return new JournalEntryIterator(); - } - - private class JournalEntryIterator implements Iterator { - private JournalEntry entry; - - @Override - public boolean hasNext() { - if (entry == null) { - return committedSize > 0; - } - - return entry.hasNext(); - } - - @Override - public JournalEntry next() { - if (entry == null) { - entry = new JournalEntry(0); - } - else { - entry = entry.next(); - } - return entry; - } - } - - public class JournalEntry { - private final long offset; - public final SearchIndexJournalEntryHeader header; - - JournalEntry(long offset) { - final long sizeBlock = map.get(offset); - final long docId = map.get(offset + 1); - - this.offset = offset; - this.header = new SearchIndexJournalEntryHeader( - (int)(sizeBlock >>> 32L), - docId, - IndexBlock.byId((int)(sizeBlock & 0xFFFF_FFFFL))); - } - - public boolean hasNext() { - return nextId() < committedSize; - } - public long docId() { - return header.documentId(); - } - public int domainId() { - return (int) (docId() >>> 32L); - } - public int urlId() { - return (int)(docId() & 0xFFFF_FFFFL); - } - public IndexBlock block() { - return header.block(); - 
} - public int wordCount() { return header.entrySize() / ENTRY_SIZE; } - - public SearchIndexJournalEntry readEntry() { - long[] dest = new long[header.entrySize()]; - map.read(dest, offset + HEADER_SIZE_LONGS); - return new SearchIndexJournalEntry(header.entrySize(), dest); - } - - public SearchIndexJournalEntry readEntryUsingBuffer(long[] dest) { - if (dest.length >= header.entrySize()) { - map.read(dest, header.entrySize(), offset + HEADER_SIZE_LONGS); - return new SearchIndexJournalEntry(header.entrySize(), dest); - } - else { - return readEntry(); - } - } - - public long nextId() { - return offset + HEADER_SIZE_LONGS + header.entrySize(); - } - public JournalEntry next() { return new JournalEntry(nextId()); } - - public void copyToBuffer(ByteBuffer buffer) { - var dest = buffer.asLongBuffer(); - dest.position(buffer.position() * 8); - dest.limit(buffer.position()*8 + header.entrySize() + HEADER_SIZE_LONGS); - map.read(dest, offset); - buffer.position(dest.limit()*8); - } - } -} diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/journal/SearchIndexJournalWriter.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/journal/SearchIndexJournalWriter.java deleted file mode 100644 index bad8d4e7..00000000 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/journal/SearchIndexJournalWriter.java +++ /dev/null @@ -1,13 +0,0 @@ -package nu.marginalia.wmsa.edge.index.journal; - -import nu.marginalia.wmsa.edge.index.journal.model.SearchIndexJournalEntry; -import nu.marginalia.wmsa.edge.index.journal.model.SearchIndexJournalEntryHeader; - -public interface SearchIndexJournalWriter { - void put(SearchIndexJournalEntryHeader header, SearchIndexJournalEntry entry); - - void forceWrite(); - - void flushWords(); - -} diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/lexicon/journal/KeywordLexiconJournalFile.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/lexicon/journal/KeywordLexiconJournalFile.java index 80b37191..24ca03b1 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/lexicon/journal/KeywordLexiconJournalFile.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/lexicon/journal/KeywordLexiconJournalFile.java @@ -121,7 +121,6 @@ public class KeywordLexiconJournalFile implements AutoCloseable { journalFileRAF.seek(pos); } - private final ByteBuffer writeBuffer = ByteBuffer.allocateDirect(4096); public void writeEntriesToJournal(List data) { diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/model/EdgePageDocumentFlags.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/model/EdgePageDocumentFlags.java new file mode 100644 index 00000000..0f7a68fa --- /dev/null +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/model/EdgePageDocumentFlags.java @@ -0,0 +1,37 @@ +package nu.marginalia.wmsa.edge.index.model; + +import java.util.EnumSet; + +public enum EdgePageDocumentFlags { + /** Simple processing was done, this document should be de-prioritized as a search result */ + Simple, + + UnusedBit1, + UnusedBit2, + UnusedBit3, + UnusedBit4, + UnusedBit5, + UnusedBit6, + UnusedBit7, + ; + + public int asBit() { + return 1 << ordinal(); + } + + public boolean isPresent(long value) { + return (asBit() & value) > 0; + } + + public static EnumSet decode(long encodedValue) { + EnumSet ret = EnumSet.noneOf(EdgePageDocumentFlags.class); + + for (EdgePageDocumentFlags f : values()) { + if ((encodedValue & f.asBit()) > 0) { + ret.add(f); + } + } + + return 
ret; + } +} diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/model/EdgePageDocumentsMetadata.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/model/EdgePageDocumentsMetadata.java new file mode 100644 index 00000000..4847d9fc --- /dev/null +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/model/EdgePageDocumentsMetadata.java @@ -0,0 +1,117 @@ +package nu.marginalia.wmsa.edge.index.model; + +import nu.marginalia.wmsa.edge.converting.processor.logic.pubdate.PubDate; + +import java.util.EnumSet; +import java.util.Set; + +import static java.lang.Math.max; +import static java.lang.Math.min; + +public record EdgePageDocumentsMetadata(int encSize, + int topology, + int year, + int sets, + int quality, + byte flags) { + + + public static final long ENCSIZE_MASK = 0xFFL; + public static final int ENCSIZE_SHIFT = 48; + public static final int ENCSIZE_MULTIPLIER = 50; + public static final long TOPOLOGY_MASK = 0xFFL; + + public static final int TOPOLOGY_SHIFT = 32; + + public static final long YEAR_MASK = 0xFFL; + public static final int YEAR_SHIFT = 24; + + public static final long SETS_MASK = 0xFL; + public static final int SETS_SHIFT = 16; + + public static final long QUALITY_MASK = 0xFL; + public static final int QUALITY_SHIFT = 8; + + public static long defaultValue() { + return 0L; + } + public EdgePageDocumentsMetadata() { + this(defaultValue()); + } + public EdgePageDocumentsMetadata(int topology, int year, int sets, int quality, EnumSet flags) { + this(0, topology, year, sets, quality, encodeFlags(flags)); + } + + public EdgePageDocumentsMetadata withSize(int size) { + if (size <= 0) { + return this; + } + + final int encSize = (int) Math.min(ENCSIZE_MASK, Math.max(1, size / ENCSIZE_MULTIPLIER)); + + return new EdgePageDocumentsMetadata(encSize, topology, year, sets, quality, flags); + } + + private static byte encodeFlags(Set flags) { + byte ret = 0; + for (var flag : flags) { ret |= flag.asBit(); } + return ret; + } + + public boolean hasFlag(EdgePageDocumentFlags flag) { + return (flags & flag.asBit()) != 0; + } + + public EdgePageDocumentsMetadata(long value) { + this( (int) ((value >>> ENCSIZE_SHIFT) & ENCSIZE_MASK), + (int) ((value >>> TOPOLOGY_SHIFT) & TOPOLOGY_MASK), + (int) ((value >>> YEAR_SHIFT) & YEAR_MASK), + (int) ((value >>> SETS_SHIFT) & SETS_MASK), + (int) ((value >>> QUALITY_SHIFT) & QUALITY_MASK), + (byte) (value & 0xFF) + ); + } + + public static boolean hasFlags(long encoded, long metadataBitMask) { + return (encoded & metadataBitMask) == encoded; + } + + public long encode() { + long ret = 0; + ret |= Byte.toUnsignedLong(flags); + ret |= min(QUALITY_MASK, max(0, quality)) << QUALITY_SHIFT; + ret |= min(SETS_MASK, max(0, sets)) << SETS_SHIFT; + ret |= min(YEAR_MASK, max(0, year)) << YEAR_SHIFT; + ret |= min(TOPOLOGY_MASK, max(0, topology)) << TOPOLOGY_SHIFT; + ret |= min(ENCSIZE_MASK, max(0, encSize)) << ENCSIZE_SHIFT; + + return ret; + } + + public boolean isEmpty() { + return encSize == 0 && topology == 0 && sets == 0 && quality == 0 && year == 0 && flags == 0; + } + + public static int decodeQuality(long encoded) { + return (int) ((encoded >>> QUALITY_SHIFT) & QUALITY_MASK); + } + + public static long decodeTopology(long encoded) { + return (int) ((encoded >>> TOPOLOGY_SHIFT) & TOPOLOGY_MASK); + } + public static int decodeYear(long encoded) { + + return PubDate.fromYearByte((int) ((encoded >>> YEAR_SHIFT) & YEAR_MASK)); + } + + public int size() { + return ENCSIZE_MULTIPLIER * encSize; + } + + public static int 
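Since `EdgePageDocumentsMetadata` packs six fields into one long, a round-trip makes the layout concrete. A sketch assuming the classes added in this patch are on the classpath; the field values are made up:

```java
import java.util.EnumSet;

var meta = new EdgePageDocumentsMetadata(
        /* topology */ 3, /* year */ 22, /* sets */ 1, /* quality */ 5,
        EnumSet.of(EdgePageDocumentFlags.Simple))
        .withSize(250);                     // encSize = 250 / ENCSIZE_MULTIPLIER = 5

long encoded = meta.encode();

// each field travels through its own shift + mask:
assert EdgePageDocumentsMetadata.decodeQuality(encoded) == 5;
assert EdgePageDocumentsMetadata.decodeSize(encoded) == 250;  // 5 * 50
assert EdgePageDocumentFlags.Simple.isPresent(encoded);       // flags sit in the low byte
assert new EdgePageDocumentsMetadata(encoded).year() == 22;   // raw byte; decodeYear() maps it via PubDate
```

Note that `withSize` quantizes to multiples of `ENCSIZE_MULTIPLIER` (50), so `size()` only approximates the original document length.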
decodeSize(long encoded) { + return ENCSIZE_MULTIPLIER * (int) ((encoded >>> ENCSIZE_SHIFT) & ENCSIZE_MASK); + } + + + +} diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/model/EdgePageWordFlags.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/model/EdgePageWordFlags.java index 848cc870..a3f443c5 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/model/EdgePageWordFlags.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/model/EdgePageWordFlags.java @@ -1,14 +1,40 @@ package nu.marginalia.wmsa.edge.index.model; +import nu.marginalia.util.language.processing.KeywordCounter; +import nu.marginalia.util.language.processing.NameCounter; +import nu.marginalia.util.language.processing.SubjectCounter; +import nu.marginalia.wmsa.edge.converting.processor.SiteWords; + import java.util.EnumSet; public enum EdgePageWordFlags { + + /** Word appears in title */ Title, + + /** Word appears to be the subject in several sentences + * @see SubjectCounter */ Subjects, + + /** Word has high tf-idf + * @see KeywordCounter */ + TfIdfHigh, + + /** Word is a likely named object. This is a weaker version of Subjects. + * @see NameCounter */ NamesWords, + + Synthetic, + + /** Word is important to site + * @see SiteWords + */ Site, - SiteAdjacent, - Simple; + + /** Word is important to adjacent documents + * @see SiteWords + * */ + SiteAdjacent; public int asBit() { return 1 << ordinal(); diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/model/EdgePageWordMetadata.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/model/EdgePageWordMetadata.java index 84a907f0..579dda92 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/model/EdgePageWordMetadata.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/model/EdgePageWordMetadata.java @@ -3,83 +3,94 @@ package nu.marginalia.wmsa.edge.index.model; import nu.marginalia.util.BrailleBlockPunchCards; import java.util.EnumSet; +import java.util.Set; import static java.lang.Math.max; import static java.lang.Math.min; public record EdgePageWordMetadata(int tfIdf, int positions, - int quality, int count, - EnumSet flags) { - - // If flags are moved from the least significant end of - // this struct, then EntrySourceFromBTree will break. 
+ byte flags) { public static final long COUNT_MASK = 0xFL; public static final int COUNT_SHIFT = 8; - public static final long QUALITY_MASK = 0xFL; - public static final int QUALITY_SHIFT = 12; - public static final long TF_IDF_MASK = 0xFFFFL; public static final int TF_IDF_SHIFT = 16; public static final int POSITIONS_SHIFT = 32; + public EdgePageWordMetadata() { + this(emptyValue()); + } + public EdgePageWordMetadata(long value) { this( (int)((value >>> TF_IDF_SHIFT) & TF_IDF_MASK), (int)(value >>> POSITIONS_SHIFT), - (int)((value >>> QUALITY_SHIFT) & QUALITY_MASK), (int)((value >>> COUNT_SHIFT) & COUNT_MASK), - EdgePageWordFlags.decode(value) + (byte) (value & 0xFF) ); } - public static int decodeQuality(long encoded) { - return (int)((encoded >>> QUALITY_SHIFT) & QUALITY_MASK); + public EdgePageWordMetadata(int tfIdf, + int positions, + int count, + Set flags) + { + this(tfIdf, positions, count, encodeFlags(flags)); + } + + private static byte encodeFlags(Set flags) { + byte ret = 0; + for (var flag : flags) { ret |= flag.asBit(); } + return ret; } public static boolean hasFlags(long encoded, long metadataBitMask) { - return (encoded & metadataBitMask) == encoded; + return (encoded & metadataBitMask) == metadataBitMask; + } + public static boolean hasAnyFlags(long encoded, long metadataBitMask) { + return (encoded & metadataBitMask) != 0; + } + public static int decodePositions(long meta) { + return (int) (meta >>> POSITIONS_SHIFT); + } + + public static double decodeTfidf(long meta) { + return (meta >>> TF_IDF_SHIFT) & TF_IDF_MASK; + } + + public boolean hasFlag(EdgePageWordFlags flag) { + return (flags & flag.asBit()) != 0; } public String toString() { StringBuilder sb = new StringBuilder(getClass().getSimpleName()); sb.append('[') .append("tfidf=").append(tfIdf).append(", ") - .append("quality=").append(quality).append(", ") .append("count=").append(count).append(", ") .append("positions=[").append(BrailleBlockPunchCards.printBits(positions, 32)).append(']'); sb.append(", flags=").append(flags).append(']'); return sb.toString(); } - /* Encoded in a 64 bit long as - 0-8 flags - 8-12 count, - 12-16 quality, - 16-32 tf-idf [0, 65536] - 32-64 position mask + /* Encoded in a 64 bit long */ public long encode() { long ret = 0; - for (var flag : flags) { - ret |= flag.asBit(); - } - + ret |= Byte.toUnsignedLong(flags); ret |= min(TF_IDF_MASK, max(0, tfIdf)) << TF_IDF_SHIFT; ret |= min(COUNT_MASK, max(0, count)) << COUNT_SHIFT; - ret |= min(QUALITY_MASK, max(0, quality)) << QUALITY_SHIFT; ret |= ((long)(positions)) << POSITIONS_SHIFT; return ret; } public boolean isEmpty() { - return count == 0 && positions == 0 && flags.isEmpty() && tfIdf == 0; + return count == 0 && positions == 0 && flags == 0 && tfIdf == 0; } public static long emptyValue() { @@ -87,4 +98,7 @@ public record EdgePageWordMetadata(int tfIdf, } + public EnumSet flagSet() { + return EdgePageWordFlags.decode(flags); + } } diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/model/IndexBlock.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/model/IndexBlock.java deleted file mode 100644 index 108b4be2..00000000 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/model/IndexBlock.java +++ /dev/null @@ -1,36 +0,0 @@ -package nu.marginalia.wmsa.edge.index.model; - -public enum IndexBlock { - Title(IndexBlockType.PAGE_DATA), - Meta(IndexBlockType.PAGE_DATA), - - Words_1(IndexBlockType.PAGE_DATA), - Words_2(IndexBlockType.PAGE_DATA), - Words_4(IndexBlockType.PAGE_DATA), - 
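Two behavioural points in this rewrite are easy to miss: flags are now stored as a raw byte rather than an `EnumSet`, and `hasFlags` changes from `(encoded & mask) == encoded` (the encoded value is a subset of the mask) to `(encoded & mask) == mask` (every masked bit is present), with `hasAnyFlags` covering the weaker test. A sketch, assuming the patched classes; the values are made up:

```java
import java.util.EnumSet;

var meta = new EdgePageWordMetadata(
        /* tfIdf */ 120, /* positions */ 0b1011, /* count */ 3,
        EnumSet.of(EdgePageWordFlags.Title, EdgePageWordFlags.Subjects));

long encoded = meta.encode();

long titleAndSubjects = EdgePageWordFlags.Title.asBit() | EdgePageWordFlags.Subjects.asBit();
long titleAndSite     = EdgePageWordFlags.Title.asBit() | EdgePageWordFlags.Site.asBit();

assert EdgePageWordMetadata.hasFlags(encoded, titleAndSubjects);   // both bits set
assert !EdgePageWordMetadata.hasFlags(encoded, titleAndSite);      // Site is missing
assert EdgePageWordMetadata.hasAnyFlags(encoded, titleAndSite);    // Title alone suffices

assert EdgePageWordMetadata.decodePositions(encoded) == 0b1011;
assert meta.flagSet().contains(EdgePageWordFlags.Title);           // EnumSet view on demand
```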
Words_8(IndexBlockType.PAGE_DATA), - Words_16Plus(IndexBlockType.PAGE_DATA), - - Link(IndexBlockType.QUALITY_SIGNAL), - Site(IndexBlockType.QUALITY_SIGNAL), - - Artifacts(IndexBlockType.PAGE_DATA), - - Tfidf_High(IndexBlockType.TRANSIENT), - Subjects(IndexBlockType.TRANSIENT) - ; - - public final IndexBlockType type; - - IndexBlock(IndexBlockType type) { - this.type = type; - } - - // This is kind of a hot method, and Enum.values() allocates a new - // array each call. - private static final IndexBlock[] values = IndexBlock.values(); - public static IndexBlock byId(int id) { - return values[id]; - } -} - - diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/model/IndexBlockType.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/model/IndexBlockType.java deleted file mode 100644 index 9ee6fc49..00000000 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/model/IndexBlockType.java +++ /dev/null @@ -1,10 +0,0 @@ -package nu.marginalia.wmsa.edge.index.model; - -public enum IndexBlockType { - /** This block is only used for joins */ - QUALITY_SIGNAL, - /** This block contains page keywords */ - PAGE_DATA, - /** This block is only used for generation */ - TRANSIENT -} diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/model/QueryStrategy.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/model/QueryStrategy.java new file mode 100644 index 00000000..7f3632c1 --- /dev/null +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/model/QueryStrategy.java @@ -0,0 +1,7 @@ +package nu.marginalia.wmsa.edge.index.model; + +public enum QueryStrategy { + SENTENCE, + TOPIC, + AUTO +} diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/postings/EdgeIndexQuerySearchTerms.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/postings/EdgeIndexQuerySearchTerms.java new file mode 100644 index 00000000..df4ff2c7 --- /dev/null +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/postings/EdgeIndexQuerySearchTerms.java @@ -0,0 +1,25 @@ +package nu.marginalia.wmsa.edge.index.postings; + +import it.unimi.dsi.fastutil.ints.IntArrayList; +import it.unimi.dsi.fastutil.ints.IntComparator; +import it.unimi.dsi.fastutil.ints.IntList; +import it.unimi.dsi.fastutil.ints.IntOpenHashSet; + +public record EdgeIndexQuerySearchTerms(IntList includes, IntList excludes, IntList priority) { + public EdgeIndexQuerySearchTerms() { + this(IntList.of(), IntList.of(), IntList.of()); + } + + public boolean isEmpty() { + return includes.isEmpty(); + } + + public int[] sortedDistinctIncludes(IntComparator comparator) { + if (includes.isEmpty()) + return includes.toIntArray(); + + IntList list = new IntArrayList(new IntOpenHashSet(includes)); + list.sort(comparator); + return list.toIntArray(); + } +} diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/postings/IndexMetadataService.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/postings/IndexMetadataService.java new file mode 100644 index 00000000..6a359d0b --- /dev/null +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/postings/IndexMetadataService.java @@ -0,0 +1,22 @@ +package nu.marginalia.wmsa.edge.index.postings; + +public class IndexMetadataService { + private final SearchIndexControl indexes; + + public IndexMetadataService(SearchIndexControl indexes) { + this.indexes = indexes; + } + + public long getDocumentMetadata(long urlId) { + return indexes.getIndex().getDocumentMetadata(urlId); + } + + public int 
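`sortedDistinctIncludes` is the only non-trivial part of `EdgeIndexQuerySearchTerms`: it deduplicates through an `IntOpenHashSet` before sorting, so repeated terms in a query don't trigger repeated index lookups. A small usage sketch with made-up term ids:

```java
import it.unimi.dsi.fastutil.ints.IntList;

var terms = new EdgeIndexQuerySearchTerms(
        IntList.of(7, 3, 7, 11),   // includes: 7 appears twice
        IntList.of(),              // excludes
        IntList.of());             // priority

// Deduplicates, then sorts by the supplied comparator; SearchIndex passes a
// comparator ordering by ascending document frequency (numHits).
int[] ordered = terms.sortedDistinctIncludes(Integer::compare);  // -> { 3, 7, 11 }
```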
getDomainId(long urlId) { + return indexes.getIndex().getDomainId(urlId); + } + + public long[] getTermMetadata(int termId, long[] docIdsAll) { + return indexes.getIndex().getTermMetadata(termId, docIdsAll); + } + +} diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/postings/IndexResultValuator.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/postings/IndexResultValuator.java new file mode 100644 index 00000000..462e4c7a --- /dev/null +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/postings/IndexResultValuator.java @@ -0,0 +1,193 @@ +package nu.marginalia.wmsa.edge.index.postings; + +import gnu.trove.list.TLongList; +import gnu.trove.map.hash.TObjectIntHashMap; +import gnu.trove.set.hash.TLongHashSet; +import it.unimi.dsi.fastutil.ints.IntArrayList; +import it.unimi.dsi.fastutil.longs.Long2LongOpenHashMap; +import nu.marginalia.wmsa.edge.index.model.EdgePageWordFlags; +import nu.marginalia.wmsa.edge.index.model.EdgePageWordMetadata; +import nu.marginalia.wmsa.edge.model.search.EdgeSearchResultItem; +import nu.marginalia.wmsa.edge.model.search.EdgeSearchResultKeywordScore; +import nu.marginalia.wmsa.edge.model.search.EdgeSearchSubquery; + +import java.util.List; +import java.util.Objects; + +public class IndexResultValuator { + private final IndexMetadataService metadataService; + private final List> searchTermVariants; + private final int[] termIdsAll; + + private final TLongHashSet resultsWithPriorityTerms; + + private final TObjectIntHashMap termToId = new TObjectIntHashMap<>(10, 0.75f, -1); + private final TermMetadata termMetadata; + + public IndexResultValuator(SearchIndexControl indexes, TLongList results, List subqueries) { + this.metadataService = new IndexMetadataService(indexes); + this.searchTermVariants = subqueries.stream().map(sq -> sq.searchTermsInclude).distinct().toList(); + + var lexiconReader = Objects.requireNonNull(indexes.getLexiconReader()); + IntArrayList termIdsList = new IntArrayList(); + + searchTermVariants.stream().flatMap(List::stream).distinct().forEach(term -> { + int id = lexiconReader.get(term); + + if (id >= 0) { + termIdsList.add(id); + termToId.put(term, id); + } + }); + + final long[] resultsArray = results.toArray(); + + termIdsAll = termIdsList.toArray(new int[0]); + termMetadata = new TermMetadata(resultsArray, termIdsAll); + + int[] priorityTermIds = + subqueries.stream() + .flatMap(sq -> sq.searchTermsPriority.stream()) + .distinct() + .mapToInt(lexiconReader::get) + .filter(id -> id >= 0) + .toArray(); + + resultsWithPriorityTerms = new TLongHashSet(results.size()); + for (int priorityTerm : priorityTermIds) { + long[] metadata = metadataService.getTermMetadata(priorityTerm, resultsArray); + for (int i = 0; i < metadata.length; i++) { + if (metadata[i] != 0) resultsWithPriorityTerms.add(resultsArray[i]); + } + } + + + } + + public EdgeSearchResultItem evaluateResult(long id) { + + EdgeSearchResultItem searchResult = new EdgeSearchResultItem(id); + final long urlIdInt = searchResult.getUrlIdInt(); + + searchResult.setDomainId(metadataService.getDomainId(urlIdInt)); + + long docMetadata = metadataService.getDocumentMetadata(urlIdInt); + + double bestScore = 0; + for (int querySetId = 0; querySetId < searchTermVariants.size(); querySetId++) { + bestScore = Math.min(bestScore, + evaluateSubquery(searchResult, + docMetadata, + querySetId, + searchTermVariants.get(querySetId)) + ); + } + + if (resultsWithPriorityTerms.contains(id)) { + bestScore -= 50; + } + + searchResult.setScore(bestScore); + + 
return searchResult; + } + + private double evaluateSubquery(EdgeSearchResultItem searchResult, + long docMetadata, + int querySetId, + List termList) + { + double setScore = 0; + int setSize = 0; + + for (int termIdx = 0; termIdx < termList.size(); termIdx++) { + String searchTerm = termList.get(termIdx); + + final int termId = termToId.get(searchTerm); + + long metadata = termMetadata.getTermMetadata(termId, searchResult.getUrlIdInt()); + + EdgeSearchResultKeywordScore score = new EdgeSearchResultKeywordScore( + querySetId, + searchTerm, + metadata, + docMetadata, + resultsWithPriorityTerms.contains(searchResult.combinedId) + ); + searchResult.scores.add(score); + + setScore += score.termValue(); + + if (termIdx == 0) { + setScore += score.documentValue(); + } + + setSize++; + } + + setScore += calculateTermCoherencePenalty(searchResult.getUrlIdInt(), termToId, termList); + + return setScore/setSize; + } + + private double calculateTermCoherencePenalty(int urlId, TObjectIntHashMap termToId, List termList) { + long maskDirect = ~0; + long maskAdjacent = ~0; + + final int flagBitMask = EdgePageWordFlags.Title.asBit() + | EdgePageWordFlags.Subjects.asBit() + | EdgePageWordFlags.Synthetic.asBit(); + + for (String term : termList) { + var meta = termMetadata.getTermMetadata(termToId.get(term), urlId); + long positions; + + if (meta == 0) { + return 1000; + } + + positions = EdgePageWordMetadata.decodePositions(meta); + + if (!EdgePageWordMetadata.hasAnyFlags(meta, flagBitMask)) { + maskDirect &= positions; + maskAdjacent &= (positions | (positions << 1) | (positions >>> 1)); + } + } + + if (maskAdjacent == 0) { + return 40; + } + + if (maskDirect == 0) { + return 20; + } + + return Long.numberOfTrailingZeros(maskDirect)/5. - Long.bitCount(maskDirect); + } + + + class TermMetadata { + private final Long2LongOpenHashMap termdocToMeta; + + public TermMetadata(long[] docIdsAll, int[] termIdsList) { + termdocToMeta = new Long2LongOpenHashMap(docIdsAll.length * termIdsAll.length, 0.5f); + + for (int term : termIdsList) { + var metadata = metadataService.getTermMetadata(term, docIdsAll); + for (int i = 0; i < docIdsAll.length; i++) { + termdocToMeta.put(termdocKey(term, docIdsAll[i]), metadata[i]); + } + } + + } + + public long getTermMetadata(int termId, long docId) { + return termdocToMeta.getOrDefault(termdocKey(termId, docId), 0); + } + } + + private long termdocKey(int termId, long docId) { + return (docId << 32) | termId; + } + +} diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/postings/SearchIndex.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/postings/SearchIndex.java new file mode 100644 index 00000000..6d70fab6 --- /dev/null +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/postings/SearchIndex.java @@ -0,0 +1,150 @@ +package nu.marginalia.wmsa.edge.index.postings; + +import nu.marginalia.wmsa.edge.index.EdgeIndexControl; +import nu.marginalia.wmsa.edge.index.IndexServicesFactory; +import nu.marginalia.wmsa.edge.index.query.IndexQuery; +import nu.marginalia.wmsa.edge.index.query.IndexQueryParams; +import nu.marginalia.wmsa.edge.index.query.IndexResultDomainDeduplicator; +import nu.marginalia.wmsa.edge.index.query.filter.QueryFilterStepFromPredicate; +import org.jetbrains.annotations.NotNull; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.io.IOException; +import java.util.Collections; +import java.util.concurrent.locks.Lock; +import java.util.concurrent.locks.ReadWriteLock; +import 
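The coherence penalty above works on 32-bit position masks: AND-ing the raw masks checks whether all terms ever share a position, and AND-ing the one-bit "smeared" masks checks whether they at least occur adjacently. A standalone sketch with two hypothetical terms:

```java
public class CoherenceMaskDemo {
    public static void main(String[] args) {
        int posA = 0b0010100;   // term A occurs at positions 2 and 4
        int posB = 0b0101000;   // term B occurs at positions 3 and 5

        long maskDirect = (long) (posA & posB);   // 0: never in the same position

        long smearA = posA | (posA << 1) | (posA >>> 1);
        long smearB = posB | (posB << 1) | (posB >>> 1);
        long maskAdjacent = smearA & smearB;      // non-zero: the terms are neighbours

        // In IndexResultValuator: maskAdjacent == 0 costs 40, maskDirect == 0 costs 20,
        // otherwise numberOfTrailingZeros(maskDirect)/5.0 - bitCount(maskDirect)
        // rewards early and frequent direct co-occurrence.
        System.out.println(maskDirect + " " + maskAdjacent);   // prints: 0 60
    }
}
```

Note the valuator excludes terms carrying Title, Subjects or Synthetic flags from the mask intersection, and treats any term with missing metadata as an immediate 1000-point penalty.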
java.util.concurrent.locks.ReentrantReadWriteLock; +import java.util.function.LongPredicate; + +public class SearchIndex { + + private final Logger logger = LoggerFactory.getLogger(getClass()); + + private volatile SearchIndexReader indexReader; + + private final ReadWriteLock indexReplacementLock = new ReentrantReadWriteLock(); + + @NotNull + private final IndexServicesFactory servicesFactory; + private final EdgeIndexControl indexControl; + + public SearchIndex(@NotNull IndexServicesFactory servicesFactory, EdgeIndexControl indexControl) { + this.servicesFactory = servicesFactory; + this.indexControl = indexControl; + } + + public void init() { + Lock lock = indexReplacementLock.writeLock(); + + try { + lock.lock(); + logger.info("Initializing bucket"); + + if (indexReader == null) { + indexReader = servicesFactory.getSearchIndexReader(); + } + } + catch (Exception ex) { + logger.error("Uncaught exception", ex); + } + finally { + lock.unlock(); + } + } + + public boolean switchIndex() throws IOException { + + indexControl.regenerateIndex(); + + Lock lock = indexReplacementLock.writeLock(); + try { + lock.lock(); + + indexControl.switchIndexFiles(); + + indexReader = servicesFactory.getSearchIndexReader(); + } + catch (Exception ex) { + logger.error("Uncaught exception", ex); + } + finally { + lock.unlock(); + } + + return true; + } + + + public boolean isAvailable() { + return indexReader != null; + } + + public IndexQuery getQuery(EdgeIndexQuerySearchTerms terms, IndexQueryParams params, LongPredicate includePred) { + + if (null == indexReader) { + logger.warn("Index reader not ready"); + return new IndexQuery(Collections.emptyList()); + } + + final int[] orderedIncludes = terms.sortedDistinctIncludes(this::compareKeywords); + + SearchIndexReader.IndexQueryBuilder query = + switch(params.queryStrategy()) { + case SENTENCE -> indexReader.findWordAsSentence(orderedIncludes); + case TOPIC -> indexReader.findWordAsTopic(orderedIncludes); + case AUTO -> indexReader.findWordTopicDynamicMode(orderedIncludes); + }; + + if (query == null) { + return new IndexQuery(Collections.emptyList()); + } + + query.addInclusionFilter(new QueryFilterStepFromPredicate(includePred)); + + for (int i = 0; i < orderedIncludes.length; i++) { + query = query.also(orderedIncludes[i]); + } + + for (int term : terms.excludes()) { + query = query.not(term); + } + + // Run these last, as they'll worst-case cause as many page faults as there are + // items in the buffer + query.addInclusionFilter(indexReader.filterForParams(params)); + + return query.build(); + } + + private int compareKeywords(int a, int b) { + return Long.compare( + indexReader.numHits(a), + indexReader.numHits(b) + ); + } + + + public IndexQuery getDomainQuery(int wordId, IndexResultDomainDeduplicator localFilter) { + throw new UnsupportedOperationException(""); // TBI + /* + var query = indexReader.findDomain(wordId); + + query.addInclusionFilter(new QueryFilterStepFromPredicate(localFilter::filterRawValue)); + + return query;*/ + } + + /** Replaces the values of ids with their associated metadata, or 0L if absent */ + public long[] getTermMetadata(int termId, long[] docs) { + return indexReader.getMetadata(termId, docs); + } + + public long getDocumentMetadata(long docId) { + return indexReader.getDocumentMetadata(docId); + } + + public int getDomainId(long docId) { + return indexReader.getDomainId(docId); + } +} diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/postings/SearchIndexControl.java 
b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/postings/SearchIndexControl.java new file mode 100644 index 00000000..a1475af3 --- /dev/null +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/postings/SearchIndexControl.java @@ -0,0 +1,75 @@ +package nu.marginalia.wmsa.edge.index.postings; + +import com.google.inject.Inject; +import com.google.inject.Singleton; +import nu.marginalia.wmsa.configuration.server.Initialization; +import nu.marginalia.wmsa.edge.index.IndexServicesFactory; +import nu.marginalia.wmsa.edge.index.lexicon.KeywordLexiconReadOnlyView; +import nu.marginalia.wmsa.edge.index.postings.journal.writer.SearchIndexJournalWriterImpl; +import nu.marginalia.wmsa.edge.index.svc.EdgeOpsLockService; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import javax.annotation.Nullable; + +@Singleton +public class SearchIndexControl { + private final Logger logger = LoggerFactory.getLogger(getClass()); + + private final IndexServicesFactory servicesFactory; + private final SearchIndexJournalWriterImpl primaryIndexWriter; + private final SearchIndexJournalWriterImpl secondaryIndexWriter; + private volatile KeywordLexiconReadOnlyView keywordLexiconReadOnlyView; + + private final SearchIndex index; + private final EdgeOpsLockService opsLockService; + + @Inject + public SearchIndexControl(IndexServicesFactory servicesFactory, + EdgeOpsLockService opsLockService) { + this.servicesFactory = servicesFactory; + + this.primaryIndexWriter = servicesFactory.getIndexWriter(0); + this.secondaryIndexWriter = servicesFactory.getIndexWriter(1); + + index = servicesFactory.createIndexBucket(); + this.opsLockService = opsLockService; + } + + public boolean reindex() throws Exception { + return opsLockService.run(index::switchIndex).isPresent(); + } + + public boolean isBusy() { + return opsLockService.isLocked(); + } + + @Nullable + public KeywordLexiconReadOnlyView getLexiconReader() { + return keywordLexiconReadOnlyView; + } + + public void initialize(Initialization init) { + + logger.info("Waiting for init"); + init.waitReady(); + + if (!opsLockService.run(index::init)) throw new IllegalStateException("Failed to initialize " + getClass().getSimpleName()); + keywordLexiconReadOnlyView = servicesFactory.getDictionaryReader(); + } + + public SearchIndexJournalWriterImpl getIndexWriter(int idx) { + if (idx == 0) { + return primaryIndexWriter; + } + else { + return secondaryIndexWriter; + } + } + + public SearchIndex getIndex() { + return index; + } + + +} diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/postings/SearchIndexReader.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/postings/SearchIndexReader.java new file mode 100644 index 00000000..f19063ca --- /dev/null +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/postings/SearchIndexReader.java @@ -0,0 +1,121 @@ +package nu.marginalia.wmsa.edge.index.postings; + +import lombok.SneakyThrows; +import nu.marginalia.wmsa.edge.index.postings.forward.ForwardIndexReader; +import nu.marginalia.wmsa.edge.index.postings.forward.ParamMatchingQueryFilter; +import nu.marginalia.wmsa.edge.index.postings.reverse.ReverseIndexPrioReader; +import nu.marginalia.wmsa.edge.index.postings.reverse.ReverseIndexReader; +import nu.marginalia.wmsa.edge.index.postings.reverse.query.ReverseIndexEntrySourceBehavior; +import nu.marginalia.wmsa.edge.index.query.EntrySource; +import nu.marginalia.wmsa.edge.index.query.IndexQuery; +import nu.marginalia.wmsa.edge.index.query.IndexQueryParams; 
+import nu.marginalia.wmsa.edge.index.query.filter.QueryFilterStepIf; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.util.ArrayList; +import java.util.List; + +public class SearchIndexReader { + + private final Logger logger = LoggerFactory.getLogger(getClass()); + private final ForwardIndexReader forwardIndexReader; + private final ReverseIndexReader reverseIndexReader; + private final ReverseIndexPrioReader reverseIndexPrioReader; + + public SearchIndexReader(ForwardIndexReader forwardIndexReader, + ReverseIndexReader reverseIndexReader, + ReverseIndexPrioReader reverseIndexPrioReader) { + this.forwardIndexReader = forwardIndexReader; + this.reverseIndexReader = reverseIndexReader; + this.reverseIndexPrioReader = reverseIndexPrioReader; + } + + public IndexQueryBuilder findWordAsSentence(int[] wordIdsByFrequency) { + List entrySources = new ArrayList<>(1); + + entrySources.add(reverseIndexReader.documents(wordIdsByFrequency[0], ReverseIndexEntrySourceBehavior.DO_PREFER)); + + return new IndexQueryBuilder(new IndexQuery(entrySources)); + } + + public IndexQueryBuilder findWordAsTopic(int[] wordIdsByFrequency) { + List entrySources = new ArrayList<>(wordIdsByFrequency.length); + + for (int wordId : wordIdsByFrequency) { + entrySources.add(reverseIndexPrioReader.priorityDocuments(wordId)); + } + + return new IndexQueryBuilder(new IndexQuery(entrySources)); + } + + public IndexQueryBuilder findWordTopicDynamicMode(int[] wordIdsByFrequency) { + if (wordIdsByFrequency.length > 3) { + return findWordAsSentence(wordIdsByFrequency); + } + + List entrySources = new ArrayList<>(wordIdsByFrequency.length + 1); + + for (int wordId : wordIdsByFrequency) { + entrySources.add(reverseIndexPrioReader.priorityDocuments(wordId)); + } + + entrySources.add(reverseIndexReader.documents(wordIdsByFrequency[0], ReverseIndexEntrySourceBehavior.DO_NOT_PREFER)); + + return new IndexQueryBuilder(new IndexQuery(entrySources)); + } + + QueryFilterStepIf filterForParams(IndexQueryParams params) { + return new ParamMatchingQueryFilter(params, forwardIndexReader); + } + @SneakyThrows + public long numHits(int word) { + return reverseIndexReader.numDocuments(word); + } + + public long[] getMetadata(int wordId, long[] docIds) { + return reverseIndexReader.getTermMeta(wordId, docIds); + } + + public long getDocumentMetadata(long docId) { + return forwardIndexReader.getDocMeta(docId); + } + + public int getDomainId(long docId) { + return forwardIndexReader.getDomainId(docId); + } + + public class IndexQueryBuilder { + private final IndexQuery query; + + IndexQueryBuilder(IndexQuery query) { + this.query = query; + } + + public IndexQueryBuilder also(int termId) { + + query.addInclusionFilter(reverseIndexReader.also(termId)); + + return this; + } + + public IndexQueryBuilder not(int termId) { + + query.addInclusionFilter(reverseIndexReader.not(termId)); + + return this; + } + + public IndexQueryBuilder addInclusionFilter(QueryFilterStepIf filterStep) { + + query.addInclusionFilter(filterStep); + + return this; + } + + public IndexQuery build() { + return query; + } + + } +} diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/postings/forward/ForwardIndexConverter.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/postings/forward/ForwardIndexConverter.java new file mode 100644 index 00000000..5edb4fea --- /dev/null +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/postings/forward/ForwardIndexConverter.java @@ -0,0 +1,109 @@ +package 
nu.marginalia.wmsa.edge.index.postings.forward; + +import it.unimi.dsi.fastutil.longs.Long2IntOpenHashMap; +import nu.marginalia.util.array.LongArray; +import nu.marginalia.wmsa.edge.index.postings.journal.reader.SearchIndexJournalReader; +import nu.marginalia.wmsa.edge.index.postings.journal.reader.SearchIndexJournalReaderSingleFile; +import org.roaringbitmap.IntConsumer; +import org.roaringbitmap.RoaringBitmap; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.io.File; +import java.io.IOException; +import java.nio.file.Files; +import java.nio.file.Path; + +import static nu.marginalia.wmsa.edge.index.postings.forward.ForwardIndexParameters.*; + +public class ForwardIndexConverter { + private static final int RWF_BIN_SIZE = 10_000_000; + + private final Path tmpFileDir; + private final File inputFile; + + private final Logger logger = LoggerFactory.getLogger(getClass()); + + private final Path outputFileDocsId; + private final Path outputFileDocsData; + + + public ForwardIndexConverter(Path tmpFileDir, + File inputFile, + Path outputFileDocsId, + Path outputFileDocsData + ) { + this.tmpFileDir = tmpFileDir; + this.inputFile = inputFile; + this.outputFileDocsId = outputFileDocsId; + this.outputFileDocsData = outputFileDocsData; + } + + public void convert() throws IOException { + deleteOldFiles(); + + SearchIndexJournalReaderSingleFile journalReader = new SearchIndexJournalReaderSingleFile(LongArray.mmapRead(inputFile.toPath())); + if (journalReader.fileHeader.fileSize() <= SearchIndexJournalReader.FILE_HEADER_SIZE_BYTES) { + return; + } + + logger.info("Converting {} {}",inputFile, journalReader.fileHeader); + + final Path intermediateDocsFile = Files.createTempFile(tmpFileDir, "words-sorted", ".dat"); + + try { + LongArray docsFileId = getDocIds(outputFileDocsId, journalReader); + + // doc ids -> sorted list of ids + + logger.info("Gathering Offsets"); + Long2IntOpenHashMap docIdToIdx = new Long2IntOpenHashMap((int) docsFileId.size()); + docsFileId.forEach(0, docsFileId.size(), (pos, val) -> docIdToIdx.put(val, (int) pos)); + + // docIdToIdx -> file offset for id + + logger.info("Creating Supplementary Indexes"); + + LongArray docFileData = LongArray.mmapForWriting(outputFileDocsData, ENTRY_SIZE * docsFileId.size()); + + journalReader.forEach(entry -> { + long entryOffset = (long) ENTRY_SIZE * docIdToIdx.get(entry.urlId()); + + docFileData.set(entryOffset + METADATA_OFFSET, entry.docMeta()); + docFileData.set(entryOffset + DOMAIN_OFFSET, entry.domainId()); + }); + + docFileData.force(); + + + } catch (IOException ex) { + logger.error("Failed to convert", ex); + throw ex; + } + finally { + Files.deleteIfExists(intermediateDocsFile); + } + } + + private LongArray getDocIds(Path outputFileDocs, SearchIndexJournalReader journalReader) throws IOException { + RoaringBitmap rbm = new RoaringBitmap(); + journalReader.forEachUrlId(rbm::add); + + LongArray ret = LongArray.mmapForWriting(outputFileDocs, rbm.getCardinality()); + rbm.forEach(new IntConsumer() { + int offset; + @Override + public void accept(int value) { + ret.set(offset++, value); + } + }); + return ret; + } + + private void deleteOldFiles() throws IOException { + Files.deleteIfExists(outputFileDocsId); + Files.deleteIfExists(outputFileDocsData); + } + +} + diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/postings/forward/ForwardIndexParameters.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/postings/forward/ForwardIndexParameters.java new file mode 100644 index 
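The converter's id pass is a two-step trick: a `RoaringBitmap` collapses duplicate url ids and iterates them in ascending order, and the inverted `Long2IntOpenHashMap` then gives each document a fixed row in the data file. The same steps on toy data (real dependencies, made-up ids):

```java
import it.unimi.dsi.fastutil.longs.Long2IntOpenHashMap;
import org.roaringbitmap.RoaringBitmap;

public class DocIdPassDemo {
    public static void main(String[] args) {
        RoaringBitmap rbm = new RoaringBitmap();
        rbm.add(17); rbm.add(4); rbm.add(17); rbm.add(9);   // duplicate 17 collapses

        int[] sortedIds = rbm.toArray();                    // { 4, 9, 17 }, ascending

        Long2IntOpenHashMap idToIdx = new Long2IntOpenHashMap(sortedIds.length);
        for (int i = 0; i < sortedIds.length; i++)
            idToIdx.put(sortedIds[i], i);                   // id -> row in the docs data file

        // ForwardIndexConverter does the same against LongArray-backed files,
        // then writes ENTRY_SIZE longs per row at offset ENTRY_SIZE * idToIdx.get(id).
    }
}
```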
00000000..f019a40b --- /dev/null +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/postings/forward/ForwardIndexParameters.java @@ -0,0 +1,9 @@ +package nu.marginalia.wmsa.edge.index.postings.forward; + +class ForwardIndexParameters { + public static final int ENTRY_SIZE = 2; + + public static final int DOMAIN_OFFSET = 0; + public static final int METADATA_OFFSET = 1; + +} diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/postings/forward/ForwardIndexReader.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/postings/forward/ForwardIndexReader.java new file mode 100644 index 00000000..7b4c66ff --- /dev/null +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/postings/forward/ForwardIndexReader.java @@ -0,0 +1,92 @@ +package nu.marginalia.wmsa.edge.index.postings.forward; + +import com.upserve.uppend.blobs.NativeIO; +import gnu.trove.map.hash.TLongIntHashMap; +import nu.marginalia.util.array.LongArray; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.io.IOException; +import java.nio.file.Files; +import java.nio.file.Path; + +import static nu.marginalia.wmsa.edge.index.postings.forward.ForwardIndexParameters.*; + +public class ForwardIndexReader { + private final TLongIntHashMap ids; + private final LongArray data; + + private final Logger logger = LoggerFactory.getLogger(getClass()); + + public ForwardIndexReader(Path idsFile, Path dataFile) throws IOException { + if (!Files.exists(dataFile) || + !Files.exists(idsFile) + ) { + ids = null; + data = null; + return; + } + + logger.info("Switching forward index"); + + var idsArray = LongArray.mmapRead(idsFile); + idsArray.advice(NativeIO.Advice.Sequential); + + ids = new TLongIntHashMap((int) idsArray.size(), 0.5f, -1, -1); + + // This hash table should be of the same size as the number of documents, so typically less than 1 Gb + idsArray.forEach(0, idsArray.size(), (pos, val) -> { + ids.put(val, (int) pos); + }); + + data = LongArray.mmapRead(dataFile); + + + data.advice(NativeIO.Advice.Random); + } + + private int idxForDoc(long docId) { + return ids.get(docId); + } + + public long getDocMeta(long docId) { + long offset = idxForDoc(docId); + if (offset < 0) return 0; + + return data.get(ENTRY_SIZE * offset + METADATA_OFFSET); + } + public int getDomainId(long docId) { + long offset = idxForDoc(docId); + if (offset < 0) return 0; + + return Math.max(0, (int) data.get(ENTRY_SIZE * offset + DOMAIN_OFFSET)); + } + + public DocPost docPost(long docId) { + return new DocPost(idxForDoc(docId)); + } + + + public class DocPost { + private final long idx; + + public DocPost(int idx) { + this.idx = idx; + } + + public long meta() { + + if (idx < 0) + return 0; + + return data.get(ENTRY_SIZE * idx + METADATA_OFFSET); + } + + public int domainId() { + if (idx < 0) + return 0; + + return Math.max(0, (int) data.get(ENTRY_SIZE * idx + DOMAIN_OFFSET)); + } + } +} diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/postings/forward/ParamMatchingQueryFilter.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/postings/forward/ParamMatchingQueryFilter.java new file mode 100644 index 00000000..a3c30bab --- /dev/null +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/postings/forward/ParamMatchingQueryFilter.java @@ -0,0 +1,81 @@ +package nu.marginalia.wmsa.edge.index.postings.forward; + +import nu.marginalia.wmsa.edge.index.model.EdgePageDocumentsMetadata; +import nu.marginalia.wmsa.edge.index.query.IndexQueryParams; +import 
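With `ENTRY_SIZE = 2`, the forward index data file is just a flat array of `(domainId, metadata)` pairs addressed by row. A standalone sketch of the addressing used by `ForwardIndexReader`; the stored values are made up:

```java
public class ForwardLayoutDemo {
    static final int ENTRY_SIZE = 2, DOMAIN_OFFSET = 0, METADATA_OFFSET = 1;

    public static void main(String[] args) {
        long[] data = new long[ENTRY_SIZE * 4];       // room for four documents

        int idx = 1;                                  // row index from the ids hash table
        data[ENTRY_SIZE * idx + DOMAIN_OFFSET]   = 42L;      // hypothetical domain id
        data[ENTRY_SIZE * idx + METADATA_OFFSET] = 0xBEEFL;  // hypothetical packed doc metadata

        // getDomainId / getDocMeta reduce to these two reads once
        // docId -> idx has been resolved:
        int  domainId = (int) data[ENTRY_SIZE * idx + DOMAIN_OFFSET];
        long docMeta  =       data[ENTRY_SIZE * idx + METADATA_OFFSET];
    }
}
```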
nu.marginalia.wmsa.edge.index.query.filter.QueryFilterStepIf; +import nu.marginalia.wmsa.edge.model.search.domain.SpecificationLimitType; + +public class ParamMatchingQueryFilter implements QueryFilterStepIf { + private final IndexQueryParams params; + private final ForwardIndexReader forwardIndexReader; + + public ParamMatchingQueryFilter(IndexQueryParams params, ForwardIndexReader forwardIndexReader) { + this.params = params; + this.forwardIndexReader = forwardIndexReader; + } + + @Override + public boolean test(long docId) { + var post = forwardIndexReader.docPost(docId); + + if (!validateDomain(post)) { + return false; + } + + if (!validateQuality(post)) { + return false; + } + + if (!validateYear(post)) { + return false; + } + + if (!validateSize(post)) { + return false; + } + return true; + } + + private boolean validateDomain(ForwardIndexReader.DocPost post) { + return params.searchSet().contains(post.domainId()); + } + + private boolean validateQuality(ForwardIndexReader.DocPost post) { + final var limit = params.qualityLimit(); + + if (limit.type() == SpecificationLimitType.NONE) { + return true; + } + + final int quality = EdgePageDocumentsMetadata.decodeQuality(post.meta()); + + return limit.test(quality); + } + private boolean validateYear(ForwardIndexReader.DocPost post) { + if (params.year().type() == SpecificationLimitType.NONE) + return true; + + int postVal = EdgePageDocumentsMetadata.decodeYear(post.meta()); + + return params.year().test(postVal); + } + + private boolean validateSize(ForwardIndexReader.DocPost post) { + if (params.size().type() == SpecificationLimitType.NONE) + return true; + + int postVal = EdgePageDocumentsMetadata.decodeSize(post.meta()); + + return params.size().test(postVal); + } + + @Override + public double cost() { + return 32; + } + + @Override + public String describe() { + return getClass().getSimpleName(); + } +} diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/journal/model/SearchIndexJournalEntry.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/postings/journal/model/SearchIndexJournalEntry.java similarity index 94% rename from marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/journal/model/SearchIndexJournalEntry.java rename to marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/postings/journal/model/SearchIndexJournalEntry.java index 0ea1325c..6fe28af3 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/journal/model/SearchIndexJournalEntry.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/postings/journal/model/SearchIndexJournalEntry.java @@ -1,4 +1,4 @@ -package nu.marginalia.wmsa.edge.index.journal.model; +package nu.marginalia.wmsa.edge.index.postings.journal.model; import java.nio.ByteBuffer; import java.util.Arrays; @@ -56,7 +56,7 @@ public class SearchIndexJournalEntry implements Iterable domainId, EdgeId urlId, IndexBlock block) { - this(-1, combineIds(domainId, urlId), block); + public SearchIndexJournalEntryHeader( EdgeId domainId, EdgeId urlId, long documentMeta) { + this(-1, combineIds(domainId, urlId), documentMeta); } private static long combineIds(EdgeId domainId, EdgeId urlId) { diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/journal/model/SearchIndexJournalFileHeader.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/postings/journal/model/SearchIndexJournalFileHeader.java similarity index 56% rename from 
marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/journal/model/SearchIndexJournalFileHeader.java rename to marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/postings/journal/model/SearchIndexJournalFileHeader.java index 62fea842..4b65505c 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/journal/model/SearchIndexJournalFileHeader.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/postings/journal/model/SearchIndexJournalFileHeader.java @@ -1,4 +1,4 @@ -package nu.marginalia.wmsa.edge.index.journal.model; +package nu.marginalia.wmsa.edge.index.postings.journal.model; public record SearchIndexJournalFileHeader(long fileSize, long wordCount) { } diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/postings/journal/model/SearchIndexJournalStatistics.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/postings/journal/model/SearchIndexJournalStatistics.java new file mode 100644 index 00000000..8b827174 --- /dev/null +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/postings/journal/model/SearchIndexJournalStatistics.java @@ -0,0 +1,3 @@ +package nu.marginalia.wmsa.edge.index.postings.journal.model; + +public record SearchIndexJournalStatistics(int highestWord, int documentCardinality) { } diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/postings/journal/reader/SearchIndexJournalReadEntry.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/postings/journal/reader/SearchIndexJournalReadEntry.java new file mode 100644 index 00000000..40c2a433 --- /dev/null +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/postings/journal/reader/SearchIndexJournalReadEntry.java @@ -0,0 +1,91 @@ +package nu.marginalia.wmsa.edge.index.postings.journal.reader; + +import nu.marginalia.util.array.LongArray; +import nu.marginalia.wmsa.edge.index.postings.journal.model.SearchIndexJournalEntry; +import nu.marginalia.wmsa.edge.index.postings.journal.model.SearchIndexJournalEntryHeader; + +import java.nio.ByteBuffer; + +public class SearchIndexJournalReadEntry { + private final long offset; + public final SearchIndexJournalEntryHeader header; + private final LongArray map; + private final long committedSize; + + SearchIndexJournalReadEntry(long offset, LongArray map, long committedSize) { + this.map = map; + this.committedSize = committedSize; + final long sizeBlock = this.map.get(offset); + final long docId = this.map.get(offset + 1); + final long meta = this.map.get(offset + 2); + + this.offset = offset; + this.header = new SearchIndexJournalEntryHeader( + (int) (sizeBlock >>> 32L), + docId, + meta); + } + + public boolean hasNext() { + return nextId() < committedSize; + } + + public long docId() { + return header.documentId(); + } + + public long docMeta() { + return header.documentMeta(); + } + + public int domainId() { + return (int) (docId() >>> 32L); + } + + public int urlId() { + return (int) (docId() & 0xFFFF_FFFFL); + } + + public int wordCount() { + return header.entrySize() / SearchIndexJournalEntry.ENTRY_SIZE; + } + + public SearchIndexJournalEntry readEntry() { + long[] dest = new long[header.entrySize()]; + + long offsetStart = offset + SearchIndexJournalEntryHeader.HEADER_SIZE_LONGS; + long offsetEnd = offsetStart + header.entrySize(); + + map.get(offsetStart, offsetEnd, dest); + + return new SearchIndexJournalEntry(header.entrySize(), dest); + } + + public SearchIndexJournalEntry readEntryUsingBuffer(long[] dest) { + if (dest.length >= header.entrySize()) { + 
long offsetStart = offset + SearchIndexJournalEntryHeader.HEADER_SIZE_LONGS; + long offsetEnd = offsetStart + header.entrySize(); + + map.get(offsetStart, offsetEnd, dest); + return new SearchIndexJournalEntry(header.entrySize(), dest); + } else { + return readEntry(); + } + } + + public long nextId() { + return offset + SearchIndexJournalEntryHeader.HEADER_SIZE_LONGS + header.entrySize(); + } + + public SearchIndexJournalReadEntry next() { + return new SearchIndexJournalReadEntry(nextId(), map, committedSize); + } + + public void copyToBuffer(ByteBuffer buffer) { + var dest = buffer.asLongBuffer(); + dest.position(buffer.position() * 8); + dest.limit(buffer.position() * 8 + header.entrySize() + SearchIndexJournalEntryHeader.HEADER_SIZE_LONGS); + map.get(offset, dest); + buffer.position(dest.limit() * 8); + } +} diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/postings/journal/reader/SearchIndexJournalReader.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/postings/journal/reader/SearchIndexJournalReader.java new file mode 100644 index 00000000..71811772 --- /dev/null +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/postings/journal/reader/SearchIndexJournalReader.java @@ -0,0 +1,49 @@ +package nu.marginalia.wmsa.edge.index.postings.journal.reader; + +import nu.marginalia.wmsa.edge.index.postings.journal.model.SearchIndexJournalEntry; +import nu.marginalia.wmsa.edge.index.postings.journal.model.SearchIndexJournalStatistics; +import org.jetbrains.annotations.NotNull; + +import java.util.Iterator; +import java.util.function.IntConsumer; + +public interface SearchIndexJournalReader extends Iterable { + long FILE_HEADER_SIZE_LONGS = 2; + long FILE_HEADER_SIZE_BYTES = 8 * FILE_HEADER_SIZE_LONGS; + + default long[] createAdequateTempBuffer() { + return new long[SearchIndexJournalEntry.MAX_LENGTH * SearchIndexJournalEntry.ENTRY_SIZE]; + } + + SearchIndexJournalStatistics getStatistics(); + + void forEachWordId(IntConsumer consumer); + + void forEachUrlIdWordId(BiIntConsumer consumer); + + void forEachDocIdWordId(LongIntConsumer consumer); + + void forEachDocIdRecord(LongObjectConsumer consumer); + + void forEachUrlId(IntConsumer consumer); + + @NotNull + @Override + Iterator iterator(); + + interface BiIntConsumer { + void accept(int left, int right); + } + + interface LongIntConsumer { + void accept(long left, int right); + } + + interface LongObjectConsumer { + void accept(long left, T right); + } + + interface IntObjectConsumer { + void accept(int left, T right); + } +} diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/postings/journal/reader/SearchIndexJournalReaderSingleFile.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/postings/journal/reader/SearchIndexJournalReaderSingleFile.java new file mode 100644 index 00000000..5519581d --- /dev/null +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/postings/journal/reader/SearchIndexJournalReaderSingleFile.java @@ -0,0 +1,176 @@ +package nu.marginalia.wmsa.edge.index.postings.journal.reader; + +import com.upserve.uppend.blobs.NativeIO; +import nu.marginalia.util.array.LongArray; +import nu.marginalia.wmsa.edge.index.postings.journal.model.SearchIndexJournalEntry; +import nu.marginalia.wmsa.edge.index.postings.journal.model.SearchIndexJournalFileHeader; +import nu.marginalia.wmsa.edge.index.postings.journal.model.SearchIndexJournalStatistics; +import org.jetbrains.annotations.NotNull; +import org.roaringbitmap.longlong.Roaring64Bitmap; + +import 
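After this patch a journal entry is a fixed header (size word, combined document id, document metadata) followed by `entrySize` longs of word records, laid out back to back; `nextId()` is plain pointer arithmetic. A sketch of walking the journal with the patched types, where `map` is the header-shifted `LongArray` and `committedSize` is as in the reader:

```java
long offset = 0;
while (offset < committedSize) {
    long sizeBlock = map.get(offset);              // entry size in the high 32 bits
    long docId     = map.get(offset + 1);          // (domainId << 32) | urlId
    long docMeta   = map.get(offset + 2);          // packed EdgePageDocumentsMetadata

    int entrySize = (int) (sizeBlock >>> 32);
    int domainId  = (int) (docId >>> 32);
    int urlId     = (int) (docId & 0xFFFF_FFFFL);

    // ... entrySize longs of word records follow the header ...

    offset += SearchIndexJournalEntryHeader.HEADER_SIZE_LONGS + entrySize;  // == nextId()
}
```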
java.io.IOException; +import java.util.Iterator; +import java.util.function.IntConsumer; +import java.util.function.Predicate; + +public class SearchIndexJournalReaderSingleFile implements SearchIndexJournalReader { + + public final SearchIndexJournalFileHeader fileHeader; + + private final LongArray map; + private final long committedSize; + + final Predicate<SearchIndexJournalReadEntry> entryPredicate; + final Predicate<SearchIndexJournalEntry.Record> recordPredicate; + + public SearchIndexJournalReaderSingleFile(LongArray map) throws IOException { + fileHeader = new SearchIndexJournalFileHeader(map.get(0), map.get(1)); + committedSize = map.get(0) / 8 - FILE_HEADER_SIZE_LONGS; + + map.advice(NativeIO.Advice.Sequential); + + this.map = map.shifted(FILE_HEADER_SIZE_LONGS); + this.recordPredicate = null; + this.entryPredicate = null; + } + + public SearchIndexJournalReaderSingleFile(LongArray map, Predicate<SearchIndexJournalReadEntry> entryPredicate, Predicate<SearchIndexJournalEntry.Record> recordPredicate) throws IOException { + fileHeader = new SearchIndexJournalFileHeader(map.get(0), map.get(1)); + committedSize = map.get(0) / 8 - FILE_HEADER_SIZE_LONGS; + + map.advice(NativeIO.Advice.Sequential); + + this.map = map.shifted(FILE_HEADER_SIZE_LONGS); + + this.recordPredicate = recordPredicate; + this.entryPredicate = entryPredicate; + } + + public boolean filter(SearchIndexJournalReadEntry entry) { + return entryPredicate == null || entryPredicate.test(entry); + } + + public boolean filter(SearchIndexJournalReadEntry entry, SearchIndexJournalEntry.Record record) { + return (entryPredicate == null || entryPredicate.test(entry)) + && (recordPredicate == null || recordPredicate.test(record)); + } + + @Override + public SearchIndexJournalStatistics getStatistics() { + int highestWord = 0; + final long[] tmpWordsBuffer = createAdequateTempBuffer(); + + // Docs cardinality is a candidate for a HyperLogLog + Roaring64Bitmap docsBitmap = new Roaring64Bitmap(); + + for (var entry : this) { + var entryData = entry.readEntryUsingBuffer(tmpWordsBuffer); + + if (filter(entry)) { + docsBitmap.addLong(entry.docId() & 0x0000_0000_FFFF_FFFFL); + + for (var item : entryData) { + if (filter(entry, item)) { + highestWord = Integer.max(item.wordId(), highestWord); + } + } + } + } + + return new SearchIndexJournalStatistics(highestWord, docsBitmap.getIntCardinality()); + } + + @Override + public void forEachWordId(IntConsumer consumer) { + final long[] tmpWordsBuffer = createAdequateTempBuffer(); + for (var entry : this) { + var data = entry.readEntryUsingBuffer(tmpWordsBuffer); + for (var post : data) { + if (filter(entry, post)) { + consumer.accept(post.wordId()); + } + } + } + } + + @Override + public void forEachUrlIdWordId(BiIntConsumer consumer) { + final long[] tmpWordsBuffer = createAdequateTempBuffer(); + for (var entry : this) { + var data = entry.readEntryUsingBuffer(tmpWordsBuffer); + + for (var post : data) { + if (filter(entry, post)) { + consumer.accept(entry.urlId(), post.wordId()); + } + } + } + } + + @Override + public void forEachDocIdWordId(LongIntConsumer consumer) { + final long[] tmpWordsBuffer = createAdequateTempBuffer(); + for (var entry : this) { + var data = entry.readEntryUsingBuffer(tmpWordsBuffer); + + for (var post : data) { + if (filter(entry, post)) { + consumer.accept(entry.docId(), post.wordId()); + } + } + } + } + + @Override + public void forEachDocIdRecord(LongObjectConsumer<SearchIndexJournalEntry.Record> consumer) { + final long[] tmpWordsBuffer = createAdequateTempBuffer(); + for (var entry : this) { + var data = entry.readEntryUsingBuffer(tmpWordsBuffer); + + for (var post : data) { + if (filter(entry, post)) { +
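// forward the filtered (docId, record) pair to the consumer +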
consumer.accept(entry.docId(), post); + } + } + } + @Override + public void forEachUrlId(IntConsumer consumer) { + for (var entry : this) { + if (filter(entry)) { + consumer.accept(entry.urlId()); + } + } + } + + @NotNull + @Override + public Iterator<SearchIndexJournalReadEntry> iterator() { + return new JournalEntryIterator(); + } + + private class JournalEntryIterator implements Iterator<SearchIndexJournalReadEntry> { + private SearchIndexJournalReadEntry entry; + + @Override + public boolean hasNext() { + if (entry == null) { + return committedSize > 0; + } + + return entry.hasNext(); + } + + @Override + public SearchIndexJournalReadEntry next() { + if (entry == null) { + entry = new SearchIndexJournalReadEntry(0, map, committedSize); + } + else { + entry = entry.next(); + } + return entry; + } + } + +} diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/postings/journal/writer/SearchIndexJournalWriter.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/postings/journal/writer/SearchIndexJournalWriter.java new file mode 100644 index 00000000..7d765006 --- /dev/null +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/postings/journal/writer/SearchIndexJournalWriter.java @@ -0,0 +1,13 @@ +package nu.marginalia.wmsa.edge.index.postings.journal.writer; + +import nu.marginalia.wmsa.edge.index.postings.journal.model.SearchIndexJournalEntry; +import nu.marginalia.wmsa.edge.index.postings.journal.model.SearchIndexJournalEntryHeader; + +public interface SearchIndexJournalWriter { + void put(SearchIndexJournalEntryHeader header, SearchIndexJournalEntry entry); + + void forceWrite(); + + void flushWords(); + +} diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/journal/SearchIndexJournalWriterImpl.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/postings/journal/writer/SearchIndexJournalWriterImpl.java similarity index 89% rename from marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/journal/SearchIndexJournalWriterImpl.java rename to marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/postings/journal/writer/SearchIndexJournalWriterImpl.java index 4e294707..b57a1ea1 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/journal/SearchIndexJournalWriterImpl.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/postings/journal/writer/SearchIndexJournalWriterImpl.java @@ -1,11 +1,11 @@ -package nu.marginalia.wmsa.edge.index.journal; +package nu.marginalia.wmsa.edge.index.postings.journal.writer; import io.reactivex.rxjava3.disposables.Disposable; import io.reactivex.rxjava3.schedulers.Schedulers; import lombok.SneakyThrows; -import nu.marginalia.wmsa.edge.index.journal.model.SearchIndexJournalEntry; -import nu.marginalia.wmsa.edge.index.journal.model.SearchIndexJournalEntryHeader; import nu.marginalia.wmsa.edge.index.lexicon.KeywordLexicon; +import nu.marginalia.wmsa.edge.index.postings.journal.model.SearchIndexJournalEntry; +import nu.marginalia.wmsa.edge.index.postings.journal.model.SearchIndexJournalEntryHeader; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -26,7 +26,7 @@ public class SearchIndexJournalWriterImpl implements SearchIndexJournalWriter { private RandomAccessFile raf; private FileChannel channel; - public static final int MAX_BLOCK_SIZE = SearchIndexJournalEntry.MAX_LENGTH*128*8*4; + public static final int MAX_BLOCK_SIZE = SearchIndexJournalEntry.MAX_LENGTH*128*8*4 + 8 * SearchIndexJournalEntryHeader.HEADER_SIZE_LONGS; private final ByteBuffer byteBuffer; private long pos; @@ -83,8 +83,9 @@ public
class SearchIndexJournalWriterImpl implements SearchIndexJournalWriter { byteBuffer.clear(); byteBuffer.putInt(entryData.size()); - byteBuffer.putInt(header.block().ordinal()); + byteBuffer.putInt(0); // unused byteBuffer.putLong(header.documentId()); + byteBuffer.putLong(header.documentMeta()); entryData.write(byteBuffer); @@ -112,6 +113,7 @@ public class SearchIndexJournalWriterImpl implements SearchIndexJournalWriter { @Override + @SneakyThrows public void flushWords() { lexicon.commitToDisk(); } diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/postings/reverse/ReverseIndexConverter.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/postings/reverse/ReverseIndexConverter.java new file mode 100644 index 00000000..bd32dd68 --- /dev/null +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/postings/reverse/ReverseIndexConverter.java @@ -0,0 +1,220 @@ +package nu.marginalia.wmsa.edge.index.postings.reverse; + +import nu.marginalia.util.RandomWriteFunnel; +import nu.marginalia.util.array.IntArray; +import nu.marginalia.util.array.LongArray; +import nu.marginalia.util.array.algo.SortingContext; +import nu.marginalia.util.array.functional.LongBinaryIOOperation; +import nu.marginalia.util.array.functional.LongIOTransformer; +import nu.marginalia.util.array.functional.LongTransformer; +import nu.marginalia.util.btree.BTreeWriter; +import nu.marginalia.wmsa.edge.index.postings.journal.model.SearchIndexJournalEntry; +import nu.marginalia.wmsa.edge.index.postings.journal.model.SearchIndexJournalStatistics; +import nu.marginalia.wmsa.edge.index.postings.journal.reader.SearchIndexJournalReader; +import nu.marginalia.wmsa.edge.index.postings.journal.reader.SearchIndexJournalReaderSingleFile; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.io.IOException; +import java.nio.channels.FileChannel; +import java.nio.file.Files; +import java.nio.file.Path; +import java.nio.file.StandardOpenOption; + +import static nu.marginalia.wmsa.edge.index.postings.reverse.ReverseIndexParameters.ENTRY_SIZE; +import static nu.marginalia.wmsa.edge.index.postings.reverse.ReverseIndexParameters.bTreeContext; + +public class ReverseIndexConverter { + private static final int RWF_BIN_SIZE = 10_000_000; + + private final Path tmpFileDir; + + private final Logger logger = LoggerFactory.getLogger(getClass()); + + private final SearchIndexJournalReaderSingleFile journalReader; + private final Path outputFileWords; + private final Path outputFileDocs; + + + public ReverseIndexConverter(Path tmpFileDir, + SearchIndexJournalReaderSingleFile journalReader, + Path outputFileWords, + Path outputFileDocs) { + this.tmpFileDir = tmpFileDir; + this.journalReader = journalReader; + this.outputFileWords = outputFileWords; + this.outputFileDocs = outputFileDocs; + } + + public void convert() throws IOException { + deleteOldFiles(); + + if (journalReader.fileHeader.fileSize() <= SearchIndexJournalReader.FILE_HEADER_SIZE_BYTES) { + return; + } + + final SearchIndexJournalStatistics statistics = journalReader.getStatistics(); + + final Path intermediateUrlsFile = Files.createTempFile(tmpFileDir, "urls-sorted", ".dat"); + SortingContext sortingContext = new SortingContext(tmpFileDir, 64_000); + + try { + final long wordsFileSize = statistics.highestWord() + 1; + + logger.debug("Words file size: {}", wordsFileSize); + // Create a count of how many documents contain each word + final LongArray wordsOffsets = LongArray.allocate(wordsFileSize); + + logger.info("Gathering
Offsets"); + journalReader.forEachWordId(wordsOffsets::increment); + wordsOffsets.transformEach(0, wordsFileSize, new CountToOffsetTransformer()); + + // Construct an intermediate representation of the reverse documents index + try (FileChannel intermediateDocChannel = + (FileChannel) Files.newByteChannel(intermediateUrlsFile, + StandardOpenOption.CREATE, StandardOpenOption.READ, StandardOpenOption.WRITE)) + { + logger.info("Creating Intermediate Docs File"); + // Construct intermediate index + try (RandomWriteFunnel intermediateDocumentWriteFunnel = new RandomWriteFunnel(tmpFileDir, RWF_BIN_SIZE)) + { + journalReader.forEachDocIdRecord(new IntermediateIndexConstructor(wordsOffsets, intermediateDocumentWriteFunnel)); + intermediateDocumentWriteFunnel.write(intermediateDocChannel); + } + intermediateDocChannel.force(false); + + logger.info("Sorting Intermediate Docs File"); + + // Sort each segment of the intermediate file + { + LongArray intermediateDocs = LongArray.mmapForWriting(intermediateUrlsFile); + wordsOffsets.foldIO(0, 0, wordsFileSize, (s, e) -> { + intermediateDocs.sortLargeSpanN(sortingContext, ENTRY_SIZE, s, e); + return e; + }); + intermediateDocs.force(); + } + + + logger.info("Sizing"); + + SizeEstimator sizeEstimator = new SizeEstimator(); + wordsOffsets.foldIO(0, 0, wordsOffsets.size(), sizeEstimator); + + logger.info("Finalizing Docs File"); + + LongArray finalDocs = LongArray.mmapForWriting(outputFileDocs, sizeEstimator.size); + // Construct the proper reverse index + wordsOffsets.transformEachIO(0, wordsOffsets.size(), new CreateReverseIndexBTreeTransformer(finalDocs, intermediateDocChannel)); + wordsOffsets.write(outputFileWords); + + // Attempt to clean up before forcing (important disk space preservation) + Files.deleteIfExists(intermediateUrlsFile); + + wordsOffsets.force(); + finalDocs.force(); + logger.info("Done"); + } + + } catch (IOException ex) { + logger.error("Failed to convert", ex); + throw ex; + } finally { + Files.deleteIfExists(intermediateUrlsFile); + } + } + + private static class SizeEstimator implements LongBinaryIOOperation { + public long size = 0; + @Override + public long apply(long start, long end) throws IOException { + if (end == start) return end; + + size += bTreeContext.calculateSize((int) (end - start) / ENTRY_SIZE); + + return end; + } + } + + private void deleteOldFiles() throws IOException { + Files.deleteIfExists(outputFileWords); + Files.deleteIfExists(outputFileDocs); + } + + private static class CountToOffsetTransformer implements LongTransformer { + long offset = 0; + + @Override + public long transform(long pos, long count) { + return (offset += ENTRY_SIZE * count); + } + } + + private static class CreateReverseIndexBTreeTransformer implements LongIOTransformer { + private final BTreeWriter writer; + private final FileChannel intermediateChannel; + + long start = 0; + long writeOffset = 0; + + public CreateReverseIndexBTreeTransformer(LongArray urlsFileMap, FileChannel intermediateChannel) { + this.writer = new BTreeWriter(urlsFileMap, ReverseIndexParameters.bTreeContext); + this.intermediateChannel = intermediateChannel; + } + + @Override + public long transform(long pos, long end) throws IOException { + + assert (end - start) % ReverseIndexParameters.ENTRY_SIZE == 0; + + final int size = (int)(end - start) / ReverseIndexParameters.ENTRY_SIZE; + + if (size == 0) { + return -1; + } + + final long offsetForBlock = writeOffset; + + writeOffset += writer.write(writeOffset, size, + mapRegion -> 
mapRegion.transferFrom(intermediateChannel, start, 0, end - start) + ); + + start = end; + return offsetForBlock; + } + } + + private static class IntermediateIndexConstructor implements SearchIndexJournalReaderSingleFile.LongObjectConsumer<SearchIndexJournalEntry.Record> { + + private final LongArray wordRangeEnds; + private final IntArray wordRangeOffset; + private final RandomWriteFunnel documentsFile; + + public IntermediateIndexConstructor(LongArray wordRangeEnds, RandomWriteFunnel documentsFile) { + this.wordRangeEnds = wordRangeEnds; + this.wordRangeOffset = IntArray.allocate(wordRangeEnds.size()); + this.documentsFile = documentsFile; + } + + @Override + public void accept(long docId, SearchIndexJournalEntry.Record record) { + final long urlId = docId & 0xFFFF_FFFFL; + final int wordId = record.wordId(); + + long offset = startOfRange(wordId); + + documentsFile.put(offset + wordRangeOffset.getAndIncrement(wordId), urlId); + documentsFile.put(offset + wordRangeOffset.getAndIncrement(wordId), record.metadata()); + + } + + private long startOfRange(int wordId) { + if (wordId == 0) return 0; + + return wordRangeEnds.get(wordId - 1); + } + + } + +} + diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/postings/reverse/ReverseIndexParameters.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/postings/reverse/ReverseIndexParameters.java new file mode 100644 index 00000000..e38fa3b5 --- /dev/null +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/postings/reverse/ReverseIndexParameters.java @@ -0,0 +1,9 @@ +package nu.marginalia.wmsa.edge.index.postings.reverse; + +import nu.marginalia.util.btree.model.BTreeContext; + +class ReverseIndexParameters { + public static final int ENTRY_SIZE = 2; + + public static final BTreeContext bTreeContext = new BTreeContext(5, ENTRY_SIZE, 8); +} diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/postings/reverse/ReverseIndexPrefixEntrySource.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/postings/reverse/ReverseIndexPrefixEntrySource.java new file mode 100644 index 00000000..0cae45ab --- /dev/null +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/postings/reverse/ReverseIndexPrefixEntrySource.java @@ -0,0 +1,48 @@ +package nu.marginalia.wmsa.edge.index.postings.reverse; + +import nu.marginalia.util.array.buffer.LongQueryBuffer; +import nu.marginalia.util.btree.BTreeReader; +import nu.marginalia.wmsa.edge.index.query.EntrySource; + +import static java.lang.Math.min; + +public class ReverseIndexPrefixEntrySource implements EntrySource { + private final BTreeReader reader; + + int pos; + long endOffset; + + public ReverseIndexPrefixEntrySource(BTreeReader reader, long prefixStart, long prefixEnd) { + this.reader = reader; + + pos = 0; + endOffset = pos + (long) reader.numEntries(); + } + + @Override + public void skip(int n) { + pos += n; + } + + @Override + public void read(LongQueryBuffer buffer) { + buffer.end = min(buffer.end, (int)(endOffset - pos)); + + reader.readData(buffer.data, buffer.end, pos); + + pos += buffer.end; + + buffer.uniq(); + } + + @Override + public boolean hasMore() { + return pos < endOffset; + } + + @Override + public String toString() { + return "BTreeRange.EntrySource(@" + pos + ": " + endOffset + ")"; + } + +} diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/postings/reverse/ReverseIndexPrioReader.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/postings/reverse/ReverseIndexPrioReader.java new file mode 100644
index 00000000..26a39bcf --- /dev/null +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/postings/reverse/ReverseIndexPrioReader.java @@ -0,0 +1,65 @@ +package nu.marginalia.wmsa.edge.index.postings.reverse; + +import nu.marginalia.util.array.LongArray; +import nu.marginalia.util.btree.BTreeReader; +import nu.marginalia.wmsa.edge.index.postings.reverse.query.ReverseIndexEntrySource; +import nu.marginalia.wmsa.edge.index.postings.reverse.query.ReverseIndexEntrySourceBehavior; +import nu.marginalia.wmsa.edge.index.query.EmptyEntrySource; +import nu.marginalia.wmsa.edge.index.query.EntrySource; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.io.IOException; +import java.nio.file.Files; +import java.nio.file.Path; + +public class ReverseIndexPrioReader { + private final LongArray words; + private final LongArray documents; + + private final Logger logger = LoggerFactory.getLogger(getClass()); + + public ReverseIndexPrioReader(Path words, Path documents) throws IOException { + if (!Files.exists(words) || !Files.exists(documents)) { + this.words = null; + this.documents = null; + return; + } + + logger.info("Switching prio reverse index"); + + this.words = LongArray.mmapRead(words); + this.documents = LongArray.mmapRead(documents); + } + + public EntrySource priorityDocuments(int wordId) { + if (words == null) { + // index not loaded + return new EmptyEntrySource(); + } + + if (wordId < 0 || wordId >= words.size()) return new EmptyEntrySource(); + + long offset = words.get(wordId); + + if (offset < 0) return new EmptyEntrySource(); + + return new ReverseIndexEntrySource(createReaderNew(offset), ReverseIndexEntrySourceBehavior.DO_PREFER); + } + + public int numDocuments(int wordId) { + if (wordId < 0) + return 0; + + long offset = words.get(wordId); + + if (offset < 0) + return 0; + + return createReaderNew(offset).numEntries(); + } + + private BTreeReader createReaderNew(long offset) { + return new BTreeReader(documents, ReverseIndexParameters.bTreeContext, offset); + } +} diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/postings/reverse/ReverseIndexPriorityParameters.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/postings/reverse/ReverseIndexPriorityParameters.java new file mode 100644 index 00000000..b6d6fb38 --- /dev/null +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/postings/reverse/ReverseIndexPriorityParameters.java @@ -0,0 +1,21 @@ +package nu.marginalia.wmsa.edge.index.postings.reverse; + +import nu.marginalia.wmsa.edge.index.model.EdgePageWordFlags; +import nu.marginalia.wmsa.edge.index.postings.journal.model.SearchIndexJournalEntry; + +public class ReverseIndexPriorityParameters { + private static final long highPriorityFlags = EdgePageWordFlags.Title.asBit() + | EdgePageWordFlags.Subjects.asBit() + | EdgePageWordFlags.TfIdfHigh.asBit() + | EdgePageWordFlags.NamesWords.asBit() + | EdgePageWordFlags.Site.asBit() + | EdgePageWordFlags.SiteAdjacent.asBit(); + + public static boolean filterPriorityRecord(SearchIndexJournalEntry.Record record) { + long meta = record.metadata(); + + return (meta & highPriorityFlags) != 0; + } + + +} diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/postings/reverse/ReverseIndexReader.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/postings/reverse/ReverseIndexReader.java new file mode 100644 index 00000000..5679c5be --- /dev/null +++ 
b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/postings/reverse/ReverseIndexReader.java @@ -0,0 +1,117 @@ +package nu.marginalia.wmsa.edge.index.postings.reverse; + +import nu.marginalia.util.array.LongArray; +import nu.marginalia.util.btree.BTreeReader; +import nu.marginalia.wmsa.edge.index.postings.reverse.query.ReverseIndexEntrySource; +import nu.marginalia.wmsa.edge.index.postings.reverse.query.ReverseIndexEntrySourceBehavior; +import nu.marginalia.wmsa.edge.index.postings.reverse.query.ReverseIndexRejectFilter; +import nu.marginalia.wmsa.edge.index.postings.reverse.query.ReverseIndexRetainFilter; +import nu.marginalia.wmsa.edge.index.query.EmptyEntrySource; +import nu.marginalia.wmsa.edge.index.query.EntrySource; +import nu.marginalia.wmsa.edge.index.query.filter.QueryFilterLetThrough; +import nu.marginalia.wmsa.edge.index.query.filter.QueryFilterNoPass; +import nu.marginalia.wmsa.edge.index.query.filter.QueryFilterStepIf; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.io.IOException; +import java.nio.file.Files; +import java.nio.file.Path; +import java.util.Arrays; + +public class ReverseIndexReader { + private final LongArray words; + private final LongArray documents; + + private final Logger logger = LoggerFactory.getLogger(getClass()); + + public ReverseIndexReader(Path words, Path documents) throws IOException { + if (!Files.exists(words) || !Files.exists(documents)) { + this.words = null; + this.documents = null; + return; + } + + logger.info("Switching reverse index"); + + this.words = LongArray.mmapRead(words); + this.documents = LongArray.mmapRead(documents); + } + + public boolean isWordInDoc(int wordId, long documentId) { + if (wordId < 0) { + return false; + } + + long offset = words.get(wordId); + + if (offset < 0) { + return false; + } + + return createReaderNew(offset).findEntry(documentId) >= 0; + } + + public EntrySource documents(int wordId, ReverseIndexEntrySourceBehavior behavior) { + if (wordId < 0 || wordId >= words.size()) return new EmptyEntrySource(); + + long offset = words.get(wordId); + + if (offset < 0) return new EmptyEntrySource(); + + return new ReverseIndexEntrySource(createReaderNew(offset), behavior); + } + + public QueryFilterStepIf also(int wordId) { + if (wordId < 0) return new QueryFilterNoPass(); + + long offset = words.get(wordId); + + if (offset < 0) return new QueryFilterNoPass(); + + return new ReverseIndexRetainFilter(createReaderNew(offset)); + } + + public QueryFilterStepIf not(int wordId) { + if (wordId < 0) return new QueryFilterLetThrough(); + + long offset = words.get(wordId); + + if (offset < 0) return new QueryFilterLetThrough(); + + return new ReverseIndexRejectFilter(createReaderNew(offset)); + } + + public int numDocuments(int wordId) { + if (wordId < 0) + return 0; + + long offset = words.get(wordId); + + if (offset < 0) + return 0; + + return createReaderNew(offset).numEntries(); + } + + private BTreeReader createReaderNew(long offset) { + return new BTreeReader(documents, ReverseIndexParameters.bTreeContext, offset); + } + + public long[] getTermMeta(int wordId, long[] docIds) { + if (wordId < 0) { + return new long[docIds.length]; + } + + long offset = words.get(wordId); + if (offset < 0) { + return new long[docIds.length]; + } + + Arrays.sort(docIds); + + var reader = createReaderNew(offset); + return reader.queryData(docIds, 1); + } + +} diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/postings/reverse/query/ReverseIndexEntrySource.java 
b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/postings/reverse/query/ReverseIndexEntrySource.java new file mode 100644 index 00000000..a47b134a --- /dev/null +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/postings/reverse/query/ReverseIndexEntrySource.java @@ -0,0 +1,72 @@ +package nu.marginalia.wmsa.edge.index.postings.reverse.query; + +import nu.marginalia.util.array.buffer.LongQueryBuffer; +import nu.marginalia.util.btree.BTreeReader; +import nu.marginalia.wmsa.edge.index.query.EntrySource; + +import static java.lang.Math.min; + +public class ReverseIndexEntrySource implements EntrySource { + private final BTreeReader reader; + + private static final int ENTRY_SIZE = 2; + + int pos; + int endOffset; + + private final ReverseIndexEntrySourceBehavior behavior; + + public ReverseIndexEntrySource(BTreeReader reader, ReverseIndexEntrySourceBehavior behavior) { + this.reader = reader; + this.behavior = behavior; + + pos = 0; + endOffset = pos + ENTRY_SIZE * reader.numEntries(); + } + + @Override + public void skip(int n) { + pos += n; + } + + @Override + public void read(LongQueryBuffer buffer) { + if (behavior == ReverseIndexEntrySourceBehavior.DO_NOT_PREFER + && buffer.hasRetainedData()) + { + pos = endOffset; + return; + } + + buffer.end = min(buffer.end, endOffset - pos); + + reader.readData(buffer.data, buffer.end, pos); + + pos += buffer.end; + + destagger(buffer); + buffer.uniq(); + } + + private void destagger(LongQueryBuffer buffer) { + if (ENTRY_SIZE == 1) + return; + + for (int ri = ENTRY_SIZE, wi = 1; ri < buffer.end; ri += ENTRY_SIZE, wi++) { + buffer.data[wi] = buffer.data[ri]; + } + + buffer.end /= ENTRY_SIZE; + } + + @Override + public boolean hasMore() { + return pos < endOffset; + } + + @Override + public String toString() { + return "BTreeRange.EntrySource(@" + pos + ": " + endOffset + ")"; + } + +} diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/postings/reverse/query/ReverseIndexEntrySourceBehavior.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/postings/reverse/query/ReverseIndexEntrySourceBehavior.java new file mode 100644 index 00000000..fc779403 --- /dev/null +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/postings/reverse/query/ReverseIndexEntrySourceBehavior.java @@ -0,0 +1,6 @@ +package nu.marginalia.wmsa.edge.index.postings.reverse.query; + +public enum ReverseIndexEntrySourceBehavior { + DO_PREFER, + DO_NOT_PREFER +} diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/postings/reverse/query/ReverseIndexRejectFilter.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/postings/reverse/query/ReverseIndexRejectFilter.java new file mode 100644 index 00000000..ca317349 --- /dev/null +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/postings/reverse/query/ReverseIndexRejectFilter.java @@ -0,0 +1,28 @@ +package nu.marginalia.wmsa.edge.index.postings.reverse.query; + +import nu.marginalia.util.array.buffer.LongQueryBuffer; +import nu.marginalia.util.btree.BTreeReader; +import nu.marginalia.wmsa.edge.index.query.filter.QueryFilterStepIf; + +public record ReverseIndexRejectFilter(BTreeReader range) implements QueryFilterStepIf { + + @Override + public void apply(LongQueryBuffer buffer) { + range.rejectEntries(buffer); + buffer.finalizeFiltering(); + } + + public boolean test(long id) { + return range.findEntry(id) < 0; + } + + @Override + public double cost() { + return range.numEntries(); + } + + @Override + public String describe()
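// human-readable label for this filter step, e.g. for query diagnostics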
{ + return "ReverseIndexRejectFilter[]"; + } +} diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/postings/reverse/query/ReverseIndexRetainFilter.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/postings/reverse/query/ReverseIndexRetainFilter.java new file mode 100644 index 00000000..9c408a34 --- /dev/null +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/postings/reverse/query/ReverseIndexRetainFilter.java @@ -0,0 +1,28 @@ +package nu.marginalia.wmsa.edge.index.postings.reverse.query; + +import nu.marginalia.util.array.buffer.LongQueryBuffer; +import nu.marginalia.util.btree.BTreeReader; +import nu.marginalia.wmsa.edge.index.query.filter.QueryFilterStepIf; + +public record ReverseIndexRetainFilter(BTreeReader range) implements QueryFilterStepIf { + + @Override + public void apply(LongQueryBuffer buffer) { + range.retainEntries(buffer); + buffer.finalizeFiltering(); + } + + public boolean test(long id) { + return range.findEntry(id) >= 0; + } + + @Override + public double cost() { + return range.numEntries(); + } + + @Override + public String describe() { + return "ReverseIndexRetainFilter"; + } +} diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/svc/query/types/EmptyEntrySource.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/query/EmptyEntrySource.java similarity index 59% rename from marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/svc/query/types/EmptyEntrySource.java rename to marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/query/EmptyEntrySource.java index 43b171ad..f38b4c0d 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/svc/query/types/EmptyEntrySource.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/query/EmptyEntrySource.java @@ -1,6 +1,6 @@ -package nu.marginalia.wmsa.edge.index.svc.query.types; +package nu.marginalia.wmsa.edge.index.query; -import nu.marginalia.util.btree.BTreeQueryBuffer; +import nu.marginalia.util.array.buffer.LongQueryBuffer; public class EmptyEntrySource implements EntrySource { @Override @@ -8,7 +8,7 @@ public class EmptyEntrySource implements EntrySource { } @Override - public void read(BTreeQueryBuffer buffer) { + public void read(LongQueryBuffer buffer) { buffer.zero(); } diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/query/EntrySource.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/query/EntrySource.java new file mode 100644 index 00000000..5ec62c05 --- /dev/null +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/query/EntrySource.java @@ -0,0 +1,10 @@ +package nu.marginalia.wmsa.edge.index.query; + +import nu.marginalia.util.array.buffer.LongQueryBuffer; + +public interface EntrySource { + void skip(int n); + void read(LongQueryBuffer buffer); + + boolean hasMore(); +} diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/query/EntrySourceFromArrayRange.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/query/EntrySourceFromArrayRange.java new file mode 100644 index 00000000..a0d9ee32 --- /dev/null +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/query/EntrySourceFromArrayRange.java @@ -0,0 +1,63 @@ +package nu.marginalia.wmsa.edge.index.query; + +import nu.marginalia.util.array.LongArray; +import nu.marginalia.util.array.buffer.LongQueryBuffer; + +import static java.lang.Math.min; + +public class EntrySourceFromArrayRange implements EntrySource { + + private final LongArray map; + private final int 
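// width in longs of each entry in the backing array (the reverse index uses 2: urlId and metadata)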
entrySize; + private long pos; + private final long endOffset; + + public EntrySourceFromArrayRange(LongArray map, int entrySize, long start, long end) { + this.map = map; + this.entrySize = entrySize; + this.pos = start; + this.endOffset = end; + } + + @Override + public void skip(int n) { + pos += (long) n * entrySize; + } + + @Override + public void read(LongQueryBuffer buffer) { + + assert buffer.end % entrySize == 0; + + buffer.end = min(buffer.end, (int)(endOffset - pos)); + + map.get(pos, pos + buffer.end, buffer.data); + + pos += buffer.end; + + destagger(buffer); + buffer.uniq(); + } + + private void destagger(LongQueryBuffer buffer) { + if (entrySize == 1) + return; + + // keep the first long of each entry, compacting the buffer in place + for (int i = 0; (i + entrySize - 1) < buffer.end; i += entrySize) { + buffer.data[i / entrySize] = buffer.data[i]; + } + + buffer.end /= entrySize; + } + + @Override + public boolean hasMore() { + return pos < endOffset; + } + + @Override + public String toString() { + return "BTreeRange.EntrySourceFromMapRange(@" + pos + ": " + endOffset + ")"; + } + +} diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/svc/query/IndexQuery.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/query/IndexQuery.java similarity index 82% rename from marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/svc/query/IndexQuery.java rename to marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/query/IndexQuery.java index bdd87297..b2994d36 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/svc/query/IndexQuery.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/query/IndexQuery.java @@ -1,8 +1,8 @@ -package nu.marginalia.wmsa.edge.index.svc.query; +package nu.marginalia.wmsa.edge.index.query; -import nu.marginalia.util.btree.BTreeQueryBuffer; -import nu.marginalia.wmsa.edge.index.svc.query.types.EntrySource; -import nu.marginalia.wmsa.edge.index.svc.query.types.filter.QueryFilterStepIf; +import nu.marginalia.util.array.buffer.LongQueryBuffer; +import nu.marginalia.wmsa.edge.index.query.EntrySource; +import nu.marginalia.wmsa.edge.index.query.filter.QueryFilterStepIf; import java.util.ArrayList; import java.util.List; @@ -26,7 +26,7 @@ public class IndexQuery { return si < sources.size(); } - public void getMoreResults(BTreeQueryBuffer dest) { + public void getMoreResults(LongQueryBuffer dest) { if (!fillBuffer(dest)) return; @@ -41,7 +41,7 @@ public class IndexQuery { } } - private boolean fillBuffer(BTreeQueryBuffer dest) { + private boolean fillBuffer(LongQueryBuffer dest) { for (;;) { dest.reset(); diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/svc/query/IndexQueryIf.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/query/IndexQueryIf.java similarity index 81% rename from marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/svc/query/IndexQueryIf.java rename to marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/query/IndexQueryIf.java index b07515ed..0fd325ea 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/svc/query/IndexQueryIf.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/query/IndexQueryIf.java @@ -1,4 +1,4 @@ -package nu.marginalia.wmsa.edge.index.svc.query; +package nu.marginalia.wmsa.edge.index.query; import java.util.stream.LongStream; diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/query/IndexQueryParams.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/query/IndexQueryParams.java new file
mode 100644 index 00000000..298e6c01 --- /dev/null +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/query/IndexQueryParams.java @@ -0,0 +1,15 @@ +package nu.marginalia.wmsa.edge.index.query; + +import nu.marginalia.wmsa.edge.index.model.QueryStrategy; +import nu.marginalia.wmsa.edge.index.svc.searchset.SearchSet; +import nu.marginalia.wmsa.edge.model.search.domain.SpecificationLimit; + +public record IndexQueryParams(SpecificationLimit qualityLimit, + SpecificationLimit year, + SpecificationLimit size, + SearchSet searchSet, + QueryStrategy queryStrategy + ) +{ + +} diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/svc/query/ResultDomainDeduplicator.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/query/IndexResultDomainDeduplicator.java similarity index 89% rename from marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/svc/query/ResultDomainDeduplicator.java rename to marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/query/IndexResultDomainDeduplicator.java index bdb62571..40ed46fc 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/svc/query/ResultDomainDeduplicator.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/query/IndexResultDomainDeduplicator.java @@ -1,14 +1,14 @@ -package nu.marginalia.wmsa.edge.index.svc.query; +package nu.marginalia.wmsa.edge.index.query; import gnu.trove.map.TLongIntMap; import gnu.trove.map.hash.TLongIntHashMap; import nu.marginalia.wmsa.edge.model.search.EdgeSearchResultItem; -public class ResultDomainDeduplicator { +public class IndexResultDomainDeduplicator { final TLongIntMap resultsByRankingId = new TLongIntHashMap(2048, 0.5f, -1, 0); final int limitByDomain; - public ResultDomainDeduplicator(int limitByDomain) { + public IndexResultDomainDeduplicator(int limitByDomain) { this.limitByDomain = limitByDomain; } diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/svc/query/IndexSearchBudget.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/query/IndexSearchBudget.java similarity index 84% rename from marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/svc/query/IndexSearchBudget.java rename to marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/query/IndexSearchBudget.java index b6229bd3..dfcbf06f 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/svc/query/IndexSearchBudget.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/query/IndexSearchBudget.java @@ -1,4 +1,4 @@ -package nu.marginalia.wmsa.edge.index.svc.query; +package nu.marginalia.wmsa.edge.index.query; public class IndexSearchBudget { diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/svc/query/types/filter/QueryFilterAnyOf.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/query/filter/QueryFilterAnyOf.java similarity index 91% rename from marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/svc/query/types/filter/QueryFilterAnyOf.java rename to marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/query/filter/QueryFilterAnyOf.java index 9944b89d..293fe7d0 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/svc/query/types/filter/QueryFilterAnyOf.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/query/filter/QueryFilterAnyOf.java @@ -1,6 +1,6 @@ -package nu.marginalia.wmsa.edge.index.svc.query.types.filter; +package nu.marginalia.wmsa.edge.index.query.filter; -import nu.marginalia.util.btree.BTreeQueryBuffer; +import 
nu.marginalia.util.array.buffer.LongQueryBuffer; import java.util.Arrays; import java.util.List; @@ -27,7 +27,7 @@ public class QueryFilterAnyOf implements QueryFilterStepIf { } - public void apply(BTreeQueryBuffer buffer) { + public void apply(LongQueryBuffer buffer) { int start; int end = buffer.end; diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/svc/query/types/filter/QueryFilterNoPass.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/query/filter/QueryFilterLetThrough.java similarity index 50% rename from marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/svc/query/types/filter/QueryFilterNoPass.java rename to marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/query/filter/QueryFilterLetThrough.java index f1a9a964..3f471cd8 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/svc/query/types/filter/QueryFilterNoPass.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/query/filter/QueryFilterLetThrough.java @@ -1,17 +1,11 @@ -package nu.marginalia.wmsa.edge.index.svc.query.types.filter; +package nu.marginalia.wmsa.edge.index.query.filter; -import nu.marginalia.util.btree.BTreeQueryBuffer; - -class QueryFilterNoPass implements QueryFilterStepIf { - static final QueryFilterStepIf instance = new QueryFilterNoPass(); +public class QueryFilterLetThrough implements QueryFilterStepIf { + static final QueryFilterStepIf instance = new QueryFilterLetThrough(); @Override public boolean test(long value) { - return false; - } - - public void apply(BTreeQueryBuffer buffer) { - buffer.finalizeFiltering(); + return true; } public double cost() { diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/query/filter/QueryFilterNoPass.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/query/filter/QueryFilterNoPass.java new file mode 100644 index 00000000..4ad69531 --- /dev/null +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/query/filter/QueryFilterNoPass.java @@ -0,0 +1,25 @@ +package nu.marginalia.wmsa.edge.index.query.filter; + +import nu.marginalia.util.array.buffer.LongQueryBuffer; + +public class QueryFilterNoPass implements QueryFilterStepIf { + static final QueryFilterStepIf instance = new QueryFilterNoPass(); + + @Override + public boolean test(long value) { + return false; + } + + public void apply(LongQueryBuffer buffer) { + buffer.finalizeFiltering(); + } + + public double cost() { + return 0.; + } + + public String describe() { + return "[NoPass]"; + } + +} diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/query/filter/QueryFilterStepExcludeFromPredicate.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/query/filter/QueryFilterStepExcludeFromPredicate.java new file mode 100644 index 00000000..8cb4561f --- /dev/null +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/query/filter/QueryFilterStepExcludeFromPredicate.java @@ -0,0 +1,27 @@ +package nu.marginalia.wmsa.edge.index.query.filter; + +import java.util.function.LongPredicate; + +public class QueryFilterStepExcludeFromPredicate implements QueryFilterStepIf { + private final LongPredicate pred; + + public QueryFilterStepExcludeFromPredicate(LongPredicate pred) { + this.pred = pred; + } + + @Override + public boolean test(long value) { + return !pred.test(value); + } + + @Override + public double cost() { + return 0; + } + + @Override + public String describe() { + return "[!Predicate]"; + } + +} diff --git 
a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/svc/query/types/filter/QueryFilterStepFromPredicate.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/query/filter/QueryFilterStepFromPredicate.java similarity index 88% rename from marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/svc/query/types/filter/QueryFilterStepFromPredicate.java rename to marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/query/filter/QueryFilterStepFromPredicate.java index af2bca13..26207152 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/svc/query/types/filter/QueryFilterStepFromPredicate.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/query/filter/QueryFilterStepFromPredicate.java @@ -1,4 +1,4 @@ -package nu.marginalia.wmsa.edge.index.svc.query.types.filter; +package nu.marginalia.wmsa.edge.index.query.filter; import java.util.function.LongPredicate; diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/svc/query/types/filter/QueryFilterStepIf.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/query/filter/QueryFilterStepIf.java similarity index 87% rename from marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/svc/query/types/filter/QueryFilterStepIf.java rename to marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/query/filter/QueryFilterStepIf.java index e1418e1d..9af75a7f 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/svc/query/types/filter/QueryFilterStepIf.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/query/filter/QueryFilterStepIf.java @@ -1,6 +1,6 @@ -package nu.marginalia.wmsa.edge.index.svc.query.types.filter; +package nu.marginalia.wmsa.edge.index.query.filter; -import nu.marginalia.util.btree.BTreeQueryBuffer; +import nu.marginalia.util.array.buffer.LongQueryBuffer; import java.util.List; @@ -24,7 +24,7 @@ public interface QueryFilterStepIf extends Comparable<QueryFilterStepIf> { * *
ASSUMPTION: buffer is sorted up until end.
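* (The default apply() below walks the buffer one value at a time, retaining values that pass test().)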
*/ - default void apply(BTreeQueryBuffer buffer) { + default void apply(LongQueryBuffer buffer) { while (buffer.hasMore()) { if (test(buffer.currentValue())) { buffer.retainAndAdvance(); diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/reader/IndexWordsTable.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/reader/IndexWordsTable.java deleted file mode 100644 index 72881f68..00000000 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/reader/IndexWordsTable.java +++ /dev/null @@ -1,111 +0,0 @@ -package nu.marginalia.wmsa.edge.index.reader; - -import com.upserve.uppend.blobs.NativeIO; -import nu.marginalia.util.btree.BTreeReader; -import nu.marginalia.util.multimap.MultimapFileLong; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -import java.io.IOException; -import java.io.RandomAccessFile; -import java.nio.channels.FileChannel; -import java.util.function.LongConsumer; - -import static nu.marginalia.wmsa.edge.index.conversion.words.WordsTableWriter.wordsBTreeContext; - -public class IndexWordsTable implements AutoCloseable { - protected final MultimapFileLong words; - protected final BTreeReader reader; - protected final int HEADER_OFFSET = 1; - final Logger logger = LoggerFactory.getLogger(getClass()); - - private static final int BUFFER_SIZE = 1024*1024*64; - - public IndexWordsTable(MultimapFileLong words) { - this.words = words; - - reader = new BTreeReader(words, wordsBTreeContext, HEADER_OFFSET); - - madvise(); - } - - public static IndexWordsTable ofFile(RandomAccessFile file) throws IOException { - var wordsFile = openWordsFile(file); - long signature = wordsFile.get(0); - - if (signature == Strategy.BTREE.ordinal()) { - return new IndexWordsTable(wordsFile); - } - - throw new IllegalArgumentException("Unknown signature " + signature); - } - - private static MultimapFileLong openWordsFile(RandomAccessFile wordsFile) throws IOException { - return new MultimapFileLong(wordsFile, - FileChannel.MapMode.READ_ONLY, wordsFile.length(), BUFFER_SIZE); - } - - public long positionForWord(int wordId) { - long offset = reader.findEntry(wordId); - - if (offset < 0) { - return -1L; - } - - return words.get(offset+1); - } - - public int wordLength(int wordId) { - - long offset = reader.findEntry(wordId); - if (offset < 0) { - return -1; - } - - return (int)(words.get(offset) >> 32); - } - - protected void madvise() { - words.advice(NativeIO.Advice.Random); - words.advice0(NativeIO.Advice.WillNeed); - - var h = reader.getHeader(); - int length = (int)(h.dataOffsetLongs() - h.indexOffsetLongs()); - - words.adviceRange(NativeIO.Advice.WillNeed, h.indexOffsetLongs(), length); - words.pokeRange(h.indexOffsetLongs(), length); - } - - public void forEachWordsOffset(LongConsumer offsetConsumer) { - int n = reader.numEntries(); - long offset = reader.getHeader().dataOffsetLongs(); - - for (int i = 0; i < n; i++) { - try { - long posOffset = 2*(offset + i); - if (posOffset * 8 >= words.size()) { - break; - } - - long sz = words.get(posOffset); - if ((sz>> 32) > 0) { - offsetConsumer.accept(words.get(posOffset+1)); - } - } - catch (Exception ex) { - logger.warn("Error @ " + i, ex); - break; - } - } - } - - @Override - public void close() throws Exception { - words.close(); - } - - public enum Strategy { - BTREE - } - -} diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/reader/MicroCache.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/reader/MicroCache.java deleted file mode 100644 index d14cc845..00000000 --- 
a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/reader/MicroCache.java +++ /dev/null @@ -1,43 +0,0 @@ -package nu.marginalia.wmsa.edge.index.reader; - -import java.util.Arrays; - -public class MicroCache { - private final int[] keys; - private final long[] data; - private int pos = 0; - - public int hit; - public int miss; - public int full; - - public static final long BAD_VALUE = Long.MIN_VALUE; - - public MicroCache(int size) { - keys = new int[size]; - data = new long[size]; - - Arrays.fill(data, BAD_VALUE); - } - - public long get(int key) { - for (int i = 0; i < keys.length && data[i] != BAD_VALUE; i++) { - if (keys[i] == key) { - hit++; - return data[i]; - } - } - miss++; - return BAD_VALUE; - } - - public void set(int key, long val) { - keys[pos] = key; - data[pos] = val; - - if (++pos >= keys.length) { - full++; - pos = 0; - } - } -} diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/reader/SearchIndex.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/reader/SearchIndex.java deleted file mode 100644 index 9e5852e4..00000000 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/reader/SearchIndex.java +++ /dev/null @@ -1,79 +0,0 @@ -package nu.marginalia.wmsa.edge.index.reader; - -import com.google.inject.Inject; -import com.google.inject.name.Named; -import com.upserve.uppend.blobs.NativeIO; -import io.reactivex.rxjava3.schedulers.Schedulers; -import nu.marginalia.util.btree.BTreeReader; -import nu.marginalia.util.multimap.MultimapFileLong; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -import java.io.File; -import java.io.IOException; -import java.io.RandomAccessFile; - -public class SearchIndex implements AutoCloseable { - - private final MultimapFileLong urls; - private final IndexWordsTable words; - public final String name; - private final RandomAccessFile wordsFile; - - private final Logger logger; - - @Inject - public SearchIndex( - String name, - @Named("edge-index-read-urls-file") File inUrls, - @Named("edge-index-read-words-file") File inWords) - throws IOException { - - logger = LoggerFactory.getLogger(name); - this.name = name; - wordsFile = new RandomAccessFile(inWords, "r"); - - logger.info("{} : Loading {}", name, inUrls); - logger.info("{} : Loading {}", name, inWords); - - urls = MultimapFileLong.forReading(inUrls.toPath()); - words = IndexWordsTable.ofFile(wordsFile); - - Schedulers.io().scheduleDirect(() -> madvise(urls)); - } - - private void madvise(MultimapFileLong urls) { - - words.forEachWordsOffset(offset -> { - var h = BTreeReader.createHeader(urls, offset); - long length = h.dataOffsetLongs() - h.indexOffsetLongs(); - - urls.adviceRange(NativeIO.Advice.WillNeed, offset, 512); - - if (length > 0) { - urls.adviceRange(NativeIO.Advice.WillNeed, h.indexOffsetLongs(), length); - } - }); - } - - - public long numUrls(int wordId) { - int length = words.wordLength(wordId); - if (length < 0) return 0; - if (length > 0) return length; - - return rangeForWord(wordId).numEntries(); - } - - public SearchIndexURLRange rangeForWord(int wordId) { - return new SearchIndexURLRange(urls, words.positionForWord(wordId)); - } - - @Override - public void close() throws Exception { - urls.close(); - words.close(); - - wordsFile.close(); - } -} diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/reader/SearchIndexReader.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/reader/SearchIndexReader.java deleted file mode 100644 index bb991898..00000000 --- 
a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/reader/SearchIndexReader.java +++ /dev/null @@ -1,117 +0,0 @@ -package nu.marginalia.wmsa.edge.index.reader; - -import com.google.inject.Inject; -import lombok.SneakyThrows; -import nu.marginalia.wmsa.edge.index.model.IndexBlock; -import nu.marginalia.wmsa.edge.index.svc.query.IndexDomainQueryFactory; -import nu.marginalia.wmsa.edge.index.svc.query.IndexQuery; -import nu.marginalia.wmsa.edge.index.svc.query.IndexQueryFactory; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -import java.util.EnumMap; -import java.util.List; -import java.util.Objects; -import java.util.stream.Stream; - -public class SearchIndexReader implements AutoCloseable { - - private final EnumMap<IndexBlock, SearchIndex> indices; - private final EnumMap<IndexBlock, IndexQueryFactory> queryBuilders; - private final IndexDomainQueryFactory domainQueryFactory; - private final Logger logger = LoggerFactory.getLogger(getClass()); - - @Inject - public SearchIndexReader( - EnumMap<IndexBlock, SearchIndex> indices) { - this.indices = indices; - - var linkIndex = indices.get(IndexBlock.Link); - var titleIndex = indices.get(IndexBlock.Title); - var metaIndex = indices.get(IndexBlock.Meta); - - var words1 = indices.get(IndexBlock.Words_1); - var words2 = indices.get(IndexBlock.Words_2); - var words4 = indices.get(IndexBlock.Words_4); - var words8 = indices.get(IndexBlock.Words_8); - var words16 = indices.get(IndexBlock.Words_16Plus); - var artifacts = indices.get(IndexBlock.Artifacts); - - queryBuilders = new EnumMap<>(IndexBlock.class); - - List<SearchIndex> excludeIndices = listOfNonNulls(metaIndex, titleIndex, words1, words2, words4, words8, words16); - - queryBuilders.put(IndexBlock.Title, new IndexQueryFactory(listOfNonNulls(metaIndex, titleIndex, linkIndex), excludeIndices)); - queryBuilders.put(IndexBlock.Words_1, new IndexQueryFactory(listOfNonNulls(metaIndex, words1), excludeIndices)); - queryBuilders.put(IndexBlock.Words_2, new IndexQueryFactory(listOfNonNulls(metaIndex, words2), excludeIndices)); - queryBuilders.put(IndexBlock.Words_4, new IndexQueryFactory(listOfNonNulls(metaIndex, words4), excludeIndices)); - queryBuilders.put(IndexBlock.Words_8, new IndexQueryFactory(listOfNonNulls(metaIndex, words8), excludeIndices)); - queryBuilders.put(IndexBlock.Words_16Plus, new IndexQueryFactory(listOfNonNulls(metaIndex, words16, artifacts), excludeIndices)); - - domainQueryFactory = new IndexDomainQueryFactory(indices.get(IndexBlock.Words_1)); - } - - @SafeVarargs - public final <T> List<T> listOfNonNulls(T...
vals) { - return Stream.of(vals).filter(Objects::nonNull).toList(); - } - - - public IndexQueryFactory.IndexQueryBuilder findWord(IndexBlock block, Integer quality, int wordId) { - var builder = queryBuilders.get(block); - - if (builder == null) - return null; - - if (quality == null) { - return builder.buildQuery(wordId); - } - else { - return builder.buildQuery(quality, wordId); - } - } - - public IndexQueryFactory.IndexQueryBuilder findWordForDomainList(IndexBlock block, List domains, int wordId) { - var builder = queryBuilders.get(block); - - if (builder == null) - return null; - - return builder.buildQuery(domains, wordId); - } - - public IndexQuery findDomain(int wordId) { - return domainQueryFactory.buildQuery(wordId); - } - - @Override - public void close() throws Exception { - for (var idx : indices.values()) { - idx.close(); - } - } - - @SneakyThrows - public long numHits(IndexBlock block, int word) { - IndexQueryFactory builder = queryBuilders.get(block); - - if (builder == null) - return 0L; - - long hits = 0; - for (var index : builder.getIndicies()) { - hits += index.numUrls(word); - } - return hits; - } - - - public long[] getMetadata(IndexBlock block, int termId, long[] ids) { - final var index = indices.get(block); - if (null == index) { - return new long[ids.length]; - } - - return indices.get(block).rangeForWord(termId).getMetadata(ids); - } -} diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/reader/SearchIndexURLRange.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/reader/SearchIndexURLRange.java deleted file mode 100644 index 916aab9d..00000000 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/reader/SearchIndexURLRange.java +++ /dev/null @@ -1,100 +0,0 @@ -package nu.marginalia.wmsa.edge.index.reader; - -import it.unimi.dsi.fastutil.longs.LongLongImmutablePair; -import nu.marginalia.util.btree.BTreeQueryBuffer; -import nu.marginalia.util.btree.BTreeReader; -import nu.marginalia.util.multimap.MultimapFileLong; -import nu.marginalia.wmsa.edge.index.conversion.SearchIndexConverter; -import nu.marginalia.wmsa.edge.index.svc.query.types.EmptyEntrySource; -import nu.marginalia.wmsa.edge.index.svc.query.types.EntrySource; -import nu.marginalia.wmsa.edge.index.svc.query.types.EntrySourceFromBTree; -import nu.marginalia.wmsa.edge.index.svc.query.types.EntrySourceFromMapRange; - -import javax.annotation.Nullable; - -import static nu.marginalia.wmsa.edge.index.model.EdgePageWordFlags.*; - -public class SearchIndexURLRange { - public final long dataOffset; - private final MultimapFileLong urlsFile; - - @Nullable - private final BTreeReader reader; - - public SearchIndexURLRange(MultimapFileLong urlsFile, long dataOffset) { - this.dataOffset = dataOffset; - this.urlsFile = urlsFile; - - if (dataOffset >= 0) { - this.reader = new BTreeReader(urlsFile, SearchIndexConverter.urlsBTreeContext, dataOffset); - } else { - this.reader = null; - } - } - - public EntrySource asPrefixSource(long prefix, long prefixNext) { - if (reader == null) - return new EmptyEntrySource(); - - LongLongImmutablePair startAndEnd = reader.getRangeForPrefix(prefix, prefixNext); - - if (startAndEnd.firstLong() == startAndEnd.secondLong()) { - return new EmptyEntrySource(); - } - - return new EntrySourceFromMapRange(urlsFile, startAndEnd.firstLong(), startAndEnd.secondLong()); - } - - public EntrySource asEntrySource() { - return new EntrySourceFromBTree(reader, EntrySourceFromBTree.NO_MASKING, null); - } - public EntrySource asQualityLimitingEntrySource(int 
limit) { - return new EntrySourceFromBTree(reader, EntrySourceFromBTree.NO_MASKING, limit); - } - public EntrySource asDomainEntrySource() { - return new EntrySourceFromBTree(reader, Subjects.asBit() | Site.asBit() | Title.asBit(), null); - } - - public boolean isPresent() { - return dataOffset >= 0; - } - - public long numEntries() { - if (reader == null) - return 0L; - - return reader.numEntries(); - } - - public void retainUrls(BTreeQueryBuffer buffer) { - if (reader != null) - reader.retainEntries(buffer); - } - - public void rejectUrls(BTreeQueryBuffer buffer) { - if (reader != null) - reader.rejectEntries(buffer); - } - - public boolean hasUrl(long url) { - if (reader == null) - return false; - - return reader.findEntry(url) >= 0; - } - - - public long[] getMetadata(long[] urls) { - if (reader == null) { - return new long[urls.length]; - } - - return reader.queryData(urls, 1); - } - - @Override - public String toString() { - return String.format("BTreeRange(@" + dataOffset + ", size = " + numEntries() + ")"); - } - -} diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/reader/SearchIndexes.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/reader/SearchIndexes.java deleted file mode 100644 index 828714c7..00000000 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/reader/SearchIndexes.java +++ /dev/null @@ -1,154 +0,0 @@ -package nu.marginalia.wmsa.edge.index.reader; - -import com.google.inject.Inject; -import com.google.inject.Singleton; -import nu.marginalia.wmsa.configuration.server.Initialization; -import nu.marginalia.wmsa.edge.index.EdgeIndexBucket; -import nu.marginalia.wmsa.edge.index.IndexServicesFactory; -import nu.marginalia.wmsa.edge.index.conversion.SearchIndexPartitioner; -import nu.marginalia.wmsa.edge.index.journal.SearchIndexJournalWriterImpl; -import nu.marginalia.wmsa.edge.index.lexicon.KeywordLexiconReadOnlyView; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -import javax.annotation.Nullable; -import java.util.concurrent.locks.ReentrantLock; - -import static nu.marginalia.wmsa.edge.index.EdgeIndexService.DYNAMIC_BUCKET_LENGTH; - -@Singleton -public class SearchIndexes { - private final Logger logger = LoggerFactory.getLogger(getClass()); - - private final EdgeIndexBucket[] buckets - = new EdgeIndexBucket[DYNAMIC_BUCKET_LENGTH + 1]; - private final IndexServicesFactory servicesFactory; - private final SearchIndexPartitioner partitioner; - - private final ReentrantLock opsLock = new ReentrantLock(false); - - private final SearchIndexJournalWriterImpl primaryIndexWriter; - private final SearchIndexJournalWriterImpl secondaryIndexWriter; - private KeywordLexiconReadOnlyView keywordLexiconReadOnlyView = null; - - @Inject - public SearchIndexes(IndexServicesFactory servicesFactory, SearchIndexPartitioner partitioner) { - this.servicesFactory = servicesFactory; - this.partitioner = partitioner; - - this.primaryIndexWriter = servicesFactory.getIndexWriter(0); - this.secondaryIndexWriter = servicesFactory.getIndexWriter(1); - - for (int i = 0; i < buckets.length; i++) { - buckets[i] = servicesFactory.createIndexBucket(i); - } - } - - public boolean repartition() { - - if (!opsLock.tryLock()) { - return false; - } - try { - partitioner.reloadPartitions(); - } - finally { - opsLock.unlock(); - } - - return true; - } - - public boolean preconvert() { - - if (!opsLock.tryLock()) { - return false; - } - try { - buckets[0].preconvert(); - } - finally { - opsLock.unlock(); - } - - return true; - } - - public boolean 
reindex(int id) { - - if (!opsLock.tryLock()) { - return false; - } - try { - buckets[id].switchIndex(); - } - finally { - opsLock.unlock(); - } - - return true; - } - - public boolean reindexAll() { - if (!opsLock.tryLock()) { - return false; - } - try { - for (var bucket : buckets) { - bucket.switchIndex(); - } - } finally { - opsLock.unlock(); - } - - return true; - } - - @Nullable - public KeywordLexiconReadOnlyView getLexiconReader() { - return keywordLexiconReadOnlyView; - } - - - public boolean isBusy() { - return partitioner.isBusy(); - } - - public void initialize(Initialization init) { - - logger.info("Waiting for init"); - init.waitReady(); - - opsLock.lock(); - try { - logger.info("Initializing buckets"); - for (EdgeIndexBucket bucket : buckets) { - bucket.init(); - } - - logger.info("Initializing dictionary reader"); - keywordLexiconReadOnlyView = servicesFactory.getDictionaryReader(); - } - finally { - opsLock.unlock(); - } - } - - public SearchIndexJournalWriterImpl getIndexWriter(int idx) { - if (idx == 0) { - return primaryIndexWriter; - } - else { - return secondaryIndexWriter; - } - } - - public EdgeIndexBucket getBucket(int bucketId) { - return buckets[bucketId]; - } - - public boolean isValidBucket(int bucketId) { - return bucketId >= 0 && bucketId < buckets.length; - } - -} diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/svc/EdgeIndexDomainQueryService.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/svc/EdgeIndexDomainQueryService.java index b76b65a6..a51352c1 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/svc/EdgeIndexDomainQueryService.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/svc/EdgeIndexDomainQueryService.java @@ -4,12 +4,12 @@ import com.google.gson.Gson; import com.google.inject.Inject; import com.google.inject.Singleton; import io.prometheus.client.Histogram; -import nu.marginalia.util.btree.BTreeQueryBuffer; +import nu.marginalia.util.array.buffer.LongQueryBuffer; import nu.marginalia.util.dict.DictionaryHashMap; import nu.marginalia.wmsa.client.GsonFactory; -import nu.marginalia.wmsa.edge.index.reader.SearchIndexes; -import nu.marginalia.wmsa.edge.index.svc.query.IndexSearchBudget; -import nu.marginalia.wmsa.edge.index.svc.query.ResultDomainDeduplicator; +import nu.marginalia.wmsa.edge.index.postings.SearchIndexControl; +import nu.marginalia.wmsa.edge.index.query.IndexResultDomainDeduplicator; +import nu.marginalia.wmsa.edge.index.query.IndexSearchBudget; import nu.marginalia.wmsa.edge.model.EdgeUrl; import nu.marginalia.wmsa.edge.model.id.EdgeIdList; import nu.marginalia.wmsa.edge.model.search.domain.EdgeDomainSearchResults; @@ -24,7 +24,6 @@ import spark.Spark; import java.util.OptionalInt; -import static nu.marginalia.wmsa.edge.index.EdgeIndexService.DYNAMIC_BUCKET_LENGTH; import static spark.Spark.halt; @Singleton @@ -36,10 +35,10 @@ public class EdgeIndexDomainQueryService { private final Gson gson = GsonFactory.get(); - private final SearchIndexes indexes; + private final SearchIndexControl indexes; @Inject - public EdgeIndexDomainQueryService(SearchIndexes indexes) { + public EdgeIndexDomainQueryService(SearchIndexControl indexes) { this.indexes = indexes; } @@ -53,7 +52,9 @@ public class EdgeIndexDomainQueryService { EdgeDomainSearchSpecification specsSet = gson.fromJson(json, EdgeDomainSearchSpecification.class); try { - return wmsa_edge_index_domain_query_time.time(() -> queryDomain(specsSet)); + return new EdgeDomainSearchResults("", new EdgeIdList<>()); + 
// fixme + // return wmsa_edge_index_domain_query_time.time(() -> queryDomain(specsSet)); } catch (HaltException ex) { logger.warn("Halt", ex); @@ -78,21 +79,19 @@ public class EdgeIndexDomainQueryService { return new EdgeDomainSearchResults(specsSet.keyword, urlIds); } - BTreeQueryBuffer buffer = new BTreeQueryBuffer(512); + LongQueryBuffer buffer = new LongQueryBuffer(512); - for (int bucket = 0; budget.hasTimeLeft() && bucket < DYNAMIC_BUCKET_LENGTH+1; bucket++) { - final ResultDomainDeduplicator localFilter = new ResultDomainDeduplicator(1); - var query = indexes.getBucket(bucket).getDomainQuery(wordId.getAsInt(), localFilter); + final IndexResultDomainDeduplicator localFilter = new IndexResultDomainDeduplicator(1); + var query = indexes.getIndex().getDomainQuery(wordId.getAsInt(), localFilter); - while (query.hasMore() && urlIds.size() < specsSet.maxResults) { - query.getMoreResults(buffer); + while (query.hasMore() && urlIds.size() < specsSet.maxResults) { + query.getMoreResults(buffer); - for (int i = 0; i < buffer.end && urlIds.size() < specsSet.maxResults; i++) { - long result = buffer.data[i]; - if (localFilter.test(result)) { - urlIds.add((int) (result & 0xFFFF_FFFFL)); - } + for (int i = 0; i < buffer.end && urlIds.size() < specsSet.maxResults; i++) { + long result = buffer.data[i]; + if (localFilter.test(result)) { + urlIds.add((int) (result & 0xFFFF_FFFFL)); } } } diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/svc/EdgeIndexLexiconService.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/svc/EdgeIndexLexiconService.java index 520b559f..7aa33038 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/svc/EdgeIndexLexiconService.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/svc/EdgeIndexLexiconService.java @@ -3,16 +3,16 @@ package nu.marginalia.wmsa.edge.index.svc; import com.google.inject.Inject; import com.google.inject.Singleton; import com.google.protobuf.InvalidProtocolBufferException; -import nu.marginalia.util.ListChunker; import nu.marginalia.util.dict.DictionaryHashMap; import nu.marginalia.wmsa.edge.converting.interpreter.instruction.DocumentKeywords; +import nu.marginalia.wmsa.edge.converting.interpreter.instruction.KeywordListChunker; import nu.marginalia.wmsa.edge.index.IndexServicesFactory; -import nu.marginalia.wmsa.edge.index.journal.SearchIndexJournalWriterImpl; -import nu.marginalia.wmsa.edge.index.journal.model.SearchIndexJournalEntry; -import nu.marginalia.wmsa.edge.index.journal.model.SearchIndexJournalEntryHeader; import nu.marginalia.wmsa.edge.index.lexicon.KeywordLexicon; -import nu.marginalia.wmsa.edge.index.model.IndexBlock; -import nu.marginalia.wmsa.edge.index.reader.SearchIndexes; +import nu.marginalia.wmsa.edge.index.model.EdgePageDocumentsMetadata; +import nu.marginalia.wmsa.edge.index.postings.SearchIndexControl; +import nu.marginalia.wmsa.edge.index.postings.journal.model.SearchIndexJournalEntry; +import nu.marginalia.wmsa.edge.index.postings.journal.model.SearchIndexJournalEntryHeader; +import nu.marginalia.wmsa.edge.index.postings.journal.writer.SearchIndexJournalWriterImpl; import nu.marginalia.wmsa.edge.model.EdgeDomain; import nu.marginalia.wmsa.edge.model.EdgeUrl; import nu.marginalia.wmsa.edge.model.id.EdgeId; @@ -26,16 +26,16 @@ import java.util.Arrays; @Singleton public class EdgeIndexLexiconService { - private final SearchIndexes indexes; + private final SearchIndexControl indexes; private final KeywordLexicon keywordLexicon; @Inject - public 
EdgeIndexLexiconService(SearchIndexes indexes, IndexServicesFactory servicesFactory) { + public EdgeIndexLexiconService(SearchIndexControl indexes, IndexServicesFactory servicesFactory) { this.indexes = indexes; this.keywordLexicon = servicesFactory.getKeywordLexicon(); } - public EdgeIndexLexiconService(SearchIndexes indexes, KeywordLexicon lexicon) { + public EdgeIndexLexiconService(SearchIndexControl indexes, KeywordLexicon lexicon) { this.indexes = indexes; this.keywordLexicon = lexicon; } @@ -59,6 +59,9 @@ public class EdgeIndexLexiconService { return wordId; } + public long getOrInsertWord(String word) { + return keywordLexicon.getOrInsert(word); + } public Object putWords(Request request, Response response) throws InvalidProtocolBufferException { var req = IndexPutKeywordsReq.parseFrom(request.bodyAsBytes()); @@ -75,20 +78,25 @@ public class EdgeIndexLexiconService { return ""; } + public void putWords(int idx, SearchIndexJournalEntryHeader header, SearchIndexJournalEntry entry) { + SearchIndexJournalWriterImpl indexWriter = indexes.getIndexWriter(idx); + + indexWriter.put(header, entry); + } + public void putWords(EdgeId domainId, EdgeId urlId, IndexPutKeywordsReq.WordSet words, int idx ) { SearchIndexJournalWriterImpl indexWriter = indexes.getIndexWriter(idx); - IndexBlock block = IndexBlock.values()[words.getIndex()]; var wordArray = words.getWordsList().toArray(String[]::new); var metaArray = words.getMetaList().stream().mapToLong(Long::valueOf).toArray(); - DocumentKeywords documentKeywords = new DocumentKeywords(block, wordArray, metaArray); - for (var chunk : ListChunker.chopList(documentKeywords, SearchIndexJournalEntry.MAX_LENGTH)) { + DocumentKeywords documentKeywords = new DocumentKeywords(wordArray, metaArray); + for (var chunk : KeywordListChunker.chopList(documentKeywords, SearchIndexJournalEntry.MAX_LENGTH)) { var entry = new SearchIndexJournalEntry(getOrInsertWordIds(chunk.keywords(), chunk.metadata())); - var header = new SearchIndexJournalEntryHeader(domainId, urlId, block); + var header = new SearchIndexJournalEntryHeader(domainId, urlId, EdgePageDocumentsMetadata.defaultValue()); indexWriter.put(header, entry); } diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/svc/EdgeIndexOpsService.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/svc/EdgeIndexOpsService.java index 668890cb..bf1fd459 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/svc/EdgeIndexOpsService.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/svc/EdgeIndexOpsService.java @@ -2,7 +2,7 @@ package nu.marginalia.wmsa.edge.index.svc; import com.google.inject.Inject; import com.google.inject.Singleton; -import nu.marginalia.wmsa.edge.index.reader.SearchIndexes; +import nu.marginalia.wmsa.edge.index.postings.SearchIndexControl; import spark.Request; import spark.Response; import spark.Spark; @@ -10,32 +10,30 @@ import spark.Spark; @Singleton public class EdgeIndexOpsService { - private final SearchIndexes indexes; + private final SearchIndexControl indexes; + private final EdgeOpsLockService opsLockService; + private final EdgeIndexSearchSetsService searchSetService; @Inject - public EdgeIndexOpsService(SearchIndexes indexes) { + public EdgeIndexOpsService(SearchIndexControl indexes, + EdgeOpsLockService opsLockService, + EdgeIndexSearchSetsService searchSetService) { this.indexes = indexes; + this.opsLockService = opsLockService; + this.searchSetService = searchSetService; } - public Object repartitionEndpoint(Request request, 
Response response) { + public Object repartitionEndpoint(Request request, Response response) throws Exception { - if (!indexes.repartition()) { + if (!opsLockService.run(searchSetService::recalculateAll)) { Spark.halt(503, "Operations busy"); } return "OK"; } - public Object preconvertEndpoint(Request request, Response response) { - if (!indexes.preconvert()) { - Spark.halt(503, "Operations busy"); - } - return "OK"; - } + public Object reindexEndpoint(Request request, Response response) throws Exception { - public Object reindexEndpoint(Request request, Response response) { - int id = Integer.parseInt(request.params("id")); - - if (!indexes.reindex(id)) { + if (!indexes.reindex()) { Spark.halt(503, "Operations busy"); } return "OK"; diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/svc/EdgeIndexQueryService.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/svc/EdgeIndexQueryService.java index 34bcc93a..0b8c08f4 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/svc/EdgeIndexQueryService.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/svc/EdgeIndexQueryService.java @@ -3,27 +3,30 @@ package nu.marginalia.wmsa.edge.index.svc; import com.google.gson.Gson; import com.google.inject.Inject; import com.google.inject.Singleton; -import gnu.trove.set.hash.TIntHashSet; +import gnu.trove.list.TLongList; +import gnu.trove.list.array.TLongArrayList; +import gnu.trove.set.hash.TLongHashSet; import io.prometheus.client.Counter; import io.prometheus.client.Gauge; import io.prometheus.client.Histogram; import it.unimi.dsi.fastutil.ints.IntArrayList; -import it.unimi.dsi.fastutil.ints.IntComparator; import it.unimi.dsi.fastutil.ints.IntList; -import it.unimi.dsi.fastutil.ints.IntOpenHashSet; -import it.unimi.dsi.fastutil.longs.LongAVLTreeSet; -import nu.marginalia.util.btree.BTreeQueryBuffer; +import nu.marginalia.util.array.buffer.LongQueryBuffer; import nu.marginalia.util.dict.DictionaryHashMap; import nu.marginalia.wmsa.client.GsonFactory; -import nu.marginalia.wmsa.edge.index.EdgeIndexBucket; -import nu.marginalia.wmsa.edge.index.model.EdgePageWordMetadata; -import nu.marginalia.wmsa.edge.index.model.IndexBlock; -import nu.marginalia.wmsa.edge.index.reader.SearchIndexes; -import nu.marginalia.wmsa.edge.index.svc.query.IndexQuery; -import nu.marginalia.wmsa.edge.index.svc.query.IndexQueryParams; -import nu.marginalia.wmsa.edge.index.svc.query.IndexSearchBudget; -import nu.marginalia.wmsa.edge.index.svc.query.ResultDomainDeduplicator; -import nu.marginalia.wmsa.edge.model.search.*; +import nu.marginalia.wmsa.edge.index.postings.EdgeIndexQuerySearchTerms; +import nu.marginalia.wmsa.edge.index.postings.IndexResultValuator; +import nu.marginalia.wmsa.edge.index.postings.SearchIndexControl; +import nu.marginalia.wmsa.edge.index.query.IndexQuery; +import nu.marginalia.wmsa.edge.index.query.IndexQueryParams; +import nu.marginalia.wmsa.edge.index.query.IndexResultDomainDeduplicator; +import nu.marginalia.wmsa.edge.index.query.IndexSearchBudget; +import nu.marginalia.wmsa.edge.index.svc.searchset.SearchSet; +import nu.marginalia.wmsa.edge.index.svc.searchset.SmallSearchSet; +import nu.marginalia.wmsa.edge.model.search.EdgeSearchResultItem; +import nu.marginalia.wmsa.edge.model.search.EdgeSearchResultSet; +import nu.marginalia.wmsa.edge.model.search.EdgeSearchSpecification; +import nu.marginalia.wmsa.edge.model.search.EdgeSearchSubquery; import org.apache.http.HttpStatus; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -32,11 
+35,12 @@ import spark.Request; import spark.Response; import spark.Spark; -import java.util.*; +import java.util.ArrayList; +import java.util.List; +import java.util.OptionalInt; import java.util.function.LongPredicate; -import java.util.stream.Collectors; -import static java.util.Comparator.comparing; +import static java.util.Comparator.comparingDouble; import static spark.Spark.halt; @Singleton @@ -44,20 +48,19 @@ public class EdgeIndexQueryService { private final Logger logger = LoggerFactory.getLogger(getClass()); - private static final int QUERY_FIRST_PASS_DOMAIN_LIMIT = 64; - private static final Counter wmsa_edge_index_query_timeouts = Counter.build().name("wmsa_edge_index_query_timeouts").help("-").register(); - private static final Gauge wmsa_edge_index_query_cost = Gauge.build().name("wmsa_edge_index_query_cost").help("-").register(); private static final Histogram wmsa_edge_index_query_time = Histogram.build().name("wmsa_edge_index_query_time").linearBuckets(25/1000., 25/1000., 15).help("-").register(); private final Gson gson = GsonFactory.get(); - private final SearchIndexes indexes; + private final SearchIndexControl indexes; + private final EdgeIndexSearchSetsService searchSetsService; @Inject - public EdgeIndexQueryService(SearchIndexes indexes) { + public EdgeIndexQueryService(SearchIndexControl indexes, EdgeIndexSearchSetsService searchSetsService) { this.indexes = indexes; + this.searchSetsService = searchSetsService; } public Object search(Request request, Response response) { @@ -69,7 +72,6 @@ public class EdgeIndexQueryService { String json = request.body(); EdgeSearchSpecification specsSet = gson.fromJson(json, EdgeSearchSpecification.class); - try { return wmsa_edge_index_query_time.time(() -> query(specsSet)); } @@ -102,258 +104,169 @@ public class EdgeIndexQueryService { private class SearchQuery { private final int fetchSize; - private final TIntHashSet seenResults; - private final EdgeSearchSpecification specsSet; private final IndexSearchBudget budget; - private final Integer qualityLimit; - private final Integer rankLimit; + private final List subqueries; private long dataCost = 0; + private final IndexQueryParams queryParams; + + private final int limitByDomain; + private final int limitTotal; + + TLongHashSet consideredUrlIds; public SearchQuery(EdgeSearchSpecification specsSet) { - this.specsSet = specsSet; - this.budget = new IndexSearchBudget(specsSet.timeoutMs); this.fetchSize = specsSet.fetchSize; - this.seenResults = new TIntHashSet(fetchSize, 0.5f); - this.qualityLimit = specsSet.quality; - this.rankLimit = specsSet.rank; + this.budget = new IndexSearchBudget(specsSet.timeoutMs); + this.subqueries = specsSet.subqueries; + this.limitByDomain = specsSet.limitByDomain; + this.limitTotal = specsSet.limitTotal; + + this.consideredUrlIds = new TLongHashSet(fetchSize * 4); + + queryParams = new IndexQueryParams( + specsSet.quality, + specsSet.year, + specsSet.size, + getSearchSet(specsSet), + specsSet.queryStrategy); } private List execute() { - final Set results = new HashSet<>(fetchSize); + final TLongList results = new TLongArrayList(fetchSize); - for (var sq : specsSet.subqueries) { - results.addAll(performSearch(sq)); - } + for (var sq : subqueries) { + final EdgeIndexQuerySearchTerms searchTerms = getSearchTerms(sq); - final SearchTermEvaluator evaluator = new SearchTermEvaluator(specsSet, results); - for (var result : results) { - evaluator.addResultScores(result); - } + if (searchTerms.isEmpty()) { + continue; + } - return createResultList(results); 
- } - - private List createResultList(Set results) { - - var domainCountFilter = new ResultDomainDeduplicator(specsSet.limitByDomain); - - List resultList = results.stream() - .sorted( - comparing(EdgeSearchResultItem::getScore) - .thenComparing(EdgeSearchResultItem::getRanking) - .thenComparing(EdgeSearchResultItem::getUrlIdInt) - ) - .filter(domainCountFilter::test) - .collect(Collectors.toList()); - - if (resultList.size() > specsSet.getLimitTotal()) { - // This can't be made a stream limit() operation because we need domainCountFilter - // to run over the entire list to provide accurate statistics - - resultList.subList(specsSet.getLimitTotal(), resultList.size()).clear(); - } - - for (var result : resultList) { - result.resultsFromDomain = domainCountFilter.getCount(result); - } - - return resultList; - } - - - private List performSearch(EdgeSearchSubquery sq) - { - - final List results = new ArrayList<>(fetchSize); - final SearchTerms searchTerms = getSearchTerms(sq); - - if (searchTerms.isEmpty()) { - return Collections.emptyList(); - } - - final BTreeQueryBuffer buffer = new BTreeQueryBuffer(fetchSize); - - for (int indexBucket : specsSet.buckets) { - final ResultDomainDeduplicator localFilter = new ResultDomainDeduplicator(QUERY_FIRST_PASS_DOMAIN_LIMIT); + TLongArrayList resultsForSubquery = performSearch(searchTerms); + results.addAll(resultsForSubquery); if (!budget.hasTimeLeft()) { - logger.info("Query timed out, omitting {}:{} for query {}, ({}), -{}", - indexBucket, sq.block, sq.searchTermsInclude, sq.searchTermsAdvice, sq.searchTermsExclude); - continue; - - } - - if (results.size() >= fetchSize) { + logger.info("Query timed out {}, ({}), -{}", + sq.searchTermsInclude, sq.searchTermsAdvice, sq.searchTermsExclude); break; } + } - IndexQueryParams queryParams = new IndexQueryParams(sq.block, searchTerms, qualityLimit, rankLimit, specsSet.domains); + final var evaluator = new IndexResultValuator(indexes, results, subqueries); - IndexQuery query = getQuery(indexBucket, localFilter::filterRawValue, queryParams); + ArrayList items = new ArrayList<>(results.size()); + ArrayList refusedItems = new ArrayList<>(results.size()); - while (query.hasMore() && results.size() < fetchSize && budget.hasTimeLeft()) { - buffer.reset(); - query.getMoreResults(buffer); + // Sorting the result ids results in better paging characteristics + results.sort(); - for (int i = 0; i < buffer.size() && results.size() < fetchSize; i++) { - final long id = buffer.data[i]; + results.forEach(id -> { + var item = evaluator.evaluateResult(id); - if (!seenResults.add((int)(id & 0xFFFF_FFFFL)) || !localFilter.test(id)) { - continue; - } - - results.add(new EdgeSearchResultItem(indexBucket, sq.block, id)); - } + // Score value is zero when the best query variant consists of low-value terms that are just scattered + // throughout the document, with no indicators of importance associated with them. 
+ if (item.getScoreValue() < 0) { + items.add(item); + } + else { + refusedItems.add(item); } - dataCost += query.dataCost(); + return true; + }); + if (items.isEmpty()) { + items.addAll(refusedItems); } + return selectResults(items); + } + + + private TLongArrayList performSearch(EdgeIndexQuerySearchTerms terms) + { + final TLongArrayList results = new TLongArrayList(fetchSize); + final LongQueryBuffer buffer = new LongQueryBuffer(fetchSize); + + + IndexQuery query = getQuery(terms, queryParams, consideredUrlIds::add); + + while (query.hasMore() && results.size() < fetchSize && budget.hasTimeLeft()) { + buffer.reset(); + query.getMoreResults(buffer); + + for (int i = 0; i < buffer.size() && results.size() < fetchSize; i++) { + results.add(buffer.data[i]); + } + } + + dataCost += query.dataCost(); + return results; } - private IndexQuery getQuery(int bucket, LongPredicate filter, IndexQueryParams params) { + private SearchSet getSearchSet(EdgeSearchSpecification specsSet) { - if (!indexes.isValidBucket(bucket)) { - logger.warn("Invalid bucket {}", bucket); - return new IndexQuery(Collections.emptyList()); + if (specsSet.domains != null && !specsSet.domains.isEmpty()) { + return new SmallSearchSet(specsSet.domains); } - return indexes.getBucket(bucket).getQuery(filter, params); + return searchSetsService.getSearchSetByName(specsSet.searchSetIdentifier); + } + + private List selectResults(List results) { + + var domainCountFilter = new IndexResultDomainDeduplicator(limitByDomain); + + results.sort(comparingDouble(EdgeSearchResultItem::getScore) + .thenComparingInt(EdgeSearchResultItem::getRanking) + .thenComparingInt(EdgeSearchResultItem::getUrlIdInt)); + + List resultsList = new ArrayList<>(results.size()); + + for (var item : results) { + if (domainCountFilter.test(item)) { + resultsList.add(item); + } + } + + if (resultsList.size() > limitTotal) { + // This can't be made a stream limit() operation because we need domainCountFilter + // to run over the entire list to provide accurate statistics + + resultsList.subList(limitTotal, resultsList.size()).clear(); + } + + for (var result : resultsList) { + result.resultsFromDomain = domainCountFilter.getCount(result); + } + + return resultsList; + } + + private IndexQuery getQuery(EdgeIndexQuerySearchTerms terms, IndexQueryParams params, LongPredicate includePred) { + return indexes.getIndex().getQuery(terms, params, includePred); } public boolean hasTimeLeft() { return budget.hasTimeLeft(); } - private record IndexAndBucket(IndexBlock block, int bucket) {} - public long getDataCost() { return dataCost; } - record ResultTerm (int bucket, int termId, long combinedUrlId) {} } - public class SearchTermEvaluator { - private static final EdgePageWordMetadata blankMetadata = new EdgePageWordMetadata(EdgePageWordMetadata.emptyValue()); - - private final Map termData = new HashMap<>(16); - - private final List> searchTermVariants; - - public SearchTermEvaluator(EdgeSearchSpecification specsSet, Set results) { - this.searchTermVariants = specsSet.subqueries.stream().map(sq -> sq.searchTermsInclude).distinct().toList(); - - final int[] termIdsAll = getIncludeTermIds(specsSet); - - Map resultIdsByBucket = new HashMap<>(7); - - for (int termId : termIdsAll) { - - for (var result: results) { - resultIdsByBucket - .computeIfAbsent(new SearchQuery.IndexAndBucket(result.block, result.bucketId), - id -> new LongAVLTreeSet()) - .add(result.combinedId); - } - - resultIdsByBucket.forEach((indexAndBucket, resultIds) -> - loadMetadata(termId, indexAndBucket.bucket, 
indexAndBucket.block, resultIds)); - - resultIdsByBucket.clear(); - } - } - - private int[] getIncludeTermIds(EdgeSearchSpecification specsSet) { - - final var reader = Objects.requireNonNull(indexes.getLexiconReader()); - - final List terms = specsSet.allIncludeSearchTerms(); - final IntList ret = new IntArrayList(terms.size()); - - for (var term : terms) { - int id = reader.get(term); - - if (id >= 0) - ret.add(id); - } - - return ret.toIntArray(); - } - - private void loadMetadata(int termId, int bucket, IndexBlock indexBlock, - LongAVLTreeSet docIdsMissingMetadata) - { - EdgeIndexBucket index = indexes.getBucket(bucket); - - if (docIdsMissingMetadata.isEmpty()) - return; - - - long[] ids = docIdsMissingMetadata.toLongArray(); - long[] metadata = index.getMetadata(indexBlock, termId, ids); - - for (int i = 0; i < metadata.length; i++) { - if (metadata[i] == 0L) - continue; - - termData.put( - new SearchQuery.ResultTerm(bucket, termId, ids[i]), - new EdgePageWordMetadata(metadata[i]) - ); - - docIdsMissingMetadata.remove(ids[i]); - } - } - - public void addResultScores(EdgeSearchResultItem searchResult) { - final var reader = Objects.requireNonNull(indexes.getLexiconReader()); - - double bestScore = 0; - - for (int searchTermListIdx = 0; searchTermListIdx < searchTermVariants.size(); searchTermListIdx++) { - double setScore = 0; - int setSize = 0; - var termList = searchTermVariants.get(searchTermListIdx); - - for (int termIdx = 0; termIdx < termList.size(); termIdx++) { - String searchTerm = termList.get(termIdx); - - final int termId = reader.get(searchTerm); - - var key = new SearchQuery.ResultTerm(searchResult.bucketId, termId, searchResult.getCombinedId()); - var metadata = termData.getOrDefault(key, blankMetadata); - - EdgeSearchResultKeywordScore score = new EdgeSearchResultKeywordScore(searchTermListIdx, searchTerm, metadata); - - searchResult.scores.add(score); - setScore += score.termValue(); - if (termIdx == 0) { - setScore += score.documentValue(); - } - - setSize++; - } - bestScore = Math.min(bestScore, setScore/setSize); - } - - searchResult.setScore(bestScore); - } - - - } - - private SearchTerms getSearchTerms(EdgeSearchSubquery request) { + private EdgeIndexQuerySearchTerms getSearchTerms(EdgeSearchSubquery request) { final IntList excludes = new IntArrayList(); final IntList includes = new IntArrayList(); + final IntList priority = new IntArrayList(); for (var include : request.searchTermsInclude) { var word = lookUpWord(include); if (word.isEmpty()) { logger.debug("Unknown search term: " + include); - return new SearchTerms(); + return new EdgeIndexQuerySearchTerms(); } includes.add(word.getAsInt()); } @@ -362,7 +275,7 @@ public class EdgeIndexQueryService { var word = lookUpWord(advice); if (word.isEmpty()) { logger.debug("Unknown search term: " + advice); - return new SearchTerms(); + return new EdgeIndexQuerySearchTerms(); } includes.add(word.getAsInt()); } @@ -370,27 +283,11 @@ public class EdgeIndexQueryService { for (var exclude : request.searchTermsExclude) { lookUpWord(exclude).ifPresent(excludes::add); } - - return new SearchTerms(includes, excludes); - } - - public record SearchTerms(IntList includes, IntList excludes) { - public SearchTerms() { - this(IntList.of(), IntList.of()); + for (var exclude : request.searchTermsPriority) { + lookUpWord(exclude).ifPresent(priority::add); } - public boolean isEmpty() { - return includes.isEmpty(); - } - - public int[] sortedDistinctIncludes(IntComparator comparator) { - if (includes.isEmpty()) - return 
includes.toIntArray(); - - IntList list = new IntArrayList(new IntOpenHashSet(includes)); - list.sort(comparator); - return list.toIntArray(); - } + return new EdgeIndexQuerySearchTerms(includes, excludes, priority); } diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/conversion/SearchIndexDao.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/svc/EdgeIndexSearchSetsService.java similarity index 54% rename from marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/conversion/SearchIndexDao.java rename to marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/svc/EdgeIndexSearchSetsService.java index ce2c30d2..a09047eb 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/conversion/SearchIndexDao.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/svc/EdgeIndexSearchSetsService.java @@ -1,41 +1,64 @@ -package nu.marginalia.wmsa.edge.index.conversion; +package nu.marginalia.wmsa.edge.index.svc; import com.google.inject.Inject; import com.google.inject.Singleton; import com.zaxxer.hikari.HikariDataSource; import gnu.trove.list.TIntList; import gnu.trove.list.array.TIntArrayList; -import gnu.trove.set.hash.TIntHashSet; import lombok.SneakyThrows; import nu.marginalia.util.ranking.BetterReversePageRank; import nu.marginalia.util.ranking.BetterStandardPageRank; import nu.marginalia.util.ranking.RankingDomainFetcher; +import nu.marginalia.wmsa.edge.index.IndexServicesFactory; import nu.marginalia.wmsa.edge.index.model.RankingSettings; +import nu.marginalia.wmsa.edge.index.svc.searchset.RankingSearchSet; +import nu.marginalia.wmsa.edge.index.svc.searchset.SearchSet; +import nu.marginalia.wmsa.edge.index.svc.searchset.SearchSetAny; +import nu.marginalia.wmsa.edge.index.svc.searchset.SearchSetIdentifier; +import org.roaringbitmap.RoaringBitmap; import org.slf4j.Logger; import org.slf4j.LoggerFactory; +import java.io.IOException; + @Singleton -public class SearchIndexDao { +public class EdgeIndexSearchSetsService { private final HikariDataSource dataSource; private RankingDomainFetcher rankingDomains; private final RankingSettings rankingSettings; private final Logger logger = LoggerFactory.getLogger(getClass()); + private final SearchSet anySet = new SearchSetAny(); + private volatile RankingSearchSet retroSet; + private volatile RankingSearchSet smallWebSet; + private volatile RankingSearchSet academiaSet; + @Inject - public SearchIndexDao(HikariDataSource dataSource, - RankingDomainFetcher rankingDomains, - RankingSettings rankingSettings) - { + public EdgeIndexSearchSetsService(HikariDataSource dataSource, + RankingDomainFetcher rankingDomains, + RankingSettings rankingSettings, + IndexServicesFactory servicesFactory) throws IOException { this.dataSource = dataSource; this.rankingDomains = rankingDomains; this.rankingSettings = rankingSettings; + + smallWebSet = new RankingSearchSet(SearchSetIdentifier.SMALLWEB, servicesFactory.getSearchSetsBase().resolve("small-web.dat")); + academiaSet = new RankingSearchSet(SearchSetIdentifier.ACADEMIA, servicesFactory.getSearchSetsBase().resolve("academia.dat")); + retroSet = new RankingSearchSet(SearchSetIdentifier.RETRO, servicesFactory.getSearchSetsBase().resolve("retro.dat")); + logger.info("SearchIndexDao ranking settings = {}", rankingSettings); } + public void recalculateAll() { + updateAcademiaDomains(); + updateRetroDomains(); + updateSmallWebDomains(); + } + @SneakyThrows - public TIntHashSet goodUrls() { - TIntHashSet domains = new TIntHashSet(10_000_000, 0.5f, -1); - 
TIntHashSet urls = new TIntHashSet(100_000_000, 0.5f, -1); + public RoaringBitmap goodUrls() { + RoaringBitmap domains = new RoaringBitmap(); + RoaringBitmap urls = new RoaringBitmap(); try (var connection = dataSource.getConnection()) { try (var stmt = connection.prepareStatement("SELECT ID FROM EC_DOMAIN WHERE DOMAIN_ALIAS IS NULL AND IS_ALIVE")) { @@ -47,7 +70,6 @@ public class SearchIndexDao { } // For some reason, doing this "INNER JOIN" in Java is significantly faster than doing it in SQL - try (var stmt = connection.prepareStatement("SELECT ID,DOMAIN_ID FROM EC_URL WHERE VISITED AND EC_URL.STATE='OK'")) { stmt.setFetchSize(10_000); var rsp = stmt.executeQuery(); @@ -64,24 +86,37 @@ public class SearchIndexDao { } @SneakyThrows - public TIntList getRetroDomains() { + public void updateRetroDomains() { var spr = new BetterStandardPageRank(rankingDomains,rankingSettings.retro.toArray(String[]::new)); - return spr.pageRankWithPeripheralNodes(spr.size()/2); + var data = spr.pageRankWithPeripheralNodes(spr.size() / 2); + + synchronized (this) { + retroSet = new RankingSearchSet(SearchSetIdentifier.RETRO, retroSet.source, data); + retroSet.write(); + } } @SneakyThrows - public TIntList getSmallWebDomains() { + public void updateSmallWebDomains() { var rpr = new BetterReversePageRank(rankingDomains, rankingSettings.small.toArray(String[]::new)); - rpr.setMaxKnownUrls(750); + var data = rpr.pageRankWithPeripheralNodes(rpr.size()); - return rpr.pageRankWithPeripheralNodes(rpr.size()); + synchronized (this) { + smallWebSet = new RankingSearchSet(SearchSetIdentifier.SMALLWEB, smallWebSet.source, data); + smallWebSet.write(); + } } @SneakyThrows - public TIntList getAcademiaDomains() { + public void updateAcademiaDomains() { var spr = new BetterStandardPageRank(rankingDomains, rankingSettings.academia.toArray(String[]::new)); - return spr.pageRankWithPeripheralNodes(spr.size()/2); + var data = spr.pageRankWithPeripheralNodes(spr.size()/2); + + synchronized (this) { + academiaSet = new RankingSearchSet(SearchSetIdentifier.ACADEMIA, academiaSet.source, data); + academiaSet.write(); + } } @SneakyThrows @@ -120,4 +155,16 @@ public class SearchIndexDao { } return results; } + + public SearchSet getSearchSetByName(SearchSetIdentifier searchSetIdentifier) { + if (null == searchSetIdentifier) { + return anySet; + } + return switch (searchSetIdentifier) { + case NONE -> anySet; + case RETRO -> retroSet; + case ACADEMIA -> academiaSet; + case SMALLWEB -> smallWebSet; + }; + } } diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/svc/EdgeOpsLockService.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/svc/EdgeOpsLockService.java new file mode 100644 index 00000000..99dbd5fb --- /dev/null +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/svc/EdgeOpsLockService.java @@ -0,0 +1,42 @@ +package nu.marginalia.wmsa.edge.index.svc; + +import javax.annotation.CheckReturnValue; +import javax.inject.Singleton; +import java.util.Optional; +import java.util.concurrent.Callable; +import java.util.concurrent.locks.ReentrantLock; + +@Singleton +public class EdgeOpsLockService { + public ReentrantLock opsLock = new ReentrantLock(); + + @CheckReturnValue + public Optional run(Callable c) throws Exception { + if (!opsLock.tryLock()) + return Optional.empty(); + try { + return Optional.of(c.call()); + } + finally { + opsLock.unlock(); + } + } + + + @CheckReturnValue + public boolean run(Runnable r) { + if (!opsLock.tryLock()) + return false; + try { + r.run(); + return true; + } 
+ finally { + opsLock.unlock(); + } + } + + public boolean isLocked() { + return opsLock.isLocked(); + } +} diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/svc/query/IndexDomainQueryFactory.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/svc/query/IndexDomainQueryFactory.java deleted file mode 100644 index d96b710e..00000000 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/svc/query/IndexDomainQueryFactory.java +++ /dev/null @@ -1,33 +0,0 @@ -package nu.marginalia.wmsa.edge.index.svc.query; - -import nu.marginalia.wmsa.edge.index.reader.SearchIndex; -import nu.marginalia.wmsa.edge.index.svc.query.types.EntrySource; - -import java.util.ArrayList; -import java.util.Collections; -import java.util.List; - -public class IndexDomainQueryFactory { - SearchIndex baseIndex; - - public IndexDomainQueryFactory(SearchIndex sourceIndex) { - this.baseIndex = sourceIndex; - } - - public IndexQuery buildQuery(int firstWordId) { - if (baseIndex == null) { - return new IndexQuery(Collections.emptyList()); - } - - List sources = new ArrayList<>(1); - - var range = baseIndex.rangeForWord(firstWordId); - if (range.isPresent()) { - sources.add(range.asDomainEntrySource()); - } - - return new IndexQuery(sources); - } - -} - diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/svc/query/IndexQueryFactory.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/svc/query/IndexQueryFactory.java deleted file mode 100644 index f2707a99..00000000 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/svc/query/IndexQueryFactory.java +++ /dev/null @@ -1,134 +0,0 @@ -package nu.marginalia.wmsa.edge.index.svc.query; - -import nu.marginalia.wmsa.edge.index.reader.SearchIndex; -import nu.marginalia.wmsa.edge.index.svc.query.types.EmptyEntrySource; -import nu.marginalia.wmsa.edge.index.svc.query.types.EntrySource; -import nu.marginalia.wmsa.edge.index.svc.query.types.filter.QueryFilterBTreeRangeReject; -import nu.marginalia.wmsa.edge.index.svc.query.types.filter.QueryFilterBTreeRangeRetain; -import nu.marginalia.wmsa.edge.index.svc.query.types.filter.QueryFilterStepIf; - -import java.util.*; -import java.util.stream.Collectors; - -public class IndexQueryFactory { - private final List requiredIndices; - private final List excludeIndex; - - public Collection getIndicies() { - return requiredIndices; - } - - public IndexQueryFactory(List requiredIndices, List excludeIndex) { - this.requiredIndices = requiredIndices.stream().filter(Objects::nonNull).collect(Collectors.toList()); - this.excludeIndex = excludeIndex; - } - - public IndexQueryBuilder buildQuery(int firstWordId) { - List sources = new ArrayList<>(requiredIndices.size()); - - for (var ri : requiredIndices) { - var range = ri.rangeForWord(firstWordId); - if (range.isPresent()) { - sources.add(range.asEntrySource()); - } - } - - return new IndexQueryBuilder(new IndexQuery(sources)); - } - - public IndexQueryBuilder buildQuery(int quality, int wordId) { - List sources = new ArrayList<>(requiredIndices.size()); - - for (var ri : requiredIndices) { - var range = ri.rangeForWord(wordId); - if (range.isPresent()) { - sources.add(range.asQualityLimitingEntrySource(quality)); - } - } - - return new IndexQueryBuilder(new IndexQuery(sources)); - } - - public IndexQueryBuilder buildQuery(List domains, int wordId) { - List sources = new ArrayList<>(requiredIndices.size()); - - for (var ri : requiredIndices) { - var range = ri.rangeForWord(wordId); - - if (range.isPresent()) { - for (int dom 
: domains) {
-                    long prefix = (long) dom << 32L;
-                    long prefixNext = prefix + 0x0000_0001_0000_0000L;
-
-                    var source = range.asPrefixSource(prefix, prefixNext);
-                    if (source.hasMore()) {
-                        sources.add(source);
-                    }
-                }
-            }
-
-        }
-
-        if (sources.isEmpty()) {
-            sources.add(new EmptyEntrySource());
-        }
-
-        return new IndexQueryBuilder(new IndexQuery(sources));
-    }
-
-    public class IndexQueryBuilder {
-        private final IndexQuery query;
-
-        IndexQueryBuilder(IndexQuery query) {
-            this.query = query;
-        }
-
-        public IndexQueryBuilder also(int termId) {
-            List<QueryFilterStepIf> filters = new ArrayList<>(requiredIndices.size());
-
-            for (var ri : requiredIndices) {
-                var range = ri.rangeForWord(termId);
-
-                if (range.isPresent()) {
-                    filters.add(new QueryFilterBTreeRangeRetain(range));
-                }
-            }
-            if (filters.isEmpty()) {
-                filters.add(QueryFilterStepIf.noPass());
-            }
-
-
-            if (filters.size() > 1) {
-                filters.sort(Comparator.naturalOrder());
-                query.addInclusionFilter(QueryFilterStepIf.anyOf(filters));
-            }
-            else {
-                query.addInclusionFilter(filters.get(0));
-            }
-
-            return this;
-        }
-
-        public void addInclusionFilter(QueryFilterStepIf filter) {
-            query.addInclusionFilter(filter);
-        }
-
-        public IndexQueryBuilder not(int termId) {
-            for (var ri : excludeIndex) {
-                var range = ri.rangeForWord(termId);
-                if (range.isPresent()) {
-                    query.addInclusionFilter(new QueryFilterBTreeRangeReject(range));
-                }
-            }
-
-            return this;
-        }
-
-        public IndexQuery build() {
-            return query;
-        }
-
-    }
-
-}
-
diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/svc/query/IndexQueryParams.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/svc/query/IndexQueryParams.java
deleted file mode 100644
index d157c8da..00000000
--- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/svc/query/IndexQueryParams.java
+++ /dev/null
@@ -1,16 +0,0 @@
-package nu.marginalia.wmsa.edge.index.svc.query;
-
-import nu.marginalia.wmsa.edge.index.model.IndexBlock;
-import nu.marginalia.wmsa.edge.index.svc.EdgeIndexQueryService;
-
-import java.util.List;
-
-public record IndexQueryParams(IndexBlock block,
-                               EdgeIndexQueryService.SearchTerms searchTerms,
-                               Integer qualityLimit,
-                               Integer rankLimit,
-                               List<Integer> targetDomains
-                               )
-{
-
-}
diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/svc/query/types/EntrySource.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/svc/query/types/EntrySource.java
deleted file mode 100644
index c31c3aed..00000000
--- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/svc/query/types/EntrySource.java
+++ /dev/null
@@ -1,10 +0,0 @@
-package nu.marginalia.wmsa.edge.index.svc.query.types;
-
-import nu.marginalia.util.btree.BTreeQueryBuffer;
-
-public interface EntrySource {
-    void skip(int n);
-    void read(BTreeQueryBuffer buffer);
-
-    boolean hasMore();
-}
diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/svc/query/types/EntrySourceFromBTree.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/svc/query/types/EntrySourceFromBTree.java
deleted file mode 100644
index 7641966c..00000000
--- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/svc/query/types/EntrySourceFromBTree.java
+++ /dev/null
@@ -1,108 +0,0 @@
-package nu.marginalia.wmsa.edge.index.svc.query.types;
-
-import nu.marginalia.util.btree.BTreeQueryBuffer;
-import nu.marginalia.util.btree.BTreeReader;
-import nu.marginalia.wmsa.edge.index.model.EdgePageWordMetadata;
-
-import javax.annotation.Nullable;
-
-import static java.lang.Math.min;
-import static 
nu.marginalia.wmsa.edge.index.conversion.SearchIndexConverter.*;
-
-public class EntrySourceFromBTree implements EntrySource {
-    @Nullable
-    private final BTreeReader reader;
-    private final long metadataBitMask;
-    private final Integer qualityLimit;
-
-    public static final long NO_MASKING = ~0L;
-
-    int pos;
-    long endOffset;
-
-    public EntrySourceFromBTree(@Nullable BTreeReader reader, long metadataBitMask, Integer qualityLimit) {
-        this.reader = reader;
-        this.metadataBitMask = metadataBitMask;
-        this.qualityLimit = qualityLimit;
-
-        if (reader != null) {
-            pos = 0;
-            endOffset = pos + (long) reader.numEntries() * ENTRY_SIZE;
-        }
-    }
-
-
-    @Override
-    public void skip(int n) {
-        pos += n * ENTRY_SIZE;
-    }
-
-    @Override
-    public void read(BTreeQueryBuffer buffer) {
-        if (reader == null) {
-            buffer.zero();
-            return;
-        }
-
-        assert buffer.end%ENTRY_SIZE == 0;
-
-        buffer.end = min(buffer.end, (int)(endOffset - pos));
-
-        reader.readData(buffer.data, buffer.end, pos);
-
-        pos += buffer.end;
-
-        destagger(buffer);
-        buffer.uniq();
-    }
-
-    private void destagger(BTreeQueryBuffer buffer) {
-        if (metadataBitMask == NO_MASKING && qualityLimit == null) {
-            for (int i = 0; (i + ENTRY_SIZE - 1) < buffer.end; i += ENTRY_SIZE) {
-                buffer.data[i / ENTRY_SIZE] = buffer.data[i + ENTRY_URL_OFFSET];
-            }
-
-            buffer.end /= ENTRY_SIZE;
-        }
-        else {
-            int write = 0;
-
-            for (int read = 0; read < buffer.end; read+=ENTRY_SIZE) {
-                final long metadata = buffer.data[read + ENTRY_METADATA_OFFSET];
-
-                if (isQualityOk(metadata) && isFlagsOk(metadata)) {
-                    buffer.data[write++] = buffer.data[read+ENTRY_URL_OFFSET];
-                }
-            }
-
-            buffer.end = write;
-        }
-    }
-
-    private boolean isFlagsOk(long metadata) {
-        return metadataBitMask == ~0L || EdgePageWordMetadata.hasFlags(metadata, metadataBitMask);
-    }
-
-    private boolean isQualityOk(long metadata) {
-        if (qualityLimit == null)
-            return true;
-
-        final int quality = EdgePageWordMetadata.decodeQuality(metadata);
-
-        if (qualityLimit < 0)
-            return quality > -qualityLimit;
-        else
-            return quality < qualityLimit;
-    }
-
-    @Override
-    public boolean hasMore() {
-        return pos < endOffset;
-    }
-
-    @Override
-    public String toString() {
-        return String.format("BTreeRange.EntrySource(@" + pos + ": " + endOffset + ")");
-    }
-
-}
diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/svc/query/types/EntrySourceFromMapRange.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/svc/query/types/EntrySourceFromMapRange.java
deleted file mode 100644
index 99cb94d6..00000000
--- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/svc/query/types/EntrySourceFromMapRange.java
+++ /dev/null
@@ -1,60 +0,0 @@
-package nu.marginalia.wmsa.edge.index.svc.query.types;
-
-import nu.marginalia.util.btree.BTreeQueryBuffer;
-import nu.marginalia.util.multimap.MultimapFileLong;
-
-import static java.lang.Math.min;
-import static nu.marginalia.wmsa.edge.index.conversion.SearchIndexConverter.ENTRY_SIZE;
-import static nu.marginalia.wmsa.edge.index.conversion.SearchIndexConverter.ENTRY_URL_OFFSET;
-
-public class EntrySourceFromMapRange implements EntrySource {
-
-    private final MultimapFileLong map;
-    private long pos;
-    private final long endOffset;
-
-    public EntrySourceFromMapRange(MultimapFileLong map, long start, long end) {
-        this.map = map;
-        this.pos = start;
-        this.endOffset = end;
-    }
-
-    @Override
-    public void skip(int n) {
-        pos += (long) n * ENTRY_SIZE;
-    }
-
-    @Override
-    public void read(BTreeQueryBuffer buffer) {
-
-        assert buffer.end%ENTRY_SIZE == 0;
-
-        
buffer.end = min(buffer.end, (int)(endOffset - pos)); - - map.read(buffer.data, buffer.end, pos); - - pos += buffer.end; - - destagger(buffer); - buffer.uniq(); - } - - private void destagger(BTreeQueryBuffer buffer) { - for (int i = 0; (i + ENTRY_SIZE - 1) < buffer.end; i += ENTRY_SIZE) { - buffer.data[i / ENTRY_SIZE] = buffer.data[i + ENTRY_URL_OFFSET]; - } - - buffer.end /= ENTRY_SIZE; - } - - @Override - public boolean hasMore() { - return pos < endOffset; - } - - @Override - public String toString() { - return String.format("BTreeRange.EntrySourceFromMapRange(@" + pos + ": " + endOffset + ")"); - } - -} diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/svc/query/types/filter/QueryFilterBTreeRangeReject.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/svc/query/types/filter/QueryFilterBTreeRangeReject.java deleted file mode 100644 index ed826f10..00000000 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/svc/query/types/filter/QueryFilterBTreeRangeReject.java +++ /dev/null @@ -1,27 +0,0 @@ -package nu.marginalia.wmsa.edge.index.svc.query.types.filter; - -import nu.marginalia.util.btree.BTreeQueryBuffer; -import nu.marginalia.wmsa.edge.index.reader.SearchIndexURLRange; - -public record QueryFilterBTreeRangeReject(SearchIndexURLRange range) implements QueryFilterStepIf { - - @Override - public void apply(BTreeQueryBuffer buffer) { - range.rejectUrls(buffer); - buffer.finalizeFiltering(); - } - - public boolean test(long id) { - return !range.hasUrl(id); - } - - @Override - public double cost() { - return range.numEntries(); - } - - @Override - public String describe() { - return "Reject: UrlRange[]"; - } -} diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/svc/query/types/filter/QueryFilterBTreeRangeRetain.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/svc/query/types/filter/QueryFilterBTreeRangeRetain.java deleted file mode 100644 index c0929076..00000000 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/svc/query/types/filter/QueryFilterBTreeRangeRetain.java +++ /dev/null @@ -1,27 +0,0 @@ -package nu.marginalia.wmsa.edge.index.svc.query.types.filter; - -import nu.marginalia.util.btree.BTreeQueryBuffer; -import nu.marginalia.wmsa.edge.index.reader.SearchIndexURLRange; - -public record QueryFilterBTreeRangeRetain(SearchIndexURLRange range) implements QueryFilterStepIf { - - @Override - public void apply(BTreeQueryBuffer buffer) { - range.retainUrls(buffer); - buffer.finalizeFiltering(); - } - - public boolean test(long id) { - return range.hasUrl(id); - } - - @Override - public double cost() { - return range.numEntries(); - } - - @Override - public String describe() { - return "UrlRange[]"; - } -} diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/svc/query/types/filter/QueryRankLimitingFilter.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/svc/query/types/filter/QueryRankLimitingFilter.java deleted file mode 100644 index 69cdc833..00000000 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/svc/query/types/filter/QueryRankLimitingFilter.java +++ /dev/null @@ -1,37 +0,0 @@ -package nu.marginalia.wmsa.edge.index.svc.query.types.filter; - -import nu.marginalia.util.btree.BTreeQueryBuffer; - -public class QueryRankLimitingFilter implements QueryFilterStepIf -{ - private final int rankLimit; - - public QueryRankLimitingFilter(int rankLimit) { - this.rankLimit = rankLimit; - } - - @Override - public boolean test(long value) { - long rank = 
value >>> 32L; - return rank < rankLimit; - } - - @Override - public void apply(BTreeQueryBuffer buffer) { - - while (buffer.hasMore() && test(buffer.currentValue())) { - buffer.retainAndAdvance(); - } - - buffer.finalizeFiltering(); - } - @Override - public double cost() { - return 0; - } - - @Override - public String describe() { - return getClass().getSimpleName(); - } -} diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/svc/searchset/RankingSearchSet.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/svc/searchset/RankingSearchSet.java new file mode 100644 index 00000000..ceba0d71 --- /dev/null +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/svc/searchset/RankingSearchSet.java @@ -0,0 +1,59 @@ +package nu.marginalia.wmsa.edge.index.svc.searchset; + +import org.roaringbitmap.RoaringBitmap; + +import java.io.DataInputStream; +import java.io.DataOutputStream; +import java.io.IOException; +import java.nio.file.Files; +import java.nio.file.Path; +import java.nio.file.StandardOpenOption; + +public class RankingSearchSet implements SearchSet { + + private final RoaringBitmap set; + public final SearchSetIdentifier identifier; + public final Path source; + + public RankingSearchSet(SearchSetIdentifier identifier, Path source) throws IOException { + this.identifier = identifier; + this.source = source; + set = new RoaringBitmap(); + + if (!Files.exists(source)) { + return; + } + + try (var ds = new DataInputStream(Files.newInputStream(source, StandardOpenOption.READ))) { + for (;;) { + try { + set.add(ds.readInt()); + } + catch (IOException ex) { break; } + } + } + } + + public RankingSearchSet(SearchSetIdentifier identifier, Path source, RoaringBitmap set) { + this.identifier = identifier; + this.source = source; + this.set = set; + } + + @Override + public boolean contains(int urlId) { + return set.contains(urlId); + } + + public void write() throws IOException { + try (var ds = new DataOutputStream(Files.newOutputStream(source, StandardOpenOption.WRITE, StandardOpenOption.CREATE, StandardOpenOption.TRUNCATE_EXISTING))) { + for (var iter = set.getIntIterator(); iter.hasNext();) { + ds.writeInt(iter.next()); + } + } + } + + public String toString() { + return identifier.toString(); + } +} diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/svc/searchset/SearchSet.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/svc/searchset/SearchSet.java new file mode 100644 index 00000000..8f412374 --- /dev/null +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/svc/searchset/SearchSet.java @@ -0,0 +1,6 @@ +package nu.marginalia.wmsa.edge.index.svc.searchset; + +public interface SearchSet { + boolean contains(int urlId); + +} diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/svc/searchset/SearchSetAny.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/svc/searchset/SearchSetAny.java new file mode 100644 index 00000000..dabebb8a --- /dev/null +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/svc/searchset/SearchSetAny.java @@ -0,0 +1,13 @@ +package nu.marginalia.wmsa.edge.index.svc.searchset; + +public class SearchSetAny implements SearchSet { + @Override + public boolean contains(int urlId) { + return true; + } + + @Override + public String toString() { + return getClass().getSimpleName(); + } +} diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/svc/searchset/SearchSetIdentifier.java 
b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/svc/searchset/SearchSetIdentifier.java new file mode 100644 index 00000000..59ffcad4 --- /dev/null +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/svc/searchset/SearchSetIdentifier.java @@ -0,0 +1,8 @@ +package nu.marginalia.wmsa.edge.index.svc.searchset; + +public enum SearchSetIdentifier { + NONE, + RETRO, + ACADEMIA, + SMALLWEB +} diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/svc/searchset/SmallSearchSet.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/svc/searchset/SmallSearchSet.java new file mode 100644 index 00000000..0af63fa5 --- /dev/null +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/svc/searchset/SmallSearchSet.java @@ -0,0 +1,24 @@ +package nu.marginalia.wmsa.edge.index.svc.searchset; + +import gnu.trove.set.hash.TIntHashSet; + +import java.util.Arrays; +import java.util.Collection; + +public class SmallSearchSet implements SearchSet { + public TIntHashSet entries; + + public SmallSearchSet(Collection domains) { + entries = new TIntHashSet(domains); + } + + @Override + public boolean contains(int urlId) { + return entries.contains(urlId); + } + + public String toString() { + return getClass().getSimpleName() + Arrays.toString(entries.toArray()); + } + +} diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/integration/model/BasicDocumentData.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/integration/model/BasicDocumentData.java index 5d207fd4..4e620fc3 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/integration/model/BasicDocumentData.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/integration/model/BasicDocumentData.java @@ -4,7 +4,7 @@ import lombok.AllArgsConstructor; import lombok.Data; import nu.marginalia.wmsa.edge.model.EdgeUrl; import nu.marginalia.wmsa.edge.model.crawl.EdgeDomainLink; -import nu.marginalia.wmsa.edge.model.crawl.EdgePageWordSet; +import nu.marginalia.wmsa.edge.model.crawl.EdgePageWords; @Data @@ -16,7 +16,7 @@ public class BasicDocumentData { public final String description; public int hashCode; - public final EdgePageWordSet words; + public final EdgePageWords words; public final EdgeDomainLink[] domainLinks; public final int wordCount; } diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/integration/stackoverflow/StackOverflowPostProcessor.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/integration/stackoverflow/StackOverflowPostProcessor.java index 839be1ab..141a3904 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/integration/stackoverflow/StackOverflowPostProcessor.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/integration/stackoverflow/StackOverflowPostProcessor.java @@ -5,7 +5,6 @@ import nu.marginalia.util.language.processing.DocumentKeywordExtractor; import nu.marginalia.util.language.processing.SentenceExtractor; import nu.marginalia.util.language.processing.model.KeywordMetadata; import nu.marginalia.wmsa.edge.converting.processor.logic.LinkParser; -import nu.marginalia.wmsa.edge.index.model.IndexBlock; import nu.marginalia.wmsa.edge.integration.model.BasicDocumentData; import nu.marginalia.wmsa.edge.integration.stackoverflow.model.StackOverflowPost; import nu.marginalia.wmsa.edge.model.EdgeUrl; @@ -44,13 +43,13 @@ public class StackOverflowPostProcessor { } var dld = sentenceExtractor.extractSentences(doc); - var keywords = documentKeywordExtractor.extractKeywords(dld, new KeywordMetadata(-15)); + 
var keywords = documentKeywordExtractor.extractKeywords(dld, new KeywordMetadata()); - keywords.get(IndexBlock.Meta).addJustNoMeta("site:"+post.getUrl().domain); - keywords.get(IndexBlock.Words_1).addJustNoMeta("site:"+post.getUrl().domain); - keywords.get(IndexBlock.Words_1).addJustNoMeta("special:wikipedia"); - keywords.get(IndexBlock.Meta).addJustNoMeta("special:wikipedia"); - keywords.get(IndexBlock.Meta).addJustNoMeta("js:true"); + keywords.addJustNoMeta("site:"+post.getUrl().domain); + keywords.addJustNoMeta("site:"+post.getUrl().domain); + keywords.addJustNoMeta("special:wikipedia"); + keywords.addJustNoMeta("special:wikipedia"); + keywords.addJustNoMeta("js:true"); String title = StringUtils.abbreviate(post.getTitle(), 255); String description = StringUtils.abbreviate(Jsoup.parseBodyFragment(post.getJustBody()).text(), 255); diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/integration/wikipedia/WikipediaProcessor.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/integration/wikipedia/WikipediaProcessor.java index f724c0b5..5f4d206c 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/integration/wikipedia/WikipediaProcessor.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/integration/wikipedia/WikipediaProcessor.java @@ -4,7 +4,6 @@ import nu.marginalia.util.language.processing.DocumentKeywordExtractor; import nu.marginalia.util.language.processing.SentenceExtractor; import nu.marginalia.util.language.processing.model.KeywordMetadata; import nu.marginalia.wmsa.edge.converting.processor.logic.LinkParser; -import nu.marginalia.wmsa.edge.index.model.IndexBlock; import nu.marginalia.wmsa.edge.integration.model.BasicDocumentData; import nu.marginalia.wmsa.edge.integration.wikipedia.model.WikipediaArticle; import nu.marginalia.wmsa.edge.model.EdgeUrl; @@ -40,13 +39,12 @@ public class WikipediaProcessor { EdgeDomainLink[] domainLinks = getDomainLinks(docUrl, doc); var dld = sentenceExtractor.extractSentences(doc); - var keywords = documentKeywordExtractor.extractKeywords(dld, new KeywordMetadata(15)); + var keywords = documentKeywordExtractor.extractKeywords(dld, new KeywordMetadata()); - keywords.get(IndexBlock.Meta).addJustNoMeta("site:"+post.getUrl().domain); - keywords.get(IndexBlock.Words_1).addJustNoMeta("site:"+post.getUrl().domain); - keywords.get(IndexBlock.Words_1).addJustNoMeta("special:stackoverflow"); - keywords.get(IndexBlock.Meta).addJustNoMeta("special:stackoverflow"); - keywords.get(IndexBlock.Meta).addJustNoMeta("js:true"); + keywords.addJustNoMeta("site:"+post.getUrl().domain); + keywords.addJustNoMeta("special:stackoverflow"); + keywords.addJustNoMeta("special:stackoverflow"); + keywords.addJustNoMeta("js:true"); return new BasicDocumentData(docUrl, title, description, post.body.hashCode(), keywords, domainLinks, dld.totalNumWords()); diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/model/EdgeDomain.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/model/EdgeDomain.java index 58a78e58..e5b0526c 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/model/EdgeDomain.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/model/EdgeDomain.java @@ -108,6 +108,30 @@ public class EdgeDomain { return domain.equalsIgnoreCase(other.domain); } + public String getTld() { + int dot = -1; + int length = domain.length(); + + if (ipPatternTest.test(domain)) { + return "IP"; + } + + if (govListTest.test(domain)) { + dot = domain.indexOf('.', Math.max(0, length - ".edu.uk".length())); + } + 
else { + dot = domain.lastIndexOf('.'); + } + + + if (dot < 0 || dot == domain.length() - 1) { + return "-"; + } + else { + return domain.substring(dot + 1); + } + } + public boolean equals(final Object o) { if (o == this) return true; if (!(o instanceof EdgeDomain)) return false; @@ -135,4 +159,5 @@ public class EdgeDomain { result = result * PRIME + $domain.hashCode(); return result; } + } diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/model/crawl/EdgePageContent.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/model/crawl/EdgePageContent.java index 997d25c1..596d389a 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/model/crawl/EdgePageContent.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/model/crawl/EdgePageContent.java @@ -9,7 +9,7 @@ import java.util.Set; @Data public class EdgePageContent { public final EdgeUrl url; - public final EdgePageWordSet words; + public final EdgePageWords words; public final Map> linkWords; public final EdgePageMetadata metadata; public final int hash; diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/model/crawl/EdgePageWordSet.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/model/crawl/EdgePageWordSet.java deleted file mode 100644 index 242ac5da..00000000 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/model/crawl/EdgePageWordSet.java +++ /dev/null @@ -1,56 +0,0 @@ -package nu.marginalia.wmsa.edge.model.crawl; - -import lombok.Data; -import nu.marginalia.wmsa.edge.index.model.EdgePageWordMetadata; -import nu.marginalia.wmsa.edge.index.model.IndexBlock; - -import java.util.*; - -@Data -public class EdgePageWordSet { - public Map wordSets; - - public EdgePageWordSet(EdgePageWords... words) { - wordSets = new EnumMap<>(IndexBlock.class); - for (EdgePageWords w : words) { - wordSets.put(w.block, w); - } - } - - public EdgePageWords get(IndexBlock block) { - var words = wordSets.get(block); - if (words == null) { - words = new EdgePageWords(block); - wordSets.put(block, words); - } - return words; - } - - public void append(IndexBlock block, Collection words) { - wordSets.computeIfAbsent(block, b -> new EdgePageWords(block)).addAll(words); - } - public void appendWithNoMeta(IndexBlock block, Collection words) { - wordSets.computeIfAbsent(block, b -> new EdgePageWords(block)).addAllNoMeta(words); - } - public Collection values() { - return new ArrayList<>(wordSets.values()); - } - - public boolean isEmpty() { - return 0 == wordSets.values().stream().mapToInt(EdgePageWords::size).sum(); - } - - public String toString() { - var sj = new StringJoiner("\n", "EdgePageWordSet:\n", ""); - wordSets.forEach((block, words) -> { - if (words.size() > 0) { - sj.add("\t" + block); - for (int i = 0; i < words.size(); i++) { - sj.add("\t\t" + words.getWords().get(i) + ":" + new EdgePageWordMetadata(words.getMetadata().get(i))); - } - } - }); - return sj.toString(); - } - -} diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/model/crawl/EdgePageWords.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/model/crawl/EdgePageWords.java index 2258d764..0db772da 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/model/crawl/EdgePageWords.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/model/crawl/EdgePageWords.java @@ -5,24 +5,27 @@ import lombok.Getter; import lombok.ToString; import nu.marginalia.wmsa.edge.index.model.EdgePageWordFlags; import nu.marginalia.wmsa.edge.index.model.EdgePageWordMetadata; -import 
nu.marginalia.wmsa.edge.index.model.IndexBlock;
 
 import java.util.ArrayList;
 import java.util.Collection;
 import java.util.List;
 import java.util.Set;
+import java.util.function.UnaryOperator;
 
 @ToString
 @Getter
-public class EdgePageWords{
-    public final IndexBlock block;
+public class EdgePageWords {
     public final ArrayList<String> words = new ArrayList<>();
     public final TLongArrayList metadata = new TLongArrayList();
 
-    public EdgePageWords(IndexBlock block) {
-        this.block = block;
+    public EdgePageWords() {
     }
 
-    public EdgePageWords(IndexBlock block, Collection<Entry> initial) {
-        this.block = block;
+
+    public EdgePageWords(int capacity) {
+        words.ensureCapacity(capacity);
+        metadata.ensureCapacity(capacity);
+    }
+
+    public EdgePageWords(Collection<Entry> initial) {
         words.ensureCapacity(initial.size());
         metadata.ensureCapacity(initial.size());
@@ -32,14 +35,14 @@
         }
     }
 
-    public static EdgePageWords withBlankMetadata(IndexBlock block, List<String> entries) {
+    public static EdgePageWords withBlankMetadata(List<String> entries) {
         List<Long> emptyMeta = new ArrayList<>(entries.size());
 
         for (int i = 0; i < entries.size(); i++) {
             emptyMeta.add(EdgePageWordMetadata.emptyValue());
         }
 
-        return new EdgePageWords(block, entries, emptyMeta);
+        return new EdgePageWords(entries, emptyMeta);
     }
 
     public void addJustNoMeta(String word) {
@@ -47,8 +50,7 @@
         metadata.add(0);
     }
 
-    private EdgePageWords(IndexBlock block, List<String> words, List<Long> meta) {
-        this.block = block;
+    private EdgePageWords(List<String> words, List<Long> meta) {
         this.words.addAll(words);
         this.metadata.addAll(meta);
@@ -65,6 +67,9 @@
     }
 
     public void setFlagOnMetadataForWords(EdgePageWordFlags flag, Set<String> flagWords) {
+        if (flagWords.isEmpty())
+            return;
+
         for (int i = 0; i < words.size(); i++) {
             if (flagWords.contains(words.get(i))) {
                 metadata.set(i, metadata.get(i) | flag.asBit());
@@ -72,20 +77,43 @@
         }
     }
 
-    public void addAllNoMeta(Collection<String> newWords) {
+    public void addAllSyntheticTerms(Collection<String> newWords) {
         words.ensureCapacity(words.size() + newWords.size());
         metadata.ensureCapacity(metadata.size() + newWords.size());
 
+        long meta = EdgePageWordFlags.Synthetic.asBit();
+
         for (var entry : newWords) {
             words.add(entry);
-            metadata.add(0L);
+            metadata.add(meta);
         }
     }
 
+    public List<String> getWordsWithAnyFlag(long flags) {
+        List<String> ret = new ArrayList<>();
+
+        for (int i = 0; i < words.size(); i++) {
+            if ((metadata.get(i) & flags) > 0) {
+                ret.add(words.get(i));
+            }
+        }
+
+        return ret;
+    }
+
+    public void add(String word, long meta) {
+        words.add(word);
+        metadata.add(meta);
+    }
+
     public int size() {
         return words.size();
     }
 
+    public void internalize(UnaryOperator<String> internalizer) {
+        words.replaceAll(internalizer);
+    }
+
     public record Entry(String word, long metadata) { }
 }
diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/model/search/EdgeSearchResultItem.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/model/search/EdgeSearchResultItem.java
index d6d20233..517c1975 100644
--- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/model/search/EdgeSearchResultItem.java
+++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/model/search/EdgeSearchResultItem.java
@@ -2,7 +2,6 @@ package nu.marginalia.wmsa.edge.model.search;
 
 import lombok.AllArgsConstructor;
 import lombok.Getter;
-import nu.marginalia.wmsa.edge.index.model.IndexBlock;
 import nu.marginalia.wmsa.edge.model.EdgeUrl;
 import nu.marginalia.wmsa.edge.model.id.EdgeId;
 
@@ -11,17 +10,13 @@ import java.util.List;
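An aside on the EdgePageWords refactor above: with IndexBlock gone, the class is reduced to two parallel lists, a word and its packed 64-bit metadata at the same index. A hypothetical use of the new API (flag names as in the patch, values invented):

    EdgePageWords words = new EdgePageWords(16);
    words.add("marginalia", 0L);                              // word with explicit metadata
    words.addAllSyntheticTerms(List.of("site:example.com"));  // stamped with the Synthetic flag bit
    words.setFlagOnMetadataForWords(EdgePageWordFlags.Title, Set.of("marginalia"));
    words.getWordsWithAnyFlag(EdgePageWordFlags.Synthetic.asBit()); // -> [site:example.com]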
@AllArgsConstructor @Getter public class EdgeSearchResultItem { - public final int bucketId; - public final IndexBlock block; public final long combinedId; public final List scores; public int resultsFromDomain; - public EdgeSearchResultItem(int bucketId, IndexBlock block, long val) { - this.bucketId = bucketId; - this.block = block; + public EdgeSearchResultItem(long val) { this.combinedId = val; this.scores = new ArrayList<>(16); } @@ -36,7 +31,6 @@ public class EdgeSearchResultItem { public int getRanking() { return (int)(combinedId >>> 32); } - public int getResultsFromDomain() { return resultsFromDomain; } /* Used for evaluation */ private transient double scoreValue = 1; @@ -47,12 +41,20 @@ public class EdgeSearchResultItem { return scoreValue; } + private transient int domainId = 0; + public void setDomainId(int domainId) { + this.domainId = domainId; + } + public int getDomainId() { + return this.domainId; + } + public int hashCode() { return getUrlIdInt(); } public String toString() { - return getClass().getSimpleName() + "[ url= " + getUrlId() + ", rank=" + getRanking() + "; bucket = " + bucketId + "]"; + return getClass().getSimpleName() + "[ url= " + getUrlId() + ", rank=" + getRanking() + "]"; } public boolean equals(Object other) { @@ -67,12 +69,12 @@ public class EdgeSearchResultItem { } public long deduplicationKey() { - final int ranking = getRanking(); + final int ranking = getDomainId(); if (ranking == Integer.MAX_VALUE || ranking == Integer.MIN_VALUE) { return 0; } - return ranking*32L + bucketId; + return ranking; } } diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/model/search/EdgeSearchResultKeywordScore.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/model/search/EdgeSearchResultKeywordScore.java index 24406fc3..98bf9444 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/model/search/EdgeSearchResultKeywordScore.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/model/search/EdgeSearchResultKeywordScore.java @@ -1,5 +1,7 @@ package nu.marginalia.wmsa.edge.model.search; +import nu.marginalia.wmsa.edge.index.model.EdgePageDocumentFlags; +import nu.marginalia.wmsa.edge.index.model.EdgePageDocumentsMetadata; import nu.marginalia.wmsa.edge.index.model.EdgePageWordFlags; import nu.marginalia.wmsa.edge.index.model.EdgePageWordMetadata; @@ -8,38 +10,50 @@ import static java.lang.Integer.numberOfTrailingZeros; public record EdgeSearchResultKeywordScore(int set, String keyword, - EdgePageWordMetadata metadata) { + long encodedWordMetadata, + long encodedDocMetadata, + boolean hasPriorityTerms) { public double documentValue() { long sum = 0; - sum += metadata.quality() / 5.; - if (metadata.flags().contains(EdgePageWordFlags.Simple)) { + + sum += EdgePageDocumentsMetadata.decodeQuality(encodedDocMetadata) / 5.; + + sum += EdgePageDocumentsMetadata.decodeTopology(encodedDocMetadata); + + if (EdgePageDocumentsMetadata.hasFlags(encodedDocMetadata, EdgePageDocumentFlags.Simple.asBit())) { sum += 20; } + + return sum; } + private boolean hasTermFlag(EdgePageWordFlags flag) { + return EdgePageWordMetadata.hasFlags(encodedWordMetadata, flag.asBit()); + } + public double termValue() { double sum = 0; - if (metadata.flags().contains(EdgePageWordFlags.Title)) { + if (hasTermFlag(EdgePageWordFlags.Title)) { sum -= 15; } - if (metadata.flags().contains(EdgePageWordFlags.Site)) { + if (hasTermFlag(EdgePageWordFlags.Site)) { sum -= 10; } - else if (metadata.flags().contains(EdgePageWordFlags.SiteAdjacent)) { + else if 
(hasTermFlag(EdgePageWordFlags.SiteAdjacent)) { sum -= 5; } - if (metadata.flags().contains(EdgePageWordFlags.Subjects)) { + if (hasTermFlag(EdgePageWordFlags.Subjects)) { sum -= 10; } - if (metadata.flags().contains(EdgePageWordFlags.NamesWords)) { + if (hasTermFlag(EdgePageWordFlags.NamesWords)) { sum -= 1; } - sum -= metadata.tfIdf() / 50.; + sum -= EdgePageWordMetadata.decodeTfidf(encodedWordMetadata) / 50.; sum += firstPos() / 5.; sum -= Integer.bitCount(positions()) / 3.; @@ -47,9 +61,12 @@ public record EdgeSearchResultKeywordScore(int set, } public int firstPos() { - return numberOfTrailingZeros(lowestOneBit(metadata.positions())); + return numberOfTrailingZeros(lowestOneBit(EdgePageWordMetadata.decodePositions(encodedWordMetadata))); + } + public int positions() { return EdgePageWordMetadata.decodePositions(encodedWordMetadata); } + public boolean isSpecial() { return keyword.contains(":") || hasTermFlag(EdgePageWordFlags.Synthetic); } + public boolean isRegular() { + return !keyword.contains(":") + && !hasTermFlag(EdgePageWordFlags.Synthetic); } - public int positions() { return metadata.positions(); } - public boolean isSpecial() { return keyword.contains(":"); } - public boolean isRegular() { return !keyword.contains(":"); } } diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/model/search/EdgeSearchSpecification.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/model/search/EdgeSearchSpecification.java index c39ea7fe..a1289a42 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/model/search/EdgeSearchSpecification.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/model/search/EdgeSearchSpecification.java @@ -1,18 +1,18 @@ package nu.marginalia.wmsa.edge.model.search; import lombok.*; +import nu.marginalia.wmsa.edge.index.model.QueryStrategy; +import nu.marginalia.wmsa.edge.index.svc.searchset.SearchSetIdentifier; +import nu.marginalia.wmsa.edge.model.search.domain.SpecificationLimit; -import java.util.ArrayList; -import java.util.HashSet; import java.util.List; -import java.util.Set; @ToString @Getter @Builder @With @AllArgsConstructor public class EdgeSearchSpecification { - public List buckets; public List subqueries; public List domains; + public SearchSetIdentifier searchSetIdentifier; public final int limitByDomain; public final int limitTotal; @@ -22,14 +22,10 @@ public class EdgeSearchSpecification { public final int timeoutMs; public final int fetchSize; - public final Integer quality; - public final Integer rank; + public final SpecificationLimit quality; + public final SpecificationLimit year; + public final SpecificationLimit size; + + public final QueryStrategy queryStrategy; - public List allIncludeSearchTerms() { - Set searchTerms = new HashSet<>(64); - for (var query : subqueries) { - searchTerms.addAll(query.searchTermsInclude); - } - return new ArrayList<>(searchTerms); - } } diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/model/search/EdgeSearchSubquery.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/model/search/EdgeSearchSubquery.java index 2784d495..b171c5a3 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/model/search/EdgeSearchSubquery.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/model/search/EdgeSearchSubquery.java @@ -3,10 +3,8 @@ package nu.marginalia.wmsa.edge.model.search; import lombok.AllArgsConstructor; import lombok.Getter; import lombok.ToString; -import nu.marginalia.wmsa.edge.index.model.IndexBlock; import java.util.List; -import 
java.util.concurrent.CopyOnWriteArrayList;
 
 @ToString
 @Getter
@@ -16,23 +14,19 @@ public class EdgeSearchSubquery {
 
     public final List<String> searchTermsInclude;
     public final List<String> searchTermsExclude;
     public final List<String> searchTermsAdvice;
-    public final IndexBlock block;
+    public final List<String> searchTermsPriority;
 
     private double value = 0;
 
-    public EdgeSearchSubquery(List<String> searchTermsInclude, List<String> searchTermsExclude, List<String> searchTermsAdvice, IndexBlock block) {
+    public EdgeSearchSubquery(List<String> searchTermsInclude,
+                              List<String> searchTermsExclude,
+                              List<String> searchTermsAdvice,
+                              List<String> searchTermsPriority
+                              ) {
         this.searchTermsInclude = searchTermsInclude;
         this.searchTermsExclude = searchTermsExclude;
         this.searchTermsAdvice = searchTermsAdvice;
-        this.block = block;
-    }
-
-    public EdgeSearchSubquery withBlock(IndexBlock block) {
-        return new EdgeSearchSubquery(
-                new CopyOnWriteArrayList<>(searchTermsInclude),
-                new CopyOnWriteArrayList<>(searchTermsExclude),
-                new CopyOnWriteArrayList<>(searchTermsAdvice),
-                block).setValue(value);
+        this.searchTermsPriority = searchTermsPriority;
     }
 
     public EdgeSearchSubquery setValue(double value) {
diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/model/search/domain/EdgeDomainSearchSpecification.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/model/search/domain/EdgeDomainSearchSpecification.java
index bc2cbd88..2c4738f9 100644
--- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/model/search/domain/EdgeDomainSearchSpecification.java
+++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/model/search/domain/EdgeDomainSearchSpecification.java
@@ -2,13 +2,10 @@ package nu.marginalia.wmsa.edge.model.search.domain;
 
 import lombok.AllArgsConstructor;
 import lombok.ToString;
-import nu.marginalia.wmsa.edge.index.model.IndexBlock;
 
 @ToString
 @AllArgsConstructor
 public class EdgeDomainSearchSpecification {
-    public final int bucket;
-    public final IndexBlock block;
 
     public final String keyword;
     public final int queryDepth;
diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/model/search/domain/SpecificationLimit.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/model/search/domain/SpecificationLimit.java
new file mode 100644
index 00000000..5a9a587b
--- /dev/null
+++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/model/search/domain/SpecificationLimit.java
@@ -0,0 +1,31 @@
+package nu.marginalia.wmsa.edge.model.search.domain;
+
+public record SpecificationLimit(SpecificationLimitType type, int value) {
+    public static SpecificationLimit none() {
+        return new SpecificationLimit(SpecificationLimitType.NONE, 0);
+    }
+
+    public static SpecificationLimit equals(int value) {
+        return new SpecificationLimit(SpecificationLimitType.EQUALS, value);
+    }
+
+    public static SpecificationLimit lessThan(int value) {
+        return new SpecificationLimit(SpecificationLimitType.LESS_THAN, value);
+    }
+
+    public static SpecificationLimit greaterThan(int value) {
+        return new SpecificationLimit(SpecificationLimitType.GREATER_THAN, value);
+    }
+
+    public boolean test(int parameter) {
+        if (type == SpecificationLimitType.NONE)
+            return true;
+        if (type == SpecificationLimitType.EQUALS)
+            return parameter == value;
+        if (type == SpecificationLimitType.GREATER_THAN)
+            return parameter >= value;
+        if (type == SpecificationLimitType.LESS_THAN)
+            return parameter <= value;
+        throw new AssertionError("Unknown type " + type);
+    }
+}
diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/model/search/domain/SpecificationLimitType.java
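One subtlety in SpecificationLimit above: despite the factory names, GREATER_THAN and LESS_THAN test inclusively. A few probes (hypothetical calls, not part of the patch):

    SpecificationLimit.none().test(42);              // true; unconstrained
    SpecificationLimit.equals(2015).test(2015);      // true
    SpecificationLimit.greaterThan(2015).test(2015); // also true: the comparison is >=
    SpecificationLimit.lessThan(500).test(500);      // also true: the comparison is <=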
b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/model/search/domain/SpecificationLimitType.java new file mode 100644 index 00000000..24c2fd12 --- /dev/null +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/model/search/domain/SpecificationLimitType.java @@ -0,0 +1,8 @@ +package nu.marginalia.wmsa.edge.model.search.domain; + +public enum SpecificationLimitType { + NONE, + EQUALS, + LESS_THAN, + GREATER_THAN +} diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/EdgeSearchOperator.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/EdgeSearchOperator.java index 484f518f..64c8346a 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/EdgeSearchOperator.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/EdgeSearchOperator.java @@ -7,7 +7,7 @@ import io.reactivex.rxjava3.schedulers.Schedulers; import nu.marginalia.wmsa.configuration.server.Context; import nu.marginalia.wmsa.edge.assistant.client.AssistantClient; import nu.marginalia.wmsa.edge.assistant.dict.WikiArticles; -import nu.marginalia.wmsa.edge.data.dao.EdgeDataStoreDao; +import nu.marginalia.wmsa.edge.dbcommon.EdgeDataStoreDao; import nu.marginalia.wmsa.edge.model.EdgeDomain; import nu.marginalia.wmsa.edge.model.search.EdgeUrlDetails; import nu.marginalia.wmsa.edge.search.model.BrowseResult; diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/command/IndexCommand.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/command/IndexCommand.java index 91fe9c7e..46a8a437 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/command/IndexCommand.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/command/IndexCommand.java @@ -2,8 +2,8 @@ package nu.marginalia.wmsa.edge.search.command; import com.google.inject.Inject; import com.google.inject.Singleton; -import nu.marginalia.wmsa.edge.data.dao.EdgeDataStoreDao; -import nu.marginalia.wmsa.edge.data.dao.task.EdgeDomainBlacklist; +import nu.marginalia.wmsa.edge.dbcommon.EdgeDataStoreDao; +import nu.marginalia.wmsa.edge.dbcommon.EdgeDomainBlacklist; import nu.marginalia.wmsa.edge.search.model.BrowseResultSet; import nu.marginalia.wmsa.edge.search.results.BrowseResultCleaner; import nu.marginalia.wmsa.renderer.mustache.MustacheRenderer; diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/command/commands/BrowseCommand.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/command/commands/BrowseCommand.java index c6434bdf..e1d256d8 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/command/commands/BrowseCommand.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/command/commands/BrowseCommand.java @@ -2,9 +2,8 @@ package nu.marginalia.wmsa.edge.search.command.commands; import com.google.inject.Inject; import nu.marginalia.wmsa.configuration.server.Context; -import nu.marginalia.wmsa.edge.assistant.screenshot.ScreenshotService; -import nu.marginalia.wmsa.edge.data.dao.EdgeDataStoreDao; -import nu.marginalia.wmsa.edge.data.dao.task.EdgeDomainBlacklist; +import nu.marginalia.wmsa.edge.dbcommon.EdgeDataStoreDao; +import nu.marginalia.wmsa.edge.dbcommon.EdgeDomainBlacklist; import nu.marginalia.wmsa.edge.model.EdgeDomain; import nu.marginalia.wmsa.edge.search.command.SearchCommandInterface; import nu.marginalia.wmsa.edge.search.command.SearchParameters; diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/command/commands/SearchCommand.java 
b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/command/commands/SearchCommand.java index 3b1ddab6..ccbb91ec 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/command/commands/SearchCommand.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/command/commands/SearchCommand.java @@ -2,8 +2,8 @@ package nu.marginalia.wmsa.edge.search.command.commands; import com.google.inject.Inject; import nu.marginalia.wmsa.configuration.server.Context; -import nu.marginalia.wmsa.edge.data.dao.EdgeDataStoreDao; -import nu.marginalia.wmsa.edge.data.dao.task.EdgeDomainBlacklist; +import nu.marginalia.wmsa.edge.dbcommon.EdgeDataStoreDao; +import nu.marginalia.wmsa.edge.dbcommon.EdgeDomainBlacklist; import nu.marginalia.wmsa.edge.search.EdgeSearchOperator; import nu.marginalia.wmsa.edge.search.command.SearchCommandInterface; import nu.marginalia.wmsa.edge.search.command.SearchParameters; diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/command/commands/SiteListCommand.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/command/commands/SiteListCommand.java index 76b24d51..11a564b4 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/command/commands/SiteListCommand.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/command/commands/SiteListCommand.java @@ -2,8 +2,7 @@ package nu.marginalia.wmsa.edge.search.command.commands; import com.google.inject.Inject; import nu.marginalia.wmsa.configuration.server.Context; -import nu.marginalia.wmsa.edge.data.dao.EdgeDataStoreDao; -import nu.marginalia.wmsa.edge.index.model.IndexBlock; +import nu.marginalia.wmsa.edge.dbcommon.EdgeDataStoreDao; import nu.marginalia.wmsa.edge.model.search.EdgeUrlDetails; import nu.marginalia.wmsa.edge.search.command.SearchCommandInterface; import nu.marginalia.wmsa.edge.search.command.SearchParameters; @@ -60,7 +59,7 @@ public class SiteListCommand implements SearchCommandInterface { Path screenshotPath = null; Integer domainId = -1; if (null != domain) { - resultSet = searchQueryIndexService.performDumbQuery(ctx, EdgeSearchProfile.CORPO, IndexBlock.Words_1, 100, 100, "site:"+domain); + resultSet = searchQueryIndexService.performDumbQuery(ctx, EdgeSearchProfile.CORPO, 100, 100, "site:"+domain); domainId = dataStoreDao.getDomainId(domain).id(); screenshotPath = Path.of("/screenshot/" + domainId); } diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/model/EdgeSearchProfile.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/model/EdgeSearchProfile.java index 73831886..9f97a7f7 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/model/EdgeSearchProfile.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/model/EdgeSearchProfile.java @@ -1,40 +1,34 @@ package nu.marginalia.wmsa.edge.search.model; import nu.marginalia.wmsa.edge.converting.processor.logic.HtmlFeature; -import nu.marginalia.wmsa.edge.index.model.IndexBlock; +import nu.marginalia.wmsa.edge.index.svc.searchset.SearchSetIdentifier; import nu.marginalia.wmsa.edge.model.search.EdgeSearchSubquery; +import nu.marginalia.wmsa.edge.model.search.domain.SpecificationLimit; -import java.util.Arrays; -import java.util.List; import java.util.Objects; -import java.util.stream.Collectors; public enum EdgeSearchProfile { - DEFAULT("default", SearchOrder.DEFAULT_ORDER, 0, 1), - MODERN("modern", SearchOrder.DEFAULT_ORDER, 2), - CORPO("corpo", SearchOrder.DEFAULT_ORDER, 4, 5, 7), - YOLO("yolo", 
SearchOrder.DEFAULT_ORDER, 0, 2, 1, 3, 6), - CORPO_CLEAN("corpo-clean", SearchOrder.DEFAULT_ORDER, 0, 1), - ACADEMIA("academia", SearchOrder.DEFAULT_ORDER, 3), + DEFAULT("default", SearchSetIdentifier.RETRO), + MODERN("modern", SearchSetIdentifier.SMALLWEB), + CORPO("corpo", SearchSetIdentifier.NONE), + YOLO("yolo", SearchSetIdentifier.NONE), + CORPO_CLEAN("corpo-clean", SearchSetIdentifier.NONE), + ACADEMIA("academia", SearchSetIdentifier.ACADEMIA), - FOOD("food", SearchOrder.DEFAULT_ORDER, 2, 0), - CRAFTS("crafts", SearchOrder.DEFAULT_ORDER, 2, 0), + FOOD("food", SearchSetIdentifier.NONE), + CRAFTS("crafts", SearchSetIdentifier.NONE), - CLASSICS("classics", SearchOrder.DEFAULT_ORDER, 4, 5, 7), + CLASSICS("classics", SearchSetIdentifier.NONE), ; public final String name; - public final List buckets; - public final List indexBlocks; + public final SearchSetIdentifier searchSetIdentifier; - EdgeSearchProfile(String name, - List indexBlocks, - int... buckets) { + EdgeSearchProfile(String name, SearchSetIdentifier searchSetIdentifier) { this.name = name; - this.indexBlocks = indexBlocks; - this.buckets = Arrays.stream(buckets).boxed().collect(Collectors.toList()); + this.searchSetIdentifier = searchSetIdentifier; } private final static EdgeSearchProfile[] values = values(); @@ -53,6 +47,9 @@ public enum EdgeSearchProfile { } public void addTacitTerms(EdgeSearchSubquery subquery) { + if (this == ACADEMIA) { + subquery.searchTermsPriority.add("tld:edu"); + } if (this == FOOD) { subquery.searchTermsInclude.add(HtmlFeature.CATEGORY_FOOD.getKeyword()); } @@ -61,6 +58,30 @@ public enum EdgeSearchProfile { } } + public SpecificationLimit getYearLimit() { + if (this == MODERN) { + return SpecificationLimit.greaterThan(2015); + } + else return SpecificationLimit.none(); + } + + public SpecificationLimit getSizeLimit() { + if (this == MODERN) { + return SpecificationLimit.lessThan(500); + } + else return SpecificationLimit.none(); + } + + + public SpecificationLimit getQualityLimit() { + if (this == MODERN) { + return SpecificationLimit.lessThan(5); + } + else return SpecificationLimit.none(); + } + + + public String getNearDomain() { if (this == CLASSICS) { return "classics.mit.edu"; diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/model/SearchOrder.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/model/SearchOrder.java deleted file mode 100644 index 831fdbae..00000000 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/model/SearchOrder.java +++ /dev/null @@ -1,10 +0,0 @@ -package nu.marginalia.wmsa.edge.search.model; - -import nu.marginalia.wmsa.edge.index.model.IndexBlock; - -import java.util.List; - -class SearchOrder { - static List DEFAULT_ORDER - = List.of(IndexBlock.Title, IndexBlock.Words_1, IndexBlock.Words_2, IndexBlock.Words_4, IndexBlock.Words_8, IndexBlock.Words_16Plus); -} diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/query/QueryFactory.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/query/QueryFactory.java index 444c7845..7ed02926 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/query/QueryFactory.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/query/QueryFactory.java @@ -6,10 +6,10 @@ import nu.marginalia.util.language.WordPatterns; import nu.marginalia.util.language.conf.LanguageModels; import nu.marginalia.wmsa.edge.assistant.dict.NGramBloomFilter; import nu.marginalia.wmsa.edge.assistant.dict.TermFrequencyDict; -import 
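The EdgeSearchProfile rework above replaces per-profile index blocks with a SearchSetIdentifier plus three SpecificationLimits, so a profile now behaves as if the user had typed the corresponding filter operators. A sketch of what MODERN implies (hypothetical, grounded in the getters above):

    var p = EdgeSearchProfile.MODERN;
    p.getYearLimit();    // SpecificationLimit.greaterThan(2015), roughly "year>2015"
    p.getSizeLimit();    // SpecificationLimit.lessThan(500),     roughly "size<500"
    p.getQualityLimit(); // SpecificationLimit.lessThan(5),       roughly "q<5"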
nu.marginalia.wmsa.edge.index.model.IndexBlock; +import nu.marginalia.wmsa.edge.index.model.QueryStrategy; import nu.marginalia.wmsa.edge.model.search.EdgeSearchSpecification; import nu.marginalia.wmsa.edge.model.search.EdgeSearchSubquery; -import nu.marginalia.wmsa.edge.search.model.EdgeSearchProfile; +import nu.marginalia.wmsa.edge.model.search.domain.SpecificationLimit; import nu.marginalia.wmsa.edge.search.query.model.EdgeSearchQuery; import nu.marginalia.wmsa.edge.search.query.model.EdgeUserSearchParameters; import nu.marginalia.wmsa.edge.search.valuation.SearchResultValuator; @@ -43,7 +43,7 @@ public class QueryFactory { this.searchResultValuator = searchResultValuator; this.nearQueryProcessor = nearQueryProcessor; - this.queryVariants = ThreadLocal.withInitial(() -> new QueryVariants(lm, dict, nGramBloomFilter, englishDictionary)); + this.queryVariants = ThreadLocal.withInitial(() -> new QueryVariants(lm ,dict, nGramBloomFilter, englishDictionary)); } public QueryParser getParser() { @@ -52,49 +52,27 @@ public class QueryFactory { public EdgeSearchQuery createQuery(EdgeUserSearchParameters params) { final var processedQuery = createQuery(getParser(), params); + final List subqueries = processedQuery.specs.subqueries; - final var newSubqueries = reevaluateSubqueries(processedQuery, params); + for (var sq : subqueries) { + sq.setValue(searchResultValuator.preEvaluate(sq)); + } - processedQuery.specs.subqueries.clear(); - processedQuery.specs.subqueries.addAll(newSubqueries); + subqueries.sort(Comparator.comparing(EdgeSearchSubquery::getValue)); + trimArray(subqueries, RETAIN_QUERY_VARIANT_COUNT); return processedQuery; } - private List reevaluateSubqueries(EdgeSearchQuery processedQuery, EdgeUserSearchParameters params) { - final var profile = params.profile(); - - for (var sq : processedQuery.specs.subqueries) { - sq.setValue(searchResultValuator.preEvaluate(sq)); - } - - trimExcessiveSubqueries(processedQuery.specs.subqueries); - - List subqueries = - new ArrayList<>(processedQuery.specs.subqueries.size() * profile.indexBlocks.size()); - - for (var sq : processedQuery.specs.subqueries) { - for (var block : profile.indexBlocks) { - subqueries.add(sq.withBlock(block).setValue(sq.getValue() * block.ordinal())); - } - } - - subqueries.sort(Comparator.comparing(EdgeSearchSubquery::getValue)); - - return subqueries; - } - - private void trimExcessiveSubqueries(List subqueries) { - - subqueries.sort(Comparator.comparing(EdgeSearchSubquery::getValue)); - - if (subqueries.size() > RETAIN_QUERY_VARIANT_COUNT) { - subqueries.subList(0, subqueries.size() - RETAIN_QUERY_VARIANT_COUNT).clear(); + private void trimArray(List arr, int maxSize) { + if (arr.size() > maxSize) { + arr.subList(0, arr.size() - maxSize).clear(); } } - - public EdgeSearchQuery createQuery(QueryParser queryParser, EdgeUserSearchParameters params) { + public EdgeSearchQuery createQuery(QueryParser queryParser, + EdgeUserSearchParameters params) + { final var query = params.humanQuery(); final var profile = params.profile(); @@ -113,8 +91,9 @@ public class QueryFactory { basicQuery.clear(); } - Integer qualityLimit = null; - Integer rankLimit = null; + SpecificationLimit qualityLimit = profile.getQualityLimit(); + SpecificationLimit year = profile.getYearLimit(); + SpecificationLimit size = profile.getSizeLimit(); for (Token t : basicQuery) { if (t.type == TokenType.QUOT_TERM || t.type == TokenType.LITERAL_TERM) { @@ -126,28 +105,27 @@ public class QueryFactory { analyzeSearchTerm(problems, t); } if (t.type == 
TokenType.QUALITY_TERM) { - qualityLimit = Integer.parseInt(t.str); + qualityLimit = parseSpecificationLimit(t.str); } - if (t.type == TokenType.RANK_TERM) { - if (profile == EdgeSearchProfile.CORPO) { - problems.add("Rank limit (" + t.displayStr + ") ignored in unranked query"); - } else { - rankLimit = Integer.parseInt(t.str); - } + if (t.type == TokenType.YEAR_TERM) { + year = parseSpecificationLimit(t.str); + } + if (t.type == TokenType.SIZE_TERM) { + size = parseSpecificationLimit(t.str); } } - - var queryPermutations = queryParser.permuteQueriesNew(basicQuery); List subqueries = new ArrayList<>(); String near = profile.getNearDomain(); + for (var parts : queryPermutations) { List searchTermsExclude = new ArrayList<>(); List searchTermsInclude = new ArrayList<>(); List searchTermsAdvice = new ArrayList<>(); + List searchTermsPriority = new ArrayList<>(); for (Token t : parts) { switch (t.type) { @@ -157,6 +135,9 @@ public class QueryFactory { case ADVICE_TERM: searchTermsAdvice.add(t.str); break; + case PRIORTY_TERM: + searchTermsPriority.add(t.str); + break; case LITERAL_TERM: // fallthrough; case QUOT_TERM: searchTermsInclude.add(t.str); @@ -165,16 +146,24 @@ public class QueryFactory { } break; case QUALITY_TERM: + case YEAR_TERM: + case SIZE_TERM: break; // case NEAR_TERM: near = t.str; break; + default: logger.warn("Unexpected token type {}", t); } } - EdgeSearchSubquery subquery = new EdgeSearchSubquery(searchTermsInclude, searchTermsExclude, searchTermsAdvice, IndexBlock.Title); + if (searchTermsInclude.isEmpty() && !searchTermsAdvice.isEmpty()) { + searchTermsInclude.addAll(searchTermsAdvice); + searchTermsAdvice.clear(); + } + + EdgeSearchSubquery subquery = new EdgeSearchSubquery(searchTermsInclude, searchTermsExclude, searchTermsAdvice, searchTermsPriority); params.profile().addTacitTerms(subquery); params.jsSetting().addTacitTerms(subquery); @@ -190,22 +179,18 @@ public class QueryFactory { } } - if (qualityLimit != null && domains.isEmpty()) { - problems.add("Quality limit will be ignored when combined with 'near:'"); - } - - var buckets = domains.isEmpty() ? 
profile.buckets : EdgeSearchProfile.CORPO.buckets; - EdgeSearchSpecification.EdgeSearchSpecificationBuilder specsBuilder = EdgeSearchSpecification.builder() .subqueries(subqueries) .limitTotal(100) .humanQuery(query) - .buckets(buckets) .timeoutMs(250) .fetchSize(4096) .quality(qualityLimit) - .rank(rankLimit) - .domains(domains); + .year(year) + .size(size) + .domains(domains) + .queryStrategy(QueryStrategy.AUTO) + .searchSetIdentifier(profile.searchSetIdentifier); if (domain != null) { specsBuilder = specsBuilder.limitByDomain(100); @@ -218,6 +203,24 @@ public class QueryFactory { return new EdgeSearchQuery(specs, searchTermsHuman, domain); } + private SpecificationLimit parseSpecificationLimit(String str) { + int startChar = str.charAt(0); + + int val = Integer.parseInt(str.substring(1)); + if (startChar == '=') { + return SpecificationLimit.equals(val); + } + else if (startChar == '<'){ + return SpecificationLimit.lessThan(val); + } + else if (startChar == '>'){ + return SpecificationLimit.greaterThan(val); + } + else { + return SpecificationLimit.none(); + } + } + private String normalizeDomainName(String str) { return str.toLowerCase(); } diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/query/QueryParser.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/query/QueryParser.java index 21399588..354ba0ce 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/query/QueryParser.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/query/QueryParser.java @@ -2,6 +2,7 @@ package nu.marginalia.wmsa.edge.search.query; import lombok.EqualsAndHashCode; import lombok.ToString; +import nu.marginalia.util.TransformList; import nu.marginalia.util.language.WordPatterns; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -10,6 +11,7 @@ import java.util.ArrayList; import java.util.Collections; import java.util.Comparator; import java.util.List; +import java.util.function.Predicate; import java.util.regex.Pattern; import java.util.stream.Collectors; import java.util.stream.Stream; @@ -29,76 +31,83 @@ public class QueryParser { public List parse(String query) { List basicTokens = extractBasicTokens(query); - List parsedTokens = new ArrayList<>(basicTokens.size()); - for (int i = 0; i < basicTokens.size(); i++) { - var t = basicTokens.get(i); + TransformList list = new TransformList<>(basicTokens); - if (t.type == TokenType.QUOT) { - parsedTokens.add(new Token(TokenType.QUOT_TERM, - t.str.replaceAll("\\s+", WordPatterns.WORD_TOKEN_JOINER), - t.displayStr)); - } - else if (t.type == TokenType.LITERAL_TERM - && (t.str.endsWith(":")||t.str.endsWith(".")) - && t.str.length() > 1) - { - parsedTokens.add(new Token(TokenType.LITERAL_TERM, t.str.substring(0, t.str.length()-1), t.displayStr)); - } + list.transformEach(QueryParser::handleQuoteTokens); + list.transformEach(QueryParser::trimLiterals); + list.transformEachPair(QueryParser::createNegatedTerms); + list.transformEachPair(QueryParser::createPriorityTerms); + list.transformEach(QueryParser::handleSpecialOperations); + list.scanAndTransform(TokenType.LPAREN, TokenType.RPAREN, QueryParser::handleAdvisoryTerms); + + return list.getBackingList(); + } + + private static void handleQuoteTokens(TransformList.Entity entity) { + var t = entity.value; + if (t.type == TokenType.QUOT) { + entity.replace(new Token(TokenType.QUOT_TERM, + t.str.replaceAll("\\s+", WordPatterns.WORD_TOKEN_JOINER), + t.displayStr)); + } + } + + private static void trimLiterals(TransformList.Entity entity) { + 
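For reference, QueryFactory.parseSpecificationLimit above receives the operator character plus digits (the year/size/q prefix is already stripped off by the tokenizer), so (hypothetical calls, not part of the patch):

    parseSpecificationLimit("=2015"); // SpecificationLimit.equals(2015)
    parseSpecificationLimit(">2010"); // SpecificationLimit.greaterThan(2010)
    parseSpecificationLimit("<500");  // SpecificationLimit.lessThan(500)
    // any other leading character falls through to SpecificationLimit.none()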
var t = entity.value; + + if (t.type == TokenType.LITERAL_TERM + && (t.str.endsWith(":") || t.str.endsWith(".")) + && t.str.length() > 1) { + entity.replace(new Token(TokenType.LITERAL_TERM, t.str.substring(0, t.str.length() - 1), t.displayStr)); } - for (int i = 0; i < basicTokens.size() - 1; i++) { - var t = basicTokens.get(i); - var tn = basicTokens.get(i+1); + } - if (t.type == TokenType.MINUS && tn.type == TokenType.LITERAL_TERM) { - parsedTokens.add(new Token(TokenType.EXCLUDE_TERM, tn.str, "-"+tn.str)); - i++; + private static void createNegatedTerms(TransformList.Entity first, TransformList.Entity second) { + var t = first.value; + var tn = second.value; + + if (t.type == TokenType.MINUS && tn.type == TokenType.LITERAL_TERM) { + first.remove(); + second.replace(new Token(TokenType.EXCLUDE_TERM, tn.str, "-" + tn.str)); + } + } + private static void createPriorityTerms(TransformList.Entity first, TransformList.Entity second) { + var t = first.value; + var tn = second.value; + + if (t.type == TokenType.QMARK && tn.type == TokenType.LITERAL_TERM) { + first.remove(); + second.replace(new Token(TokenType.PRIORTY_TERM, tn.str, "?" + tn.str)); + } + } + private static void handleSpecialOperations(TransformList.Entity entity) { + var t = entity.value; + if (t.type == TokenType.LITERAL_TERM) { + if (t.str.startsWith("q") && t.str.matches("q[=><]\\d+")) { + entity.replace(new Token(TokenType.QUALITY_TERM, t.str.substring(1), t.displayStr)); + } else if (t.str.startsWith("near:")) { + entity.replace(new Token(TokenType.NEAR_TERM, t.str.substring(5), t.displayStr)); + } else if (t.str.startsWith("year") && t.str.matches("year[=><]\\d{4}")) { + entity.replace(new Token(TokenType.YEAR_TERM, t.str.substring(4), t.displayStr)); + } else if (t.str.startsWith("size") && t.str.matches("size[=><]\\d+")) { + entity.replace(new Token(TokenType.SIZE_TERM, t.str.substring(4), t.displayStr)); + } else if (t.str.contains(":")) { + entity.replace(new Token(TokenType.ADVICE_TERM, t.str, t.displayStr)); } } + } - for (int i = 0; i < basicTokens.size(); i++) { - var t = basicTokens.get(i); - - if (t.type == TokenType.LITERAL_TERM) { - if (t.str.startsWith("q:") && t.str.matches("q:[+-]?\\d+")) { - parsedTokens.add(new Token(TokenType.QUALITY_TERM, t.str.substring(2), t.displayStr)); - } - else if (t.str.startsWith("r:") && t.str.matches("r:\\d+")) { - parsedTokens.add(new Token(TokenType.RANK_TERM, t.str.substring(2), t.displayStr)); - } - else if (t.str.startsWith("near:")) { - parsedTokens.add(new Token(TokenType.NEAR_TERM, t.str.substring(5), t.displayStr)); - } - else { - parsedTokens.add(t); - } - continue; - } - else if (t.type != TokenType.LPAREN) { - continue; - } - - int end = i+1; - for (; end < basicTokens.size(); end++) { - if (basicTokens.get(end).type == TokenType.RPAREN) { - break; - } - } - if (end == basicTokens.size()) { - continue; - } - - for (int j = i+1; j < end; j++) { - var tok = basicTokens.get(j); - if (tok.type == TokenType.LITERAL_TERM) { - parsedTokens.add(new Token(TokenType.ADVICE_TERM, tok.str, "(" + tok.str + ")")); - } - } - i = end; + private static void handleAdvisoryTerms(TransformList.Entity entity) { + var t = entity.value; + if (t.type == TokenType.LPAREN) { + entity.remove(); + } else if (t.type == TokenType.RPAREN) { + entity.remove(); + } else if (t.type == TokenType.LITERAL_TERM) { + entity.replace(new Token(TokenType.ADVICE_TERM, t.str, "(" + t.str + ")")); } - - return parsedTokens; } private static final Pattern noisePattern = Pattern.compile("[,]"); @@ -138,7 
+147,10 @@ public class QueryParser { i = end; } else if ('-' == chr) { - tokens.add(new Token(TokenType.MINUS, "\"")); + tokens.add(new Token(TokenType.MINUS, "-")); + } + else if ('?' == chr) { + tokens.add(new Token(TokenType.QMARK, "?")); } else if (Character.isSpaceChar(chr)) { // @@ -268,6 +280,7 @@ public class QueryParser { List> queryVariants = new ArrayList<>(); for (var query : result.faithful) { var tokens = query.terms.stream().map(term -> new Token(TokenType.LITERAL_TERM, term)).collect(Collectors.toList()); + tokens.addAll(result.nonLiterals); queryVariants.add(tokens); } @@ -276,6 +289,7 @@ public class QueryParser { break; var tokens = query.terms.stream().map(term -> new Token(TokenType.LITERAL_TERM, term)).collect(Collectors.toList()); + tokens.addAll(result.nonLiterals); queryVariants.add(tokens); } @@ -456,7 +470,7 @@ outer: for (int i = size - 1; i >= 1; i--) { @ToString @EqualsAndHashCode class Token { - public final TokenType type; + public TokenType type; public String str; public final String displayStr; @@ -479,7 +493,7 @@ class Token { } } -enum TokenType { +enum TokenType implements Predicate { TERM, @@ -487,14 +501,22 @@ enum TokenType { QUOT_TERM, EXCLUDE_TERM, ADVICE_TERM, + PRIORTY_TERM, QUALITY_TERM, - RANK_TERM, - + YEAR_TERM, + SIZE_TERM, NEAR_TERM, QUOT, MINUS, + QMARK, LPAREN, - RPAREN + RPAREN, + + IGNORE; + + public boolean test(Token t) { + return t.type == this; + } } \ No newline at end of file diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/query/QueryVariants.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/query/QueryVariants.java index 7956af07..f3e75f91 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/query/QueryVariants.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/query/QueryVariants.java @@ -66,19 +66,22 @@ public class QueryVariants { final List faithful = new ArrayList<>(); final List alternative = new ArrayList<>(); + final List nonLiterals = new ArrayList<>(); + public boolean isEmpty() { - return faithful.isEmpty() && alternative.isEmpty(); + return faithful.isEmpty() && alternative.isEmpty() && nonLiterals.isEmpty(); } } public QueryVariantSet getQueryVariants(List query) { - final String queryAsString = joinQuery(query); + final JoinedQueryAndNonLiteralTokens joinedQuery = joinQuery(query); final TreeMap> byStart = new TreeMap<>(); - logger.debug("QAS: {}", queryAsString); + logger.debug("Q: {}", query); + logger.debug("QAS: {}", joinedQuery); - var sentence = sentenceExtractor.extractSentence(queryAsString); + var sentence = sentenceExtractor.extractSentence(joinedQuery.joinedQuery); for (int i = 0; i < sentence.posTags.length; i++) { if (sentence.posTags[i].startsWith("N") || sentence.posTags[i].startsWith("V")) { @@ -150,6 +153,8 @@ public class QueryVariants { returnValue.faithful.sort(Comparator.comparing(QueryVariant::getValue)); returnValue.alternative.sort(Comparator.comparing(QueryVariant::getValue)); + returnValue.nonLiterals.addAll(joinedQuery.nonLiterals); + return returnValue; } @@ -327,39 +332,6 @@ public class QueryVariants { return goodSpans; } - private List> swapTerms(List span) { - List> ret = new ArrayList<>(); - - for (int i = 0; i < span.size()-1; i++) { - var a = span.get(i); - var b = span.get(i+1); - - var stemmed = b.stemmed + "_" + a.stemmed; - - if (dict.getTermFreqStemmed(stemmed) > 0) { - List asTokens = new ArrayList<>(); - - for (int j = 0; j < i; j++) { - var word = span.get(j).word; - asTokens.add(word); - } - { 
- var word = b.word + "_" + a.word; - asTokens.add(word); - } - for (int j = i+2; j < span.size(); j++) { - var word = span.get(j).word; - asTokens.add(word); - } - - ret.add(asTokens); - } - } - - return ret; - } - - private List> joinTerms(List span) { List> ret = new ArrayList<>(); @@ -393,13 +365,21 @@ public class QueryVariants { return ret; } - private String joinQuery(List query) { + private JoinedQueryAndNonLiteralTokens joinQuery(List query) { StringJoiner s = new StringJoiner(" "); + List leftovers = new ArrayList<>(5); for (var t : query) { - s.add(t.displayStr); + if (t.type == TokenType.LITERAL_TERM) { + s.add(t.displayStr); + } + else { + leftovers.add(t); + } } - return s.toString(); + return new JoinedQueryAndNonLiteralTokens(s.toString(), leftovers); } + + record JoinedQueryAndNonLiteralTokens(String joinedQuery, List nonLiterals) {} } diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/results/SearchResultDecorator.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/results/SearchResultDecorator.java index e6235a74..6eca931a 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/results/SearchResultDecorator.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/results/SearchResultDecorator.java @@ -3,13 +3,13 @@ package nu.marginalia.wmsa.edge.search.results; import com.google.inject.Inject; import gnu.trove.list.array.TIntArrayList; import gnu.trove.map.hash.TIntObjectHashMap; +import it.unimi.dsi.fastutil.ints.Int2IntArrayMap; import nu.marginalia.util.BrailleBlockPunchCards; -import nu.marginalia.wmsa.edge.data.dao.EdgeDataStoreDao; +import nu.marginalia.wmsa.edge.dbcommon.EdgeDataStoreDao; import nu.marginalia.wmsa.edge.model.EdgeUrl; import nu.marginalia.wmsa.edge.model.crawl.EdgeDomainIndexingState; import nu.marginalia.wmsa.edge.model.id.EdgeIdList; import nu.marginalia.wmsa.edge.model.search.EdgeSearchResultItem; -import nu.marginalia.wmsa.edge.model.search.EdgeSearchResultKeywordScore; import nu.marginalia.wmsa.edge.model.search.EdgeUrlDetails; import nu.marginalia.wmsa.edge.search.valuation.SearchResultValuator; import org.slf4j.Logger; @@ -77,16 +77,24 @@ public class SearchResultDecorator { } private String getPositions(EdgeSearchResultItem resultItem) { - int bits = resultItem.scores.stream() - .filter(EdgeSearchResultKeywordScore::isRegular) - .mapToInt(EdgeSearchResultKeywordScore::positions) - .reduce(this::or) - .orElse(0); + Int2IntArrayMap positionsPerSet = new Int2IntArrayMap(8); + + for (var score : resultItem.scores) { + if (!score.isRegular()) { + continue; + } + positionsPerSet.merge(score.set(), score.positions(), this::and); + } + + int bits = positionsPerSet.values().intStream().reduce(this::or).orElse(0); return BrailleBlockPunchCards.printBits(bits, 32); } + private int and(int a, int b) { + return a & b; + } private int or(int a, int b) { return a | b; } diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/results/UrlDeduplicator.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/results/UrlDeduplicator.java index ad12d68b..603731a7 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/results/UrlDeduplicator.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/results/UrlDeduplicator.java @@ -1,6 +1,5 @@ package nu.marginalia.wmsa.edge.search.results; -import com.google.common.base.Strings; import gnu.trove.map.hash.TObjectIntHashMap; import gnu.trove.set.hash.TIntHashSet; import 
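SearchResultDecorator.getPositions above changes how the 32-bit position masks combine: within one keyword set the masks are intersected (positions where every regular term occurs), and the per-set results are then unioned. In miniature (invented masks, not part of the patch):

    int setA = 0b0110 & 0b0011;  // set A's terms coincide at position 1 -> 0b0010
    int setB = 0b1100 & 0b1000;  // set B's terms coincide at position 3 -> 0b1000
    int bits = setA | setB;      // 0b1010 is what BrailleBlockPunchCards.printBits renders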
nu.marginalia.wmsa.edge.model.search.EdgeUrlDetails; @@ -8,11 +7,11 @@ import nu.marginalia.wmsa.edge.model.search.EdgeUrlDetails; public class UrlDeduplicator { private final TIntHashSet seenSuperficialhashes = new TIntHashSet(200); private final TIntHashSet seenDataHashes = new TIntHashSet(200); - private final TObjectIntHashMap ipCount = new TObjectIntHashMap<>(200, 0.75f, 0); + private final TObjectIntHashMap keyCount = new TObjectIntHashMap<>(200, 0.75f, 0); - private final int resultsPerIp; - public UrlDeduplicator(int resultsPerIp) { - this.resultsPerIp = resultsPerIp; + private final int resultsPerKey; + public UrlDeduplicator(int resultsPerKey) { + this.resultsPerKey = resultsPerKey; } public boolean shouldRemove(EdgeUrlDetails details) { @@ -25,20 +24,16 @@ public class UrlDeduplicator { if (!seenDataHashes.add(details.getDataHash())) { return false; } - if (Strings.isNullOrEmpty(details.getIp())) { - final var domain = details.getUrl().getDomain(); - final String key; + final var domain = details.getUrl().getDomain(); + final String key; - if (!details.isSpecialDomain()) { - key = domain.getLongDomainKey(); - } - else { - key = domain.getDomainKey(); - } - - return ipCount.adjustOrPutValue(key, 1, 1) <= resultsPerIp; + if (!details.isSpecialDomain()) { + key = domain.getLongDomainKey(); + } + else { + key = domain.getDomainKey(); } - return ipCount.adjustOrPutValue(details.getIp(), 1, 1) < resultsPerIp; + return keyCount.adjustOrPutValue(key, 1, 1) < resultsPerKey; } } diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/siteinfo/DomainInformationService.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/siteinfo/DomainInformationService.java index d12b4a41..dce74c28 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/siteinfo/DomainInformationService.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/siteinfo/DomainInformationService.java @@ -2,7 +2,7 @@ package nu.marginalia.wmsa.edge.search.siteinfo; import com.zaxxer.hikari.HikariDataSource; import lombok.SneakyThrows; -import nu.marginalia.wmsa.edge.data.dao.EdgeDataStoreDaoImpl; +import nu.marginalia.wmsa.edge.dbcommon.EdgeDataStoreDaoImpl; import nu.marginalia.wmsa.edge.model.EdgeDomain; import nu.marginalia.wmsa.edge.model.crawl.EdgeDomainIndexingState; import nu.marginalia.wmsa.edge.model.id.EdgeId; diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/svc/EdgeSearchAddToCrawlQueueService.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/svc/EdgeSearchAddToCrawlQueueService.java index f2421ded..694ebc5e 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/svc/EdgeSearchAddToCrawlQueueService.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/svc/EdgeSearchAddToCrawlQueueService.java @@ -3,7 +3,7 @@ package nu.marginalia.wmsa.edge.search.svc; import com.google.inject.Inject; import com.zaxxer.hikari.HikariDataSource; import nu.marginalia.wmsa.configuration.WebsiteUrl; -import nu.marginalia.wmsa.edge.data.dao.EdgeDataStoreDao; +import nu.marginalia.wmsa.edge.dbcommon.EdgeDataStoreDao; import nu.marginalia.wmsa.edge.model.id.EdgeId; import org.slf4j.Logger; import org.slf4j.LoggerFactory; diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/svc/EdgeSearchDomainSearchService.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/svc/EdgeSearchDomainSearchService.java index 40e61b11..74e8681c 100644 --- 
a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/svc/EdgeSearchDomainSearchService.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/svc/EdgeSearchDomainSearchService.java @@ -2,9 +2,8 @@ package nu.marginalia.wmsa.edge.search.svc; import com.google.inject.Inject; import nu.marginalia.wmsa.configuration.server.Context; -import nu.marginalia.wmsa.edge.data.dao.EdgeDataStoreDao; +import nu.marginalia.wmsa.edge.dbcommon.EdgeDataStoreDao; import nu.marginalia.wmsa.edge.index.client.EdgeIndexClient; -import nu.marginalia.wmsa.edge.index.model.IndexBlock; import nu.marginalia.wmsa.edge.model.EdgeUrl; import nu.marginalia.wmsa.edge.model.id.EdgeIdList; import nu.marginalia.wmsa.edge.model.id.EdgeIdSet; @@ -35,13 +34,11 @@ public class EdgeSearchDomainSearchService { if (keywords.isEmpty()) return Collections.emptyList(); - List requests = new ArrayList<>(keywords.size() * specs.buckets.size()); + List requests = new ArrayList<>(keywords.size()); for (var keyword : keywords) { - for (var bucket : specs.buckets) { - requests.add(new EdgeDomainSearchSpecification(bucket, IndexBlock.Link, keyword, - 1_000_000, 3, 25)); - } + requests.add(new EdgeDomainSearchSpecification(keyword, + 1_000_000, 3, 25)); } EdgeIdSet dedup = new EdgeIdSet<>(); diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/svc/EdgeSearchQueryIndexService.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/svc/EdgeSearchQueryIndexService.java index 93e61562..f5d6c0aa 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/svc/EdgeSearchQueryIndexService.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/svc/EdgeSearchQueryIndexService.java @@ -4,8 +4,9 @@ import com.google.inject.Inject; import com.google.inject.Singleton; import nu.marginalia.wmsa.configuration.server.Context; import nu.marginalia.wmsa.edge.index.client.EdgeIndexClient; -import nu.marginalia.wmsa.edge.index.model.IndexBlock; +import nu.marginalia.wmsa.edge.index.model.QueryStrategy; import nu.marginalia.wmsa.edge.model.search.*; +import nu.marginalia.wmsa.edge.model.search.domain.SpecificationLimit; import nu.marginalia.wmsa.edge.search.model.EdgeSearchProfile; import nu.marginalia.wmsa.edge.search.query.model.EdgeSearchQuery; import nu.marginalia.wmsa.edge.search.results.SearchResultDecorator; @@ -32,12 +33,30 @@ public class EdgeSearchQueryIndexService { .thenComparing(EdgeUrlDetails::getId); } - public List performDumbQuery(Context ctx, EdgeSearchProfile profile, IndexBlock block, int limitPerDomain, int limitTotal, String... termsInclude) { + public List performDumbQuery(Context ctx, EdgeSearchProfile profile, int limitPerDomain, int limitTotal, String... 
termsInclude) { List sqs = new ArrayList<>(); - sqs.add(new EdgeSearchSubquery(Arrays.asList(termsInclude), Collections.emptyList(), Collections.emptyList(), block)); + sqs.add(new EdgeSearchSubquery( + Arrays.asList(termsInclude), + Collections.emptyList(), + Collections.emptyList(), + Collections.emptyList() + )); - EdgeSearchSpecification specs = new EdgeSearchSpecification(profile.buckets, sqs, Collections.emptyList(), limitPerDomain, limitTotal, "", 150, 2048, null, null); + var specs = EdgeSearchSpecification.builder() + .subqueries(sqs) + .domains(Collections.emptyList()) + .searchSetIdentifier(profile.searchSetIdentifier) + .limitByDomain(limitPerDomain) + .limitTotal(limitTotal) + .humanQuery("") + .timeoutMs(150) + .fetchSize(2048) + .year(SpecificationLimit.none()) + .size(SpecificationLimit.none()) + .quality(SpecificationLimit.none()) + .queryStrategy(QueryStrategy.AUTO) + .build(); return performQuery(ctx, new EdgeSearchQuery(specs)); } diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/valuation/SearchResultValuator.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/valuation/SearchResultValuator.java index 76a74408..7ea78619 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/valuation/SearchResultValuator.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/valuation/SearchResultValuator.java @@ -5,7 +5,7 @@ import com.google.inject.Singleton; import nu.marginalia.util.language.WordPatterns; import nu.marginalia.wmsa.edge.assistant.dict.TermFrequencyDict; import nu.marginalia.wmsa.edge.index.model.EdgePageWordFlags; -import nu.marginalia.wmsa.edge.index.model.IndexBlock; +import nu.marginalia.wmsa.edge.index.model.EdgePageWordMetadata; import nu.marginalia.wmsa.edge.model.search.EdgeSearchResultKeywordScore; import nu.marginalia.wmsa.edge.model.search.EdgeSearchSubquery; import org.jetbrains.annotations.NotNull; @@ -26,16 +26,17 @@ public class SearchResultValuator { private static final int MIN_LENGTH = 2000; private static final int AVG_LENGTH = 5000; + private final int docCount; @Inject public SearchResultValuator(TermFrequencyDict dict) { this.dict = dict; + docCount = dict.docCount(); } public double preEvaluate(EdgeSearchSubquery sq) { final String[] terms = sq.searchTermsInclude.stream().filter(f -> !f.contains(":")).toArray(String[]::new); - final IndexBlock index = sq.block; double termSum = 0.; double factorSum = 0.; @@ -46,7 +47,10 @@ public class SearchResultValuator { final double factor = 1. / (1.0 + weights[i]); factorSum += factor; - termSum += (index.ordinal() + 0.5) * factor; + termSum += factor; // fixme + + // This logic is the casualty of refactoring. It is intended to prioritize search queries + // according to sum-of-idf, but right now it uses many CPU cycles to always calculate the value 1. 
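            // Editorial sketch, not part of this commit: the intended sum-of-idf weighting
            // might look roughly like the following, assuming TermFrequencyDict exposes a
            // per-term document frequency (getTermFreq below is a hypothetical lookup):
            //
            //   double df  = Math.max(1., dict.getTermFreq(terms[i]));
            //   double idf = Math.log(1.0 + (docCount - df + 0.5) / (df + 0.5)); // same idf shape as getBM25 uses
            //   termSum   += idf * factor;  // rarer terms mark the subquery as more selective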
} return termSum / factorSum; @@ -55,28 +59,41 @@ public class SearchResultValuator { public double evaluateTerms(List rawScores, int length, int titleLength) { int sets = 1 + rawScores.stream().mapToInt(EdgeSearchResultKeywordScore::set).max().orElse(0); - double bestPosFactor = 10; double bestScore = 10; double bestAllTermsFactor = 1.; + final double priorityTermBonus; + + if (hasPriorityTerm(rawScores)) { + priorityTermBonus = 0.5; + } + else { + priorityTermBonus = 1; + } + for (int set = 0; set <= sets; set++) { SearchResultsKeywordSet keywordSet = createKeywordSet(rawScores, set); if (keywordSet == null) continue; - final double lengthPenalty = getLengthPenalty(length); - - final double bm25Factor = getBM25(keywordSet, lengthPenalty); + final double bm25Factor = getBM25(keywordSet, length); final double minCountFactor = getMinCountFactor(keywordSet); - final double posFactor = posFactor(keywordSet); bestScore = min(bestScore, bm25Factor * minCountFactor); - bestPosFactor = min(bestPosFactor, posFactor); + bestAllTermsFactor = min(bestAllTermsFactor, getAllTermsFactorForSet(keywordSet, titleLength)); + } - return (0.7 + 0.3 * bestPosFactor) * bestScore * (0.3 + 0.7 * bestAllTermsFactor); + return bestScore * (0.3 + 0.7 * bestAllTermsFactor) * priorityTermBonus; + } + + private boolean hasPriorityTerm(List rawScores) { + return rawScores.stream() + .findAny() + .map(EdgeSearchResultKeywordScore::hasPriorityTerms) + .orElse(false); } private double getMinCountFactor(SearchResultsKeywordSet keywordSet) { @@ -85,55 +102,37 @@ public class SearchResultValuator { int min = 32; for (var keyword : keywordSet) { - min = min(min, keyword.count()); + if (!keyword.wordMetadata.hasFlag(EdgePageWordFlags.Title) && keyword.score.isRegular()) { + min = min(min, keyword.count()); + } } if (min <= 1) return 2; - if (min <= 2) return 1; - if (min <= 3) return 0.75; - return 0.5; + if (min <= 2) return 1.5; + if (min <= 3) return 1.25; + return 1; } - private double getBM25(SearchResultsKeywordSet keywordSet, double lengthPenalty) { + private double getBM25(SearchResultsKeywordSet keywordSet, int length) { + final double scalingFactor = 750.; - // This is a fairly bastardized BM25; the weight factors below are used to - // transform it on a scale from 0 ... 10; where 0 is best, 10+ is worst. - // - // ... 
for historical reasons - // + final double wf1 = 0.7; + double k = 2; - final double wf1 = 1.0; - final double wf2 = 2000.; - - double termSum = 0.; - double factorSum = 0.; + double sum = 0.; for (var keyword : keywordSet) { - double tfIdf = Math.min(255, keyword.tfIdf()); - final double factor = 1.0 / (1.0 + keyword.weight()); + double count = Math.min(255, keyword.count()); + double wt = keyword.weight() * keyword.weight() / keywordSet.length(); - factorSum += factor; - termSum += (1 + wf1*tfIdf) * factor; + final double invFreq = Math.log(1.0 + (docCount - wt + 0.5)/(wt + 0.5)); + + sum += invFreq * (count * (k + 1)) / (count + k * (1 - wf1 + wf1 * AVG_LENGTH/length)); } - termSum /= lengthPenalty; - - return Math.sqrt(wf2 / (termSum / factorSum)); + return Math.sqrt(scalingFactor / sum); } - private double posFactor(SearchResultsKeywordSet keywordSet) { - // Penalize keywords that first appear late in the document - - double avgPos = 0; - for (var keyword : keywordSet) { - avgPos += keyword.score().firstPos(); - } - avgPos /= keywordSet.length(); - - return Math.sqrt(1 + avgPos / 3.); - } - - private double getAllTermsFactorForSet(SearchResultsKeywordSet set, int titleLength) { double totalFactor = 1.; @@ -146,9 +145,55 @@ public class SearchResultValuator { totalFactor *= getAllTermsFactor(keyword, totalWeight, titleLength); } + totalFactor = calculateTermCoherencePenalty(set, totalFactor); + return totalFactor; } + private double calculateTermCoherencePenalty(SearchResultsKeywordSet keywordSet, double f) { + long maskDirect = ~0; + long maskAdjacent = ~0; + byte excludeMask = (byte) (EdgePageWordFlags.Title.asBit() | EdgePageWordFlags.Subjects.asBit() | EdgePageWordFlags.Synthetic.asBit()); + + for (var keyword : keywordSet.keywords) { + var meta = keyword.wordMetadata; + long positions; + + if (meta.isEmpty()) { + return f; + } + + positions = meta.positions(); + + if (!EdgePageWordMetadata.hasAnyFlags(meta.flags(), excludeMask)) + { + maskDirect &= positions; + maskAdjacent &= (positions | (positions << 1) | (positions >>> 1)); + } + } + + if (maskAdjacent == 0) { + return 1.2 * f; + } + + if (maskDirect == 0) { + return 1.1 * f; + } + + + if (maskDirect != ~0L) { + double locationFactor = 0.65 + Math.max(0., + 0.35 * Long.numberOfTrailingZeros(maskDirect) / 16. + - Math.sqrt(Long.bitCount(maskDirect) - 1) / 5. 
+ ); + + return f * locationFactor; + } + else { + return f; + } + } + private double getAllTermsFactor(SearchResultsKeyword keyword, double totalWeight, int titleLength) { double f = 1.; @@ -264,15 +309,19 @@ public class SearchResultValuator { } - private record SearchResultsKeyword(EdgeSearchResultKeywordScore score, double weight) { + private record SearchResultsKeyword(EdgeSearchResultKeywordScore score, EdgePageWordMetadata wordMetadata, double weight) { + public SearchResultsKeyword(EdgeSearchResultKeywordScore score, double weight) { + this(score, new EdgePageWordMetadata(score.encodedWordMetadata()), weight); + } + public int tfIdf() { - return score.metadata().tfIdf(); + return wordMetadata.tfIdf(); } public int count() { - return score.metadata().count(); + return wordMetadata.count(); } public EnumSet flags() { - return score().metadata().flags(); + return wordMetadata.flagSet(); } } diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/tools/FeaturesLoaderTool.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/tools/FeaturesLoaderTool.java index 55590d1a..a20fc294 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/tools/FeaturesLoaderTool.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/tools/FeaturesLoaderTool.java @@ -6,7 +6,7 @@ import nu.marginalia.wmsa.configuration.server.Context; import nu.marginalia.wmsa.edge.converting.interpreter.instruction.DocumentKeywords; import nu.marginalia.wmsa.edge.converting.processor.logic.HtmlFeature; import nu.marginalia.wmsa.edge.index.client.EdgeIndexClient; -import nu.marginalia.wmsa.edge.index.model.IndexBlock; +import nu.marginalia.wmsa.edge.index.model.EdgePageDocumentsMetadata; import nu.marginalia.wmsa.edge.model.id.EdgeId; import java.io.IOException; @@ -49,13 +49,12 @@ public class FeaturesLoaderTool { } client.putWords(Context.internal(), new EdgeId<>(domainId), new EdgeId<>(urlId), - new DocumentKeywords(IndexBlock.Meta, new String[] { feature.getKeyword() }, new long[] { 0 }) + new EdgePageDocumentsMetadata(EdgePageDocumentsMetadata.defaultValue()), + new DocumentKeywords(new String[] { feature.getKeyword() }, new long[] { 0 }) , 0); }); - } catch (IOException e) { - throw new RuntimeException(e); - } catch (SQLException e) { + } catch (IOException | SQLException e) { throw new RuntimeException(e); } } diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/tools/IndexJournalDumpTool.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/tools/IndexJournalDumpTool.java index dd61ea28..323b1279 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/tools/IndexJournalDumpTool.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/tools/IndexJournalDumpTool.java @@ -2,8 +2,8 @@ package nu.marginalia.wmsa.edge.tools; import com.google.common.hash.Hashing; import net.agkn.hll.HLL; -import nu.marginalia.util.multimap.MultimapFileLong; -import nu.marginalia.wmsa.edge.index.journal.SearchIndexJournalReader; +import nu.marginalia.util.array.LongArray; +import nu.marginalia.wmsa.edge.index.postings.journal.reader.SearchIndexJournalReaderSingleFile; import java.io.IOException; import java.nio.file.Path; @@ -27,7 +27,7 @@ public class IndexJournalDumpTool { } private static void cardinality(Path file) throws IOException { - var reader = new SearchIndexJournalReader(MultimapFileLong.forReading(file)); + var reader = new SearchIndexJournalReaderSingleFile(LongArray.mmapRead(file)); HLL hyperloglog = new HLL(30, 1); var hashFunction = Hashing.murmur3_128(); @@ 
-39,9 +39,9 @@ IndexJournalDumpTool { } private static void dump(Path file) throws IOException { - var reader = new SearchIndexJournalReader(MultimapFileLong.forReading(file)); + var reader = new SearchIndexJournalReaderSingleFile(LongArray.mmapRead(file)); for (var entry : reader) { - System.out.printf("%s\t%010d\t%06d:%08d\n", entry.block(), entry.docId(), entry.domainId(), entry.urlId()); + System.out.printf("%010d\t%06d:%08d\n", entry.docId(), entry.domainId(), entry.urlId()); } } } diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/tools/IndexMergerMain.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/tools/IndexMergerMain.java deleted file mode 100644 index 05c67481..00000000 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/tools/IndexMergerMain.java +++ /dev/null @@ -1,202 +0,0 @@ -package nu.marginalia.wmsa.edge.tools; - -import com.google.inject.Inject; -import gnu.trove.set.hash.TIntHashSet; -import lombok.SneakyThrows; -import nu.marginalia.util.ranking.RankingDomainFetcher; -import nu.marginalia.wmsa.configuration.module.DatabaseModule; -import nu.marginalia.wmsa.edge.data.dao.task.EdgeDomainBlacklist; -import nu.marginalia.wmsa.edge.data.dao.task.EdgeDomainBlacklistImpl; -import nu.marginalia.wmsa.edge.index.model.RankingSettings; -import nu.marginalia.wmsa.edge.index.conversion.SearchIndexDao; -import nu.marginalia.wmsa.edge.index.conversion.SearchIndexPartitioner; -import org.mariadb.jdbc.Driver; -import org.roaringbitmap.longlong.Roaring64Bitmap; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -import java.io.File; -import java.io.IOException; -import java.io.RandomAccessFile; -import java.nio.ByteBuffer; -import java.nio.channels.FileChannel; -import java.nio.file.Files; -import java.util.Objects; - -public class IndexMergerMain { - private static final int CHUNK_HEADER_SIZE = 16; - - private final Logger logger = LoggerFactory.getLogger(getClass()); - private final SearchIndexPartitioner partitioner; - private final TIntHashSet spamDomains; - - @SneakyThrows - public static long wordCount(File inputFile) { - try (RandomAccessFile raf = new RandomAccessFile(inputFile, "r")) { - raf.readLong(); - return raf.readInt(); - } - } - - public static void main(String... 
args) { - Driver driver = new Driver(); - - File file1 = new File(args[0]); - File file2 = new File(args[1]); - File outputFile = new File(args[2]); - - if (!file1.exists()) { - System.err.println("File " + file1 + " does not exist"); - return; - } - if (!file2.exists()) { - System.err.println("File " + file2 + " does not exist"); - return; - } - - if (outputFile.exists()) { // Footgun prevention - System.err.println("File " + outputFile + " already exists"); - return; - } - - var hikari = new DatabaseModule().provideConnection(); - var ds = new DatabaseModule().provideConnection(); - var domains = new RankingDomainFetcher(ds, new EdgeDomainBlacklistImpl(ds)); - var partitioner = new SearchIndexPartitioner(new SearchIndexDao(hikari, domains, new RankingSettings())); - var blacklist = new EdgeDomainBlacklistImpl(hikari); - - new IndexMergerMain(file1, file2, outputFile, partitioner, blacklist); - } - - - @SneakyThrows - @Inject - public IndexMergerMain(File inputFile1, File inputFile2, - File outputFile, - SearchIndexPartitioner partitioner, - EdgeDomainBlacklist blacklist) - { - this.partitioner = partitioner; - this.spamDomains = blacklist.getSpamDomains(); - - if (outputFile.exists()) { - Files.deleteIfExists(Objects.requireNonNull(outputFile).toPath()); - } - - Roaring64Bitmap secondFileIndices = findIndices(inputFile2); - - RandomAccessFile randomAccessFile = new RandomAccessFile(outputFile, "rw"); - randomAccessFile.seek(12); - - FileChannel outputFileChannel = randomAccessFile.getChannel(); - - int wc1 = copyToOutputFile(inputFile2, outputFileChannel, secondFileIndices, true); - int wc2 = copyToOutputFile(inputFile1, outputFileChannel, secondFileIndices, false); - - long pos = randomAccessFile.getFilePointer(); - - randomAccessFile.seek(0); - randomAccessFile.writeLong(pos); - randomAccessFile.writeInt(Math.max(wc1, wc2)); - outputFileChannel.force(true); - outputFileChannel.close(); - randomAccessFile.close(); - } - - private Roaring64Bitmap findIndices(File file) throws IOException { - Roaring64Bitmap ret = new Roaring64Bitmap(); - - logger.info("Mapping indices in {}", file); - - try (final RandomAccessFile raf = new RandomAccessFile(file, "r"); var channel = raf.getChannel()) { - - var fileLength = raf.readLong(); - raf.readInt(); - - ByteBuffer inByteBuffer = ByteBuffer.allocateDirect(10_000); - - while (channel.position() < fileLength) { - inByteBuffer.clear(); - inByteBuffer.limit(CHUNK_HEADER_SIZE); - channel.read(inByteBuffer); - inByteBuffer.flip(); - long urlId = inByteBuffer.getLong(); - int chunkBlock = inByteBuffer.getInt(); - int count = inByteBuffer.getInt(); - inByteBuffer.limit(count * 4 + CHUNK_HEADER_SIZE); - channel.read(inByteBuffer); - - ret.add(encodeId(urlId, chunkBlock)); - } - } - - logger.info("Cardinality = {}", ret.getLongCardinality()); - - return ret; - } - - private int copyToOutputFile(File inFile, FileChannel outFile, Roaring64Bitmap urlIdAndBlock, boolean ifInSet) throws IOException { - int wordCount = 0; - - logger.info("Copying from {}", inFile); - long skippedWrongFile = 0; - long skippedBadUrl = 0; - try (final RandomAccessFile raf = new RandomAccessFile(inFile, "r"); var channel = raf.getChannel()) { - - var fileLength = raf.readLong(); - raf.readInt(); - - ByteBuffer inByteBuffer = ByteBuffer.allocateDirect(10_000); - - while (channel.position() < fileLength) { - inByteBuffer.clear(); - inByteBuffer.limit(CHUNK_HEADER_SIZE); - channel.read(inByteBuffer); - inByteBuffer.flip(); - long urlId = inByteBuffer.getLong(); - int chunkBlock = 
inByteBuffer.getInt(); - int count = inByteBuffer.getInt(); - inByteBuffer.limit(count*4+CHUNK_HEADER_SIZE); - channel.read(inByteBuffer); - inByteBuffer.position(CHUNK_HEADER_SIZE); - - for (int i = 0; i < count; i++) { - wordCount = Math.max(wordCount, 1+inByteBuffer.getInt()); - } - - inByteBuffer.position(count*4+CHUNK_HEADER_SIZE); - - if (urlIdAndBlock.contains(encodeId(urlId, chunkBlock)) == ifInSet) { - if (isUrlAllowed(urlId)) { - inByteBuffer.flip(); - - while (inByteBuffer.position() < inByteBuffer.limit()) - outFile.write(inByteBuffer); - } - else { - skippedBadUrl++; - } - } - else { - skippedWrongFile++; - } - } - - } - - logger.info("Skipped {}, {}", skippedBadUrl, skippedWrongFile); - return wordCount; - } - - private long encodeId(long urlId, int chunkBlock) { - return ((urlId & 0xFFFF_FFFFL) << 4L) | chunkBlock; - } - - private boolean isUrlAllowed(long url) { - int urlId = (int)(url & 0xFFFF_FFFFL); - int domainId = (int)(url >>> 32); - - return partitioner.isGoodUrl(urlId) && !spamDomains.contains(domainId); - } - -} diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/tools/SearchIndexScrubberMain.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/tools/SearchIndexScrubberMain.java index 846b4751..9cb945e4 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/tools/SearchIndexScrubberMain.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/tools/SearchIndexScrubberMain.java @@ -1,6 +1,5 @@ package nu.marginalia.wmsa.edge.tools; -import nu.marginalia.wmsa.edge.index.model.IndexBlock; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -57,15 +56,6 @@ public class SearchIndexScrubberMain { inByteBuffer.putInt(chunkBlock); inByteBuffer.putInt(count); channel.read(inByteBuffer); - - - if (chunkBlock == IndexBlock.Link.ordinal()) { - for (int i = 0; i < randomAccessFiles.length; i++) { - inByteBuffer.flip(); - fileChannels[i].write(inByteBuffer); - } - } - } long size = randomAccessFiles[0].getFilePointer(); diff --git a/marginalia_nu/src/main/resources/templates/edge/parts/search-footer.hdb b/marginalia_nu/src/main/resources/templates/edge/parts/search-footer.hdb index 5454f364..fa858789 100644 --- a/marginalia_nu/src/main/resources/templates/edge/parts/search-footer.hdb +++ b/marginalia_nu/src/main/resources/templates/edge/parts/search-footer.hdb @@ -45,9 +45,16 @@
      ip:127.0.0.1         Search documents hosted at 127.0.0.1
      links:example.com    Search documents linking to example.com
-     q:-5                 The amount of javascript and modern features is at least 5 (on a scale 0 to 25)
-     q:5                  The amount of javascript and modern features is at most 5 (on a scale 0 to 25)
-     r:5000               The domain ranking is at most 5000 (goes up to about 100k)
+     tld:edu keyword      Search documents with the top level domain edu.
+     ?tld:edu keyword     Prefer but do not require results with the top level domain edu.
+                          This syntax is also possible for links:..., ip:... and site:...
+
+     q>5                  The amount of javascript and modern features is at least 5 (on a scale 0 to 25)
+     q<5                  The amount of javascript and modern features is at most 5 (on a scale 0 to 25)
+
+     year>2005            The document was ostensibly published in or after 2005
+     year=2005            The document was ostensibly published in 2005
+     year<2005            The document was ostensibly published in or before 2005
      format:html5         Filter documents using the HTML5 standard. This is typically modern websites.
      format:xhtml         Filter documents using the XHTML standard
diff --git a/marginalia_nu/src/test/java/nu/marginalia/util/TestLanguageModels.java b/marginalia_nu/src/test/java/nu/marginalia/util/TestLanguageModels.java index f8554868..cdd23c4f 100644 --- a/marginalia_nu/src/test/java/nu/marginalia/util/TestLanguageModels.java +++ b/marginalia_nu/src/test/java/nu/marginalia/util/TestLanguageModels.java @@ -27,7 +27,7 @@ public class TestLanguageModels { return new LanguageModels( languageModelsHome.resolve("ngrams.bin"), - languageModelsHome.resolve("tfreq-generous-emstr.bin"), + languageModelsHome.resolve("tfreq-new-algo3.bin"), languageModelsHome.resolve("opennlp-sentence.bin"), languageModelsHome.resolve("English.RDR"), languageModelsHome.resolve("English.DICT"), diff --git a/marginalia_nu/src/test/java/nu/marginalia/util/TestUtil.java b/marginalia_nu/src/test/java/nu/marginalia/util/TestUtil.java index 26d397a8..ae759dcb 100644 --- a/marginalia_nu/src/test/java/nu/marginalia/util/TestUtil.java +++ b/marginalia_nu/src/test/java/nu/marginalia/util/TestUtil.java @@ -3,7 +3,6 @@ package nu.marginalia.util; import com.zaxxer.hikari.HikariConfig; import com.zaxxer.hikari.HikariDataSource; import lombok.SneakyThrows; -import org.junit.jupiter.api.Assertions; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -11,10 +10,11 @@ public class TestUtil { private static final int TEST_PORT_BASE = 6000; private static final int TEST_PORT_RANGE = 2000; + private final static Logger logger = LoggerFactory.getLogger(TestUtil.class); + public static int getPort() { return TEST_PORT_BASE + (int)(TEST_PORT_RANGE * Math.random()); } - private final static Logger logger = LoggerFactory.getLogger(TestUtil.class); @SneakyThrows public static HikariDataSource getConnection() { @@ -35,27 +35,4 @@ return new HikariDataSource(config); } - @SneakyThrows - public static void evalScript(HikariDataSource hds, String scriptFile) { - - try (var conn = hds.getConnection()) { - - logger.info("Running script {}", scriptFile); - try (var scriptStream = ClassLoader.getSystemResourceAsStream(scriptFile); - var stmt = conn.createStatement()) { - for (String s : new String(scriptStream.readAllBytes()).split("(;|---)")) { - if (!s.isBlank()) { - try { - Assertions.assertTrue(stmt.executeUpdate(s) >= 0); - } catch (Exception ex) { - logger.error("Failed to execute\n{}" + s, ex); - } - - } - } - } - } - } - - } diff --git a/marginalia_nu/src/test/java/nu/marginalia/util/TransformListTest.java b/marginalia_nu/src/test/java/nu/marginalia/util/TransformListTest.java new file mode 100644 index 00000000..15b1ccde --- /dev/null +++ b/marginalia_nu/src/test/java/nu/marginalia/util/TransformListTest.java @@ -0,0 +1,120 @@ +package nu.marginalia.util; + +import org.junit.jupiter.api.Test; + +import java.util.List; +import java.util.stream.Collectors; +import java.util.stream.Stream; + +import static org.junit.jupiter.api.Assertions.assertEquals; + +class TransformListTest { + + @Test + void transformEach() { + + List values = Stream.of(1,2,3,4).collect(Collectors.toList()); + new TransformList<>(values).transformEach(e -> { + int v = e.value; + if (v == 1) e.remove(); + if (v == 2) e.replace(5); + if (v == 4) e.remove(); + }); + + assertEquals(List.of(5,3), values); + } + + @Test + void transformEachPairRemoveReplace() { + List values = Stream.of(1,2,3,4,5,6).collect(Collectors.toList()); + new TransformList<>(values).transformEachPair((a,b) -> { + System.out.println(a.value + ":" + b.value); + int v = a.value; + if 
(v == 1 || v == 3 || v == 5) { + a.remove(); + b.replace(-b.value); + } + + }); + + assertEquals(List.of(-2, -4, -6), values); + } + + @Test + void transformEachPairRemoveRemove() { + List values = Stream.of(1,2,3,4,5,6).collect(Collectors.toList()); + new TransformList<>(values).transformEachPair((a,b) -> { + System.out.println(a.value + ":" + b.value); + int v = a.value; + if (v == 1 || v == 3 || v == 5) { + a.remove(); + b.remove(); + } + + }); + + assertEquals(List.of(), values); + } + + @Test + void transformEachPairReplaceRemove() { + List values = Stream.of(1,2,3,4,5,6).collect(Collectors.toList()); + new TransformList<>(values).transformEachPair((a,b) -> { + System.out.println(a.value + ":" + b.value); + int v = a.value; + if (v == 1 || v == 3 || v == 5) { + a.replace(-a.value); + b.remove(); + } + + }); + + assertEquals(List.of(-1, -3, -5), values); + } + + @Test + void transformEachPairReplaceReplace() { + List values = Stream.of(1,2,3,4,5,6).collect(Collectors.toList()); + new TransformList<>(values).transformEachPair((a,b) -> { + System.out.println(a.value + ":" + b.value); + int v = a.value; + if (v == 1 || v == 3 || v == 5) { + a.replace(-a.value); + b.replace(-b.value); + } + + }); + + assertEquals(List.of(-1, -2, -3, -4, -5, -6), values); + } + + @Test + void scanAndTransform() { + List values = Stream.of(1,2,3,4,5,6,7,8,9,10).collect(Collectors.toList()); + new TransformList<>(values).scanAndTransform(Integer.valueOf(3)::equals, Integer.valueOf(7)::equals, entity -> { + entity.replace(entity.value * 2); + }); + + assertEquals(List.of(1,2,6,8,10,12,14,8,9,10), values); + } + + @Test + void scanAndTransformEndsAtEnd() { + List values = Stream.of(1,2,3,4,5,6,7,8,9,10).collect(Collectors.toList()); + new TransformList<>(values).scanAndTransform(Integer.valueOf(3)::equals, Integer.valueOf(10)::equals, entity -> { + entity.replace(entity.value * 2); + }); + + assertEquals(List.of(1,2,6,8,10,12,14,16,18,20), values); + } + + @Test + void scanAndTransformOverlap() { + List values = Stream.of(1,2,3,3,5,7,7,8,9,10).collect(Collectors.toList()); + new TransformList<>(values).scanAndTransform(Integer.valueOf(3)::equals, Integer.valueOf(7)::equals, entity -> { + entity.replace(entity.value * 2); + }); + + assertEquals(List.of(1, 2, 6, 6, 10, 14, 7, 8, 9, 10), values); + } +} \ No newline at end of file diff --git a/marginalia_nu/src/test/java/nu/marginalia/util/array/IntLowBitPartitioningSchemeTest.java b/marginalia_nu/src/test/java/nu/marginalia/util/array/IntLowBitPartitioningSchemeTest.java new file mode 100644 index 00000000..25c42338 --- /dev/null +++ b/marginalia_nu/src/test/java/nu/marginalia/util/array/IntLowBitPartitioningSchemeTest.java @@ -0,0 +1,21 @@ +package nu.marginalia.util.array; + +import nu.marginalia.util.array.scheme.SequentialPartitioningScheme; +import org.junit.jupiter.api.Test; + +class IntLowBitPartitioningSchemeTest { + + @Test + public void testLBPT() { + var p = new SequentialPartitioningScheme(18); + + System.out.println(p.getRequiredPageSize(0, 51)); + System.out.println(p.getRequiredPageSize(1, 51)); + System.out.println(p.getRequiredPageSize(2, 51)); + System.out.println(p.getRequiredPageSize(3, 51)); + + for (int i = 0; i < 100; i++) { + System.out.println(p.getPage(i) + ":" + p.getOffset(i)); + } + } +} \ No newline at end of file diff --git a/marginalia_nu/src/test/java/nu/marginalia/util/array/PagingIntArrayTest.java b/marginalia_nu/src/test/java/nu/marginalia/util/array/PagingIntArrayTest.java new file mode 100644 index 00000000..e6acfac8 --- 
/dev/null +++ b/marginalia_nu/src/test/java/nu/marginalia/util/array/PagingIntArrayTest.java @@ -0,0 +1,94 @@ +package nu.marginalia.util.array; + +import nu.marginalia.util.array.page.PagingIntArray; +import nu.marginalia.util.array.page.PagingLongArray; +import nu.marginalia.util.array.scheme.SequentialPartitioningScheme; +import nu.marginalia.util.test.TestUtil; +import org.junit.jupiter.api.AfterEach; +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.Test; + +import java.io.IOException; +import java.nio.channels.FileChannel; +import java.nio.file.Files; +import java.nio.file.Path; +import java.nio.file.StandardOpenOption; + +import static org.junit.jupiter.api.Assertions.assertEquals; + +class PagingIntArrayTest { + Path tempDir; + + @BeforeEach + public void setUp() throws IOException { + tempDir = Files.createTempDirectory(getClass().getSimpleName()); + } + + @AfterEach + public void tearDown() { + TestUtil.clearTempDir(tempDir); + } + + @Test + public void testReadLoad() throws IOException { + SequentialPartitioningScheme partitioningScheme = new SequentialPartitioningScheme(7); + Path file = Files.createTempFile(tempDir, "test", "dat"); + + var heapArray = PagingIntArray.newOnHeap(partitioningScheme, 51); + for (int i = 0; i < 51; i++) { + heapArray.set(i, 2 * i); + } + heapArray.write(file); + + + var diskArray = PagingIntArray.mapFileReadOnly(partitioningScheme, file); + for (int i = 0; i < 51; i++) { + assertEquals(2 * i, diskArray.get(i)); + } + + } + + @Test + public void testReadLoadLong() throws IOException { + SequentialPartitioningScheme partitioningScheme = new SequentialPartitioningScheme(7); + Path file = Files.createTempFile(tempDir, "test", "dat"); + + var heapArray = PagingLongArray.newOnHeap(partitioningScheme, 51); + for (int i = 0; i < 51; i++) { + heapArray.set(i, 2 * i); + } + heapArray.write(file); + + + var diskArray = PagingLongArray.mapFileReadOnly(partitioningScheme, file); + for (int i = 0; i < 51; i++) { + assertEquals(2 * i, diskArray.get(i)); + } + } + + @Test + public void testReadFromFileChannel() throws IOException { + SequentialPartitioningScheme partitioningScheme = new SequentialPartitioningScheme(7); + Path file = Files.createTempFile(tempDir, "test", "dat"); + + var heapArray = PagingLongArray.newOnHeap(partitioningScheme, 51); + for (int i = 0; i < 51; i++) { + heapArray.set(i, 2 * i); + } + heapArray.write(file); + + try (var channel = (FileChannel) Files.newByteChannel(file, StandardOpenOption.READ)) { + + var heapArray2 = PagingLongArray.newOnHeap(partitioningScheme, 51); + heapArray2.transferFrom(channel, 10, 7, 20); + + var heapArray3 = PagingLongArray.newPartitionedOnHeap(partitioningScheme, 51); + heapArray3.transferFrom(channel, 10, 7, 20); + + for (int i = 0; i < 51; i++) { + System.out.println(i + ":" + heapArray2.get(i)); + assertEquals(heapArray3.get(i), heapArray2.get(i)); + } + } + } +} \ No newline at end of file diff --git a/marginalia_nu/src/test/java/nu/marginalia/util/array/algo/IntArraySearchTest.java b/marginalia_nu/src/test/java/nu/marginalia/util/array/algo/IntArraySearchTest.java new file mode 100644 index 00000000..c000c16f --- /dev/null +++ b/marginalia_nu/src/test/java/nu/marginalia/util/array/algo/IntArraySearchTest.java @@ -0,0 +1,131 @@ +package nu.marginalia.util.array.algo; + +import nu.marginalia.util.array.IntArray; +import nu.marginalia.util.array.buffer.IntQueryBuffer; +import nu.marginalia.util.array.page.PagingIntArray; +import 
nu.marginalia.util.array.scheme.PowerOf2PartitioningScheme; +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.Test; + +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertTrue; + +class IntArraySearchTest { + + IntArray basicArray = IntArray.allocate(1024); + IntArray pagingArray = PagingIntArray.newOnHeap(new PowerOf2PartitioningScheme(64), 1024); + + IntArray shiftedArray = IntArray.allocate(1054).range(30, 1054); + + @BeforeEach + public void setUp() { + for (int i = 0; i < basicArray.size(); i++) { + basicArray.set(i, 3*i); + pagingArray.set(i, 3*i); + shiftedArray.set(i, 3*i); + } + } + + @Test + void linearSearch() { + linearSearchTester(basicArray); + linearSearchTester(pagingArray); + linearSearchTester(shiftedArray); + } + + @Test + void binarySearch() { + binarySearchTester(basicArray); + binarySearchTester(pagingArray); + binarySearchTester(shiftedArray); + } + + @Test + void binarySearchUpperbound() { + binarySearchUpperBoundTester(basicArray); + binarySearchUpperBoundTester(pagingArray); + binarySearchUpperBoundTester(shiftedArray); + } + + void linearSearchTester(IntArray array) { + for (int i = 0; i < array.size() * 3; i++) { + long ret = array.linearSearch(i, 0, array.size()); + + if ((i % 3) == 0) { + assertTrue(ret >= 0); + assertEquals(i, array.get(ret)); + } + else { + long higher = LongArraySearch.decodeSearchMiss(ret); + if (i > 0 && higher < array.size()) { + assertTrue(array.get(higher) < i); + } + } + } + } + + void binarySearchTester(IntArray array) { + for (int i = 0; i < array.size() * 3; i++) { + long ret = array.binarySearch(i, 0, array.size()); + + if ((i % 3) == 0) { + assertTrue(ret >= 0); + assertEquals(i, array.get(ret)); + } + else { + long higher = LongArraySearch.decodeSearchMiss(ret); + if (i > 0 && higher+1 < array.size()) { + assertTrue(array.get(higher) < i); + } + } + } + } + + void binarySearchUpperBoundTester(IntArray array) { + for (int i = 0; i < array.size() * 3; i++) { + long ret = array.binarySearchUpperBound(i, 0, array.size()); + + if ((i % 3) == 0) { + assertTrue(ret >= 0); + assertEquals(i, array.get(ret)); + } + else { + if (i > 0 && ret > 0 && ret < array.size()) { + assertTrue(array.get(ret-1) < i); + } + } + } + } + + @Test + void retain() { + int[] vals = new int[128]; + for (int i = 0; i < vals.length; i++) { vals[i] = i; } + var buffer = new IntQueryBuffer(vals, 128); + + basicArray.retain(buffer, 128, 0, basicArray.size()); + buffer.finalizeFiltering(); + + assertEquals(43, buffer.size()); + for (int i = 0; i < 43; i++) { + assertEquals(buffer.data[i], i*3); + } + } + + @Test + void reject() { + int[] vals = new int[128]; + for (int i = 0; i < vals.length; i++) { vals[i] = i; } + var buffer = new IntQueryBuffer(vals, 128); + + basicArray.reject(buffer, 128, 0, basicArray.size()); + buffer.finalizeFiltering(); + + assertEquals(128-43, buffer.size()); + int j = 0; + for (int i = 0; i < 43; i++) { + if (++j % 3 == 0) j++; + assertEquals(buffer.data[i], j); + } + } +} \ No newline at end of file diff --git a/marginalia_nu/src/test/java/nu/marginalia/util/array/algo/IntArraySortTest.java b/marginalia_nu/src/test/java/nu/marginalia/util/array/algo/IntArraySortTest.java new file mode 100644 index 00000000..6bcda5b2 --- /dev/null +++ b/marginalia_nu/src/test/java/nu/marginalia/util/array/algo/IntArraySortTest.java @@ -0,0 +1,152 @@ +package nu.marginalia.util.array.algo; + +import nu.marginalia.util.array.IntArray; +import 
nu.marginalia.util.array.page.IntArrayPage; +import nu.marginalia.util.array.page.PagingIntArray; +import nu.marginalia.util.array.scheme.PowerOf2PartitioningScheme; +import nu.marginalia.util.test.TestUtil; +import org.apache.commons.lang3.ArrayUtils; +import org.junit.jupiter.api.Assertions; +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.Tag; +import org.junit.jupiter.api.Test; + +import java.io.IOException; +import java.nio.file.Files; +import java.nio.file.Path; +import java.util.Random; + +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertTrue; + +@Tag("slow") +class IntArraySortTest { + + IntArray basic; + IntArray paged; + IntArray shifted; + + final int size = 1026; + + @BeforeEach + public void setUp() { + basic = IntArrayPage.onHeap(size); + paged = PagingIntArray.newOnHeap(new PowerOf2PartitioningScheme(32), size); + shifted = IntArrayPage.onHeap(size + 30).shifted(30); + + var random = new Random(); + int[] values = new int[size]; + for (int i = 0; i < size; i++) { + values[i] = random.nextInt(0, 1000); + } + + basic.transformEach(0, size, (i, old) -> values[(int) i]); + paged.transformEach(0, size, (i, old) -> values[(int) i]); + shifted.transformEach(0, size, (i, old) -> values[(int) i]); + } + + interface SortOperation { + void sort(IntArray array, long start, long end) throws IOException; + } + + @Test + public void quickSortStressTest() throws IOException { + IntArray array = IntArray.allocate(65536); + sortAlgorithmTester(array, IntArraySort::quickSort); + } + + + @Test + public void insertionSortStressTest() throws IOException { + IntArray array = IntArray.allocate(8192); + sortAlgorithmTester(array, IntArraySort::insertionSort); + } + + @Test + public void mergeSortStressTest() throws IOException { + IntArray array = IntArray.allocate(65536); + Path tempDir = Files.createTempDirectory(getClass().getSimpleName()); + sortAlgorithmTester(array, (a, s, e) -> a.mergeSort(s, e, tempDir)); + TestUtil.clearTempDir(tempDir); + } + + void sortAlgorithmTester(IntArray array, SortOperation operation) throws IOException { + + int[] values = new int[(int) array.size()]; + for (int i = 0; i < values.length; i++) { + values[i] = i; + } + ArrayUtils.shuffle(values); + + int sentinelA = 0xFEEDBEEF; + int sentinelB = 0xB000B000; + + int start = 6; + for (int end = start + 1; end < values.length - 1; end+=97) { + + // Use sentinel values to catch if the sort algorithm overwrites end values + array.set(start - 1, sentinelA); + array.set(end, sentinelB); + + long orderInvariantChecksum = 0; + for (long i = 0; i < end - start; i++) { + array.set(start + i, values[start + (int)i]); + + // Try to checksum the contents to catch bugs where the result is sorted + // but a value has been duplicated, overwriting another + orderInvariantChecksum ^= values[start + (int)i]; + } + + operation.sort(array, start, end); + + assertTrue(array.isSorted(start, end), "Array wasn't sorted"); + + assertEquals(sentinelA, array.get(start - 1), "Start position sentinel overwritten"); + assertEquals(sentinelB, array.get(end), "End position sentinel overwritten"); + + long actualChecksum = 0; + for (long i = start; i < end; i++) { + actualChecksum ^= array.get(i); + } + + assertEquals(orderInvariantChecksum, actualChecksum, "Checksum validation failed"); + } + + } + + @Test + void insertionSort() { + basic.insertionSort(0, size); + Assertions.assertTrue(basic.isSorted(0, 128)); + + paged.insertionSort(0, size); + 
Assertions.assertTrue(paged.isSorted(0, 128)); + + shifted.insertionSort(0, size); + Assertions.assertTrue(shifted.isSorted(0, 128)); + } + + @Test + void quickSort() { + basic.quickSort(0, size); + Assertions.assertTrue(basic.isSorted(0, size)); + + paged.quickSort(0, size); + Assertions.assertTrue(paged.isSorted(0, size)); + + shifted.quickSort(0, size); + Assertions.assertTrue(shifted.isSorted(0, 128)); + } + + @Test + void mergeSort() throws IOException { + basic.mergeSort(0, size, Path.of("/tmp")); + Assertions.assertTrue(basic.isSorted(0, size)); + + paged.mergeSort(0, size, Path.of("/tmp")); + Assertions.assertTrue(paged.isSorted(0, size)); + + shifted.mergeSort(0, size, Path.of("/tmp")); + Assertions.assertTrue(shifted.isSorted(0, 128)); + } +} \ No newline at end of file diff --git a/marginalia_nu/src/test/java/nu/marginalia/util/array/algo/IntArrayTransformationsTest.java b/marginalia_nu/src/test/java/nu/marginalia/util/array/algo/IntArrayTransformationsTest.java new file mode 100644 index 00000000..8b9b9773 --- /dev/null +++ b/marginalia_nu/src/test/java/nu/marginalia/util/array/algo/IntArrayTransformationsTest.java @@ -0,0 +1,80 @@ +package nu.marginalia.util.array.algo; + +import nu.marginalia.util.array.IntArray; +import nu.marginalia.util.array.page.IntArrayPage; +import nu.marginalia.util.array.page.PagingIntArray; +import nu.marginalia.util.array.scheme.PowerOf2PartitioningScheme; +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.Test; + +import java.io.IOException; + +import static org.junit.jupiter.api.Assertions.assertEquals; + +class IntArrayTransformationsTest { + IntArray basic; + IntArray paged; + IntArray shifted; + + final int size = 1026; + + @BeforeEach + public void setUp() { + basic = IntArrayPage.onHeap(size); + paged = PagingIntArray.newOnHeap(new PowerOf2PartitioningScheme(32), size); + shifted = IntArrayPage.onHeap(size + 30).shifted(30); + + for (int i = 0; i < basic.size(); i++) { + basic.set(i, 3*i); + paged.set(i, 3*i); + shifted.set(i, 3*i); + } + } + + @Test + void transformEach() { + transformTester(basic); + transformTester(paged); + transformTester(shifted); + } + + @Test + void transformEachIO() throws IOException { + transformTesterIO(basic); + transformTesterIO(paged); + transformTesterIO(shifted); + } + + @Test + void foldIO() throws IOException { + assertEquals(3*(5+6+7+8+9), basic.foldIO(0, 5, 10, Integer::sum)); + assertEquals(3*(5+6+7+8+9), paged.foldIO(0, 5, 10, Integer::sum)); + assertEquals(3*(5+6+7+8+9), shifted.foldIO(0, 5, 10, Integer::sum)); + } + + private void transformTester(IntArray array) { + array.transformEach(5, 15, (i, o) -> (int) (i - o)); + for (int i = 0; i < 5; i++) { + assertEquals(3*i, array.get(i)); + } + for (int i = 5; i < 15; i++) { + assertEquals(-2*i, array.get(i)); + } + for (int i = 15; i < 20; i++) { + assertEquals(3*i, array.get(i)); + } + } + + private void transformTesterIO(IntArray array) throws IOException { + array.transformEachIO(5, 15, (i, o) -> (int) (i - o)); + for (int i = 0; i < 5; i++) { + assertEquals(3*i, array.get(i)); + } + for (int i = 5; i < 15; i++) { + assertEquals(-2*i, array.get(i)); + } + for (int i = 15; i < 20; i++) { + assertEquals(3*i, array.get(i)); + } + } +} \ No newline at end of file diff --git a/marginalia_nu/src/test/java/nu/marginalia/util/array/algo/LongArraySearchTest.java b/marginalia_nu/src/test/java/nu/marginalia/util/array/algo/LongArraySearchTest.java new file mode 100644 index 00000000..f3cdae16 --- /dev/null +++ 
b/marginalia_nu/src/test/java/nu/marginalia/util/array/algo/LongArraySearchTest.java @@ -0,0 +1,157 @@ +package nu.marginalia.util.array.algo; + +import nu.marginalia.util.array.LongArray; +import nu.marginalia.util.array.buffer.LongQueryBuffer; +import nu.marginalia.util.array.page.PagingLongArray; +import nu.marginalia.util.array.scheme.PowerOf2PartitioningScheme; +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.Test; + +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertTrue; + +class LongArraySearchTest { + + LongArray basicArray = LongArray.allocate(1024); + LongArray pagingArray = PagingLongArray.newOnHeap(new PowerOf2PartitioningScheme(64), 1024); + + LongArray shiftedArray = LongArray.allocate(1054).range(30, 1054); + + @BeforeEach + public void setUp() { + for (int i = 0; i < basicArray.size(); i++) { + basicArray.set(i, 3L*i); + pagingArray.set(i, 3L*i); + shiftedArray.set(i, 3L*i); + } + } + + @Test + void linearSearch() { + linearSearchTester(basicArray); + linearSearchTester(pagingArray); + linearSearchTester(shiftedArray); + } + + @Test + void binarySearch() { + binarySearchTester(basicArray); + binarySearchTester(pagingArray); + binarySearchTester(shiftedArray); + } + + @Test + void binarySearchUpperBound() { + binarySearchUpperBoundTester(basicArray); + binarySearchUpperBoundTester(pagingArray); + binarySearchUpperBoundTester(shiftedArray); + } + + @Test + void linearSearchUpperBound() { + linearSearchUpperBoundTester(basicArray); + linearSearchUpperBoundTester(pagingArray); + linearSearchUpperBoundTester(shiftedArray); + } + + void linearSearchTester(LongArray array) { + for (int i = 0; i < array.size() * 3; i++) { + long ret = array.linearSearch(i, 0, array.size()); + + if ((i % 3) == 0) { + assertTrue(ret >= 0); + assertEquals(i, array.get(ret)); + } + else { + long higher = LongArraySearch.decodeSearchMiss(ret); + if (i > 0 && higher < array.size()) { + assertTrue(array.get(higher) < i); + } + } + } + } + + void binarySearchTester(LongArray array) { + for (int i = 0; i < array.size() * 3; i++) { + long ret = array.binarySearch(i, 0, array.size()); + + if ((i % 3) == 0) { + assertTrue(ret >= 0); + assertEquals(i, array.get(ret)); + } + else { + long higher = LongArraySearch.decodeSearchMiss(ret); + if (i > 0 && higher+1 < array.size()) { + assertTrue(array.get(higher) < i); + } + } + } + } + + void binarySearchUpperBoundTester(LongArray array) { + for (int i = 0; i < array.size() * 3; i++) { + long ret = array.binarySearchUpperBound(i, 0, array.size()); + + if ((i % 3) == 0) { + assertTrue(ret >= 0); + assertEquals(i, array.get(ret)); + } + else { + if (i > 0 && ret > 0 && ret < array.size()) { + assertTrue(array.get(ret-1) < i); + } + } + } + } + void linearSearchUpperBoundTester(LongArray array) { + for (int i = 0; i < array.size() * 3; i++) { + long ret = array.linearSearchUpperBound(i, 0, array.size()); + long ret2 = array.binarySearchUpperBound(i, 0, array.size()); + + assertEquals(ret, ret2); + + if ((i % 3) == 0) { + assertTrue(ret >= 0); + assertEquals(i, array.get(ret)); + } + else { + if (i > 0 && ret > 0 && ret < array.size()) { + System.out.println(ret); + assertTrue(array.get(ret-1) < i); + } + } + } + } + + @Test + void retain() { + long[] vals = new long[128]; + for (int i = 0; i < vals.length; i++) { vals[i] = i; } + var buffer = new LongQueryBuffer(vals, 128); + + basicArray.retain(buffer, 128, 0, basicArray.size()); + buffer.finalizeFiltering(); + + assertEquals(43, 
buffer.size()); + for (int i = 0; i < 43; i++) { + assertEquals(buffer.data[i], i*3); + } + } + + @Test + void reject() { + long[] vals = new long[128]; + for (int i = 0; i < vals.length; i++) { vals[i] = i; } + var buffer = new LongQueryBuffer(vals, 128); + + basicArray.reject(buffer, 128, 0, basicArray.size()); + buffer.finalizeFiltering(); + + assertEquals(128-43, buffer.size()); + int j = 0; + for (int i = 0; i < 43; i++) { + if (++j % 3 == 0) j++; + assertEquals(buffer.data[i], j); + } + } +} \ No newline at end of file diff --git a/marginalia_nu/src/test/java/nu/marginalia/util/array/algo/LongArraySortTest.java b/marginalia_nu/src/test/java/nu/marginalia/util/array/algo/LongArraySortTest.java new file mode 100644 index 00000000..e417020f --- /dev/null +++ b/marginalia_nu/src/test/java/nu/marginalia/util/array/algo/LongArraySortTest.java @@ -0,0 +1,187 @@ +package nu.marginalia.util.array.algo; + +import nu.marginalia.util.array.LongArray; +import nu.marginalia.util.array.page.LongArrayPage; +import nu.marginalia.util.array.page.PagingLongArray; +import nu.marginalia.util.array.scheme.PowerOf2PartitioningScheme; +import nu.marginalia.util.test.TestUtil; +import org.apache.commons.lang3.ArrayUtils; +import org.junit.jupiter.api.Assertions; +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.Tag; +import org.junit.jupiter.api.Test; + +import java.io.IOException; +import java.nio.file.Files; +import java.nio.file.Path; +import java.util.Random; + +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertTrue; + +@Tag("slow") +class LongArraySortTest { + + LongArray basic; + LongArray paged; + LongArray shifted; + final int size = 1026; + + @BeforeEach + public void setUp() { + basic = LongArrayPage.onHeap(size); + paged = PagingLongArray.newOnHeap(new PowerOf2PartitioningScheme(32), size); + shifted = LongArrayPage.onHeap(size + 30).shifted(30); + + var random = new Random(); + long[] values = new long[size]; + for (int i = 0; i < size; i++) { + values[i] = random.nextInt(0, 1000); + } + + basic.transformEach(0, size, (i, old) -> values[(int) i]); + paged.transformEach(0, size, (i, old) -> values[(int) i]); + shifted.transformEach(0, size, (i, old) -> values[(int) i]); + } + + interface SortOperation { + void sort(LongArray array, long start, long end) throws IOException; + } + + @Test + public void quickSortStressTest() throws IOException { + LongArray array = LongArray.allocate(65536); + sortAlgorithmTester(array, LongArraySort::quickSort); + } + + + @Test + public void insertionSortStressTest() throws IOException { + LongArray array = LongArray.allocate(8192); + sortAlgorithmTester(array, LongArraySort::insertionSort); + } + + @Test + public void mergeSortStressTest() throws IOException { + LongArray array = LongArray.allocate(65536); + Path tempDir = Files.createTempDirectory(getClass().getSimpleName()); + sortAlgorithmTester(array, (a, s, e) -> a.mergeSort(s, e, tempDir)); + TestUtil.clearTempDir(tempDir); + } + + void sortAlgorithmTester(LongArray array, SortOperation operation) throws IOException { + + long[] values = new long[(int) array.size()]; + for (int i = 0; i < values.length; i++) { + values[i] = i; + } + ArrayUtils.shuffle(values); + + long sentinelA = 0xFEEDBEEFL; + long sentinelB = 0xB000B000; + + int start = 6; + for (int end = start + 1; end < values.length - 1; end+=97) { + + // Use sentinel values to catch if the sort algorithm overwrites end values + array.set(start - 1, sentinelA); + 
array.set(end, sentinelB); + + long orderInvariantChecksum = 0; + for (long i = 0; i < end - start; i++) { + array.set(start + i, values[start + (int)i]); + + // Try to checksum the contents to catch bugs where the result is sorted + // but a value has been duplicated, overwriting another + orderInvariantChecksum ^= values[start + (int)i]; + } + + operation.sort(array, start, end); + + assertTrue(array.isSorted(start, end), "Array wasn't sorted"); + + assertEquals(sentinelA, array.get(start - 1), "Start position sentinel overwritten"); + assertEquals(sentinelB, array.get(end), "End position sentinel overwritten"); + + long actualChecksum = 0; + for (long i = start; i < end; i++) { + actualChecksum ^= array.get(i); + } + + assertEquals(orderInvariantChecksum, actualChecksum, "Checksum validation failed"); + } + + } + + @Test + void insertionSort() { + basic.insertionSort(0, size); + Assertions.assertTrue(basic.isSorted(0, 128)); + + paged.insertionSort(0, size); + Assertions.assertTrue(paged.isSorted(0, 128)); + + shifted.insertionSort(0, size); + Assertions.assertTrue(shifted.isSorted(0, 128)); + } + + @Test + void insertionSortN() { + basic.insertionSortN(2, 0, size); + Assertions.assertTrue(basic.isSortedN(2, 0, size)); + + paged.insertionSortN(2, 0, size); + Assertions.assertTrue(paged.isSortedN(2, 0, size)); + + shifted.insertionSortN(2, 0, size); + Assertions.assertTrue(shifted.isSortedN(2, 0, size)); + } + + @Test + void quickSort() { + basic.quickSort(0, size); + Assertions.assertTrue(basic.isSorted(0, size)); + + paged.quickSort(0, size); + Assertions.assertTrue(paged.isSorted(0, size)); + + shifted.quickSort(0, size); + Assertions.assertTrue(shifted.isSorted(0, size)); + } + + @Test + void quickSortN() { + basic.quickSortN(2, 0, size); + Assertions.assertTrue(basic.isSortedN(2, 0, size)); + + paged.quickSortN(2, 0, size); + Assertions.assertTrue(paged.isSortedN(2, 0, size)); + + shifted.quickSortN(2, 0, size); + Assertions.assertTrue(shifted.isSortedN(2, 0, size)); + } + + @Test + void mergeSortN() throws IOException { + basic.mergeSortN(2, 0, size, Path.of("/tmp")); + Assertions.assertTrue(basic.isSortedN(2, 0, size)); + + paged.mergeSortN(2, 0, size, Path.of("/tmp")); + Assertions.assertTrue(paged.isSortedN(2, 0, size)); + + shifted.mergeSortN(2, 0, size, Path.of("/tmp")); + Assertions.assertTrue(shifted.isSortedN(2, 0, size)); + } + + @Test + void mergeSort() throws IOException { + basic.mergeSort(0, size, Path.of("/tmp")); + Assertions.assertTrue(basic.isSorted(0, size)); + + paged.mergeSort(0, size, Path.of("/tmp")); + Assertions.assertTrue(paged.isSorted(0, size)); + + shifted.mergeSort(0, size, Path.of("/tmp")); + Assertions.assertTrue(shifted.isSorted(0, size)); + } +} \ No newline at end of file diff --git a/marginalia_nu/src/test/java/nu/marginalia/util/array/algo/LongArrayTransformationsTest.java b/marginalia_nu/src/test/java/nu/marginalia/util/array/algo/LongArrayTransformationsTest.java new file mode 100644 index 00000000..47f33bb3 --- /dev/null +++ b/marginalia_nu/src/test/java/nu/marginalia/util/array/algo/LongArrayTransformationsTest.java @@ -0,0 +1,80 @@ +package nu.marginalia.util.array.algo; + +import nu.marginalia.util.array.LongArray; +import nu.marginalia.util.array.page.LongArrayPage; +import nu.marginalia.util.array.page.PagingLongArray; +import nu.marginalia.util.array.scheme.PowerOf2PartitioningScheme; +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.Test; + +import java.io.IOException; + +import static 
org.junit.jupiter.api.Assertions.assertEquals; + +class LongArrayTransformationsTest { + LongArray basic; + LongArray paged; + LongArray shifted; + + final int size = 1026; + + @BeforeEach + public void setUp() { + basic = LongArrayPage.onHeap(size); + paged = PagingLongArray.newOnHeap(new PowerOf2PartitioningScheme(32), size); + shifted = LongArrayPage.onHeap(size + 30).shifted(30); + + for (int i = 0; i < basic.size(); i++) { + basic.set(i, 3L*i); + paged.set(i, 3L*i); + shifted.set(i, 3L*i); + } + } + + @Test + void transformEach() { + transformTester(basic); + transformTester(paged); + transformTester(shifted); + } + + @Test + void transformEachIO() throws IOException { + transformTesterIO(basic); + transformTesterIO(paged); + transformTesterIO(shifted); + } + + @Test + void foldIO() throws IOException { + assertEquals(3*(5+6+7+8+9), basic.foldIO(0, 5, 10, Long::sum)); + assertEquals(3*(5+6+7+8+9), paged.foldIO(0, 5, 10, Long::sum)); + assertEquals(3*(5+6+7+8+9), shifted.foldIO(0, 5, 10, Long::sum)); + } + + private void transformTester(LongArray array) { + array.transformEach(5, 15, (i, o) -> (int) (i - o)); + for (int i = 0; i < 5; i++) { + assertEquals(3*i, array.get(i)); + } + for (int i = 5; i < 15; i++) { + assertEquals(-2*i, array.get(i)); + } + for (int i = 15; i < 20; i++) { + assertEquals(3*i, array.get(i)); + } + } + + private void transformTesterIO(LongArray array) throws IOException { + array.transformEachIO(5, 15, (i, o) -> (int) (i - o)); + for (int i = 0; i < 5; i++) { + assertEquals(3*i, array.get(i)); + } + for (int i = 5; i < 15; i++) { + assertEquals(-2*i, array.get(i)); + } + for (int i = 15; i < 20; i++) { + assertEquals(3*i, array.get(i)); + } + } +} \ No newline at end of file diff --git a/marginalia_nu/src/test/java/nu/marginalia/util/array/scheme/ArrayPartitioningSchemeTest.java b/marginalia_nu/src/test/java/nu/marginalia/util/array/scheme/ArrayPartitioningSchemeTest.java new file mode 100644 index 00000000..6c7cdd40 --- /dev/null +++ b/marginalia_nu/src/test/java/nu/marginalia/util/array/scheme/ArrayPartitioningSchemeTest.java @@ -0,0 +1,19 @@ +package nu.marginalia.util.array.scheme; + +import org.junit.jupiter.api.Assertions; +import org.junit.jupiter.api.Test; + +class ArrayPartitioningSchemeTest { + + @Test + public void testPo2() { + var p2 = new PowerOf2PartitioningScheme(64); + var seq = new SequentialPartitioningScheme(64); + + for (int i = 0; i < 512; i++) { + Assertions.assertEquals(p2.getPage(i), seq.getPage(i), "Unexpected buffer @ " + i); + Assertions.assertEquals(p2.getOffset(i), seq.getOffset(i), "Unexpected offset @ " + i); + Assertions.assertEquals(p2.isSamePage(i, i+1), seq.isSamePage(i, i+1), "Unexpected value @ " + i); + } + } +} \ No newline at end of file diff --git a/marginalia_nu/src/test/java/nu/marginalia/util/btree/BTreeWriterTest.java b/marginalia_nu/src/test/java/nu/marginalia/util/btree/BTreeWriterTest.java index c55b597d..611d3ddd 100644 --- a/marginalia_nu/src/test/java/nu/marginalia/util/btree/BTreeWriterTest.java +++ b/marginalia_nu/src/test/java/nu/marginalia/util/btree/BTreeWriterTest.java @@ -1,15 +1,13 @@ package nu.marginalia.util.btree; +import nu.marginalia.util.array.LongArray; import nu.marginalia.util.btree.model.BTreeContext; import nu.marginalia.util.btree.model.BTreeHeader; -import nu.marginalia.util.multimap.MultimapFileLong; import org.junit.jupiter.api.Test; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import java.io.IOException; -import java.io.RandomAccessFile; -import 
java.nio.channels.FileChannel; import java.nio.file.Files; import java.nio.file.Path; import java.util.HashSet; @@ -21,7 +19,7 @@ import static org.junit.jupiter.api.Assertions.assertTrue; class BTreeWriterTest { - final BTreeContext ctx = new BTreeContext(4, 2, 0xFFFF_FFFF_FFFF_FFFFL, 3); + final BTreeContext ctx = new BTreeContext(4, 2, 3); final BTreeWriter writer = new BTreeWriter(null, ctx); Logger logger = LoggerFactory.getLogger(getClass()); @@ -79,33 +77,32 @@ class BTreeWriterTest { var tempFile = Files.createTempFile(Path.of("/tmp"), "tst", "dat"); Set toPut = new HashSet<>(); - for (int i = 0; i < 500; i++) { + for (int i = 0; i < 64; i++) { while (!toPut.add((int)(Integer.MAX_VALUE * Math.random()))); } int[] data = toPut.stream().mapToInt(Integer::valueOf).sorted().toArray(); try { - RandomAccessFile raf = new RandomAccessFile(tempFile.toFile(), "rw"); - MultimapFileLong mmf = new MultimapFileLong(raf, FileChannel.MapMode.READ_WRITE, 10000, 1000); + LongArray longArray = LongArray.allocate(10000); { - var writer = new BTreeWriter(mmf, ctx); + var writer = new BTreeWriter(longArray, ctx); writer.write(0, toPut.size(), (slice) -> { for (int i = 0; i < data.length; i++) { - slice.put(2L*i, data[i]); - slice.put( 2L*i + 1, i); + slice.set(2L*i, data[i]); + slice.set( 2L*i + 1, i); } }); - mmf.force(); } { - var reader = new BTreeReader(mmf, ctx, 0); + var reader = new BTreeReader(longArray, ctx, 0); for (int i = 0; i < data.length; i++) { long offset = reader.findEntry(data[i]); assertTrue(offset >= 0, "Negative offset for " + i + " -> " + offset); - assertEquals(i, mmf.get(offset+1)); + offset += reader.getHeader().dataOffsetLongs(); + assertEquals(i, longArray.get(offset+1)); } } } catch (Exception e) { @@ -126,40 +123,26 @@ class BTreeWriterTest { } int[] data = toPut.stream().mapToInt(Integer::valueOf).sorted().toArray(); - - try { - RandomAccessFile raf = new RandomAccessFile(tempFile.toFile(), "rw"); - MultimapFileLong mmf = new MultimapFileLong(raf, FileChannel.MapMode.READ_WRITE, 10000, 1000); - - { - var writer = new BTreeWriter(mmf, ctx); - writer.write( 0, toPut.size(), (slice) -> { - for (int i = 0; i < data.length; i++) { - slice.put(2L*i, data[i]); - slice.put(2L*i + 1, i); - } - }); - mmf.force(); + LongArray array = LongArray.allocate(22000); + var writer = new BTreeWriter(array, ctx); + writer.write( 0, toPut.size(), (slice) -> { + for (int i = 0; i < data.length; i++) { + slice.set(2L*i, data[i]); + slice.set(2L*i + 1, i); } + }); + var reader = new BTreeReader(array, ctx, 0); + for (int i = 0; i < data.length; i++) { + long offset = reader.findEntry(data[i]); + assertTrue(offset >= 0, "Negative offset for " + i + " -> " + offset); + offset += reader.getHeader().dataOffsetLongs(); + assertEquals(array.get(offset+1), i); + } - { - var reader = new BTreeReader(mmf, ctx, 0); - for (int i = 0; i < data.length; i++) { - long offset = reader.findEntry(data[i]); - assertTrue(offset >= 0, "Negative offset for " + i + " -> " + offset); - assertEquals(i, mmf.get(offset+1)); - } - - for (int i = 0; i < 500; i++) { - long val = (long)(Long.MAX_VALUE * Math.random()); - while (toPut.contains((int)val)) val = (long)(Long.MAX_VALUE * Math.random()); - assertTrue(reader.findEntry( val) < 0); - } - } - } catch (Exception e) { - e.printStackTrace(); - } finally { - Files.delete(tempFile); + for (int i = 0; i < 500; i++) { + long val = (long)(Long.MAX_VALUE * Math.random()); + while (toPut.contains((int)val)) val = (long)(Long.MAX_VALUE * Math.random()); + 
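                // Editorial note, not part of this commit: the loop above rejection-samples a
                // value guaranteed to be absent from toPut, so the findEntry call below must
                // miss and report a negative offset.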
assertTrue(reader.findEntry( val) < 0); } } @@ -170,7 +153,7 @@ class BTreeWriterTest { var tempFile = Files.createTempFile(Path.of("/tmp"), "tst", "dat"); Set toPut = new HashSet<>(); - var ctx = new BTreeContext(5, 1, ~0, bs); + var ctx = new BTreeContext(5, 1, bs); for (int i = 0; i < 500; i++) { while (!toPut.add((long) (Long.MAX_VALUE * Math.random()))) ; @@ -178,148 +161,31 @@ class BTreeWriterTest { long[] data = toPut.stream().mapToLong(Long::valueOf).sorted().toArray(); - try (MultimapFileLong mmf = MultimapFileLong.forOutput(tempFile, 1000)) { - { - var writer = new BTreeWriter(mmf, ctx); - writer.write(0, toPut.size(), (slice) -> { - for (int i = 0; i < data.length; i++) { - slice.put(i, data[i]); - } - }); - mmf.force(); + LongArray array = LongArray.allocate(22000); + var writer = new BTreeWriter(array, ctx); + writer.write(0, toPut.size(), (slice) -> { + for (int i = 0; i < data.length; i++) { + slice.set(i, data[i]); } + }); - { - var reader = new BTreeReader(mmf, ctx, 0); + var reader = new BTreeReader(array, ctx, 0); - printTreeLayout(toPut.size(), reader.getHeader(), ctx); + printTreeLayout(toPut.size(), reader.getHeader(), ctx); - for (int i = 0; i < data.length; i++) { - long offset = reader.findEntry(data[i]); - assertTrue(offset >= 0, "Negative offset for " + i + " -> " + offset); - assertEquals(data[i], mmf.get(offset)); - } - - for (int i = 0; i < 500; i++) { - long val = (long) (Long.MAX_VALUE * Math.random()); - while (toPut.contains(val)) val = (long) (Long.MAX_VALUE * Math.random()); - assertTrue(reader.findEntry( val) < 0); - } - } - } catch (Exception e) { - e.printStackTrace(); - } finally { - Files.delete(tempFile); + for (int i = 0; i < data.length; i++) { + long offset = reader.findEntry(data[i]); + assertTrue(offset >= 0, "Negative offset for " + i + " -> " + offset); + offset += reader.getHeader().dataOffsetLongs(); + assertEquals(data[i], array.get(offset)); } - } - } - - @Test - public void testWriteEqualityMasked() throws IOException { - - for (int bs = 2; bs <= 4; bs++) { - var tempFile = Files.createTempFile(Path.of("/tmp"), "tst", "dat"); - Set toPut = new HashSet<>(); - - long mask = 0xFFFF_FFFF_0000_0000L; - var ctx = new BTreeContext(5, 1, mask, bs); for (int i = 0; i < 500; i++) { - while (!toPut.add((long) (Long.MAX_VALUE * Math.random()))) ; - } - - long[] data = toPut.stream().mapToLong(Long::valueOf).sorted().toArray(); - - try (MultimapFileLong mmf = MultimapFileLong.forOutput(tempFile, 1000)) { - { - var writer = new BTreeWriter(mmf, ctx); - writer.write(0, toPut.size(), (slice) -> { - for (int i = 0; i < data.length; i++) { - slice.put(i, data[i]); - } - }); - mmf.force(); - } - - { - var reader = new BTreeReader(mmf, ctx, 0); - - printTreeLayout(toPut.size(), reader.getHeader(), ctx); - - for (int i = 0; i < data.length; i++) { - long offset = reader.findEntry(data[i] & mask); - assertTrue(offset >= 0, "Negative offset for " + i + " -> " + offset); - assertEquals(data[i], mmf.get(offset)); - } - - for (int i = 0; i < 500; i++) { - long val = (long) (Long.MAX_VALUE * Math.random()); - while (toPut.contains(val)) val = (long) (Long.MAX_VALUE * Math.random()); - assertTrue(reader.findEntry(val & mask) < 0); - } - } - } catch (Exception e) { - e.printStackTrace(); - } finally { - Files.delete(tempFile); + long val = (long) (Long.MAX_VALUE * Math.random()); + while (toPut.contains(val)) val = (long) (Long.MAX_VALUE * Math.random()); + assertTrue(reader.findEntry( val) < 0); } } } - @Test - public void testWriteTwoEqualityMasked() throws 
IOException { - - for (int bs = 2; bs <= 4; bs++) { - var tempFile = Files.createTempFile(Path.of("/tmp"), "tst", "dat"); - Set toPut = new HashSet<>(); - - long mask = 0xFFFF_FFFF_0000_0000L; - var ctx = new BTreeContext(5, 2, mask, bs); - - for (int i = 0; i < 500; i++) { - while (!toPut.add((long) (Long.MAX_VALUE * Math.random()))) ; - } - - long[] data = toPut.stream().mapToLong(Long::valueOf).sorted().toArray(); - - try (MultimapFileLong mmf = MultimapFileLong.forOutput(tempFile, 1000)) { - { - var writer = new BTreeWriter(mmf, ctx); - writer.write(0, toPut.size(), (slice) -> { - for (int i = 0; i < data.length; i++) { - slice.put(i*2L, data[i]); - slice.put(i*2L+1, i); - } - }); - mmf.force(); - } - - { - var reader = new BTreeReader(mmf, ctx, 0); - - printTreeLayout(toPut.size(), reader.getHeader(), ctx); - - for (int i = 0; i < data.length; i++) { - long offset = reader.findEntry(data[i] & mask); - assertTrue(offset >= 0, "Negative offset for " + i + " -> " + offset); - assertEquals(data[i], mmf.get(offset)); - assertEquals(i, mmf.get(offset+1)); - } - - for (int i = 0; i < 500; i++) { - long val = (long) (Long.MAX_VALUE * Math.random()); - while (toPut.contains(val)) val = (long) (Long.MAX_VALUE * Math.random()); - assertTrue(reader.findEntry(val & mask) < 0); - } - } - } catch (Exception e) { - e.printStackTrace(); - } finally { - Files.delete(tempFile); - } - } - } - - - } \ No newline at end of file diff --git a/marginalia_nu/src/test/java/nu/marginalia/util/graphics/dithering/FloydSteinbergDitherTest.java b/marginalia_nu/src/test/java/nu/marginalia/util/graphics/dithering/FloydSteinbergDitherTest.java index b9510517..632603bd 100644 --- a/marginalia_nu/src/test/java/nu/marginalia/util/graphics/dithering/FloydSteinbergDitherTest.java +++ b/marginalia_nu/src/test/java/nu/marginalia/util/graphics/dithering/FloydSteinbergDitherTest.java @@ -10,10 +10,10 @@ class FloydSteinbergDitherTest { @Test public void test() throws IOException { - convert("/home/vlofgren/Work/dither/volvo.jpg", "/home/vlofgren/Work/dither/volvo-raster.png"); - convert("/home/vlofgren/Work/dither/dog.jpg", "/home/vlofgren/Work/dither/dog-raster.png"); - convert("/home/vlofgren/Work/dither/robocop.jpg", "/home/vlofgren/Work/dither/robocop-raster.png"); - convert("/home/vlofgren/Work/dither/socrates.jpeg", "/home/vlofgren/Work/dither/socrates-raster.png"); +// convert("/home/vlofgren/Work/dither/volvo.jpg", "/home/vlofgren/Work/dither/volvo-raster.png"); +// convert("/home/vlofgren/Work/dither/dog.jpg", "/home/vlofgren/Work/dither/dog-raster.png"); +// convert("/home/vlofgren/Work/dither/robocop.jpg", "/home/vlofgren/Work/dither/robocop-raster.png"); +// convert("/home/vlofgren/Work/dither/socrates.jpeg", "/home/vlofgren/Work/dither/socrates-raster.png"); // convert("C:\\Users\\vlofg\\Documents\\volvo.jpg", diff --git a/marginalia_nu/src/test/java/nu/marginalia/util/hash/LongPairHashMapTest.java b/marginalia_nu/src/test/java/nu/marginalia/util/hash/LongPairHashMapTest.java deleted file mode 100644 index d2bec272..00000000 --- a/marginalia_nu/src/test/java/nu/marginalia/util/hash/LongPairHashMapTest.java +++ /dev/null @@ -1,58 +0,0 @@ -package nu.marginalia.util.hash; - -import nu.marginalia.util.multimap.MultimapFileLong; -import org.junit.jupiter.api.Assertions; -import org.junit.jupiter.api.Test; - -import java.io.IOException; -import java.io.RandomAccessFile; -import java.nio.channels.FileChannel; -import java.nio.file.Files; -import java.nio.file.Path; -import java.util.HashSet; -import java.util.Set; - 
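/*
 * Aside on the BTreeWriterTest rewrite above: the recurring change in this
 * commit is swapping MultimapFileLong/RandomAccessFile for LongArray, with
 * reader offsets now relative to the data section (hence the added
 * `offset += reader.getHeader().dataOffsetLongs()`). Below is a self-contained
 * sketch of the write-then-lookup round trip those tests exercise, using a
 * plain long[] and binary search as an illustrative stand-in for the project's
 * BTreeWriter/BTreeReader -- class and method names here are invented:
 */
class KeyValueArraySketch {
    // Layout the tests write: slot 2*i holds the key, slot 2*i + 1 the payload.
    static long findPayload(long[] array, long key) {
        int lo = 0, hi = array.length / 2 - 1;
        while (lo <= hi) {
            int mid = (lo + hi) >>> 1;
            long k = array[2 * mid];
            if (k == key) return array[2 * mid + 1];
            if (k < key) lo = mid + 1; else hi = mid - 1;
        }
        return -1; // negative on a miss, like the findEntry() assertions
    }

    public static void main(String[] args) {
        long[] keys = {3, 7, 11, 19}; // must be sorted, as in the tests
        long[] array = new long[2 * keys.length];
        for (int i = 0; i < keys.length; i++) {
            array[2 * i] = keys[i];   // cf. slice.set(2L*i, data[i])
            array[2 * i + 1] = i;     // cf. slice.set(2L*i + 1, i)
        }
        System.out.println(findPayload(array, 11)); // 2
        System.out.println(findPayload(array, 5));  // -1 (absent)
    }
}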
-class LongPairHashMapTest { - - @Test - public void test() throws IOException { - - var tempFile = Files.createTempFile(Path.of("/tmp"), "tst", "dat"); - Set toPut = new HashSet<>(); - - for (int i = 0; i < 500; i++) { - while (!toPut.add((int)(Integer.MAX_VALUE * Math.random()))); - } - - try { - RandomAccessFile raf = new RandomAccessFile(tempFile.toFile(), "rw"); - MultimapFileLong mmf = new MultimapFileLong(raf, FileChannel.MapMode.READ_WRITE, 10000, 1000); - var lphm = LongPairHashMap.createNew(mmf, 1024); - toPut.forEach(i -> { - lphm.put(new LongPairHashMap.CellData(i, i)); - }); - mmf.force(); - lphm.close(); - - RandomAccessFile raf2 = new RandomAccessFile(tempFile.toFile(), "rw"); - MultimapFileLong mmf2 = new MultimapFileLong(raf2, FileChannel.MapMode.READ_WRITE, 10000, 1000); - var lphm2 = LongPairHashMap.loadExisting(mmf2); - toPut.forEach(i -> { - Assertions.assertTrue(lphm2.get(i).isSet()); - Assertions.assertEquals(i, (int) lphm2.get(i).getKey()); - Assertions.assertEquals(i, (int) lphm2.get(i).getOffset()); - }); - - for (int i = 0; i < 10_000_000; i++) { - if (!toPut.contains(i)) { - Assertions.assertFalse(lphm2.get(i).isSet()); - } - } - } catch (Exception e) { - e.printStackTrace(); - } finally { - Files.delete(tempFile); - } - } - -} \ No newline at end of file diff --git a/marginalia_nu/src/test/java/nu/marginalia/util/test/TestUtil.java b/marginalia_nu/src/test/java/nu/marginalia/util/test/TestUtil.java index b6e9795a..f3a40e4c 100644 --- a/marginalia_nu/src/test/java/nu/marginalia/util/test/TestUtil.java +++ b/marginalia_nu/src/test/java/nu/marginalia/util/test/TestUtil.java @@ -1,6 +1,9 @@ package nu.marginalia.util.test; +import lombok.SneakyThrows; + import java.io.File; +import java.io.IOException; import java.nio.file.Files; import java.nio.file.Path; import java.util.Arrays; @@ -9,6 +12,7 @@ public class TestUtil { private static boolean isTempDir(Path dir) { return dir.startsWith("/tmp") || dir.toString().contains("tmp"); } + @SneakyThrows public static void clearTempDir(Path dir) { if (!isTempDir(dir)) { throw new IllegalArgumentException("Refusing to recursively delete directory with that name"); @@ -19,11 +23,24 @@ public class TestUtil { if (files != null) { Arrays.stream(files).map(File::toPath).forEach(TestUtil::clearTempDir); } - System.out.println("Deleting " + f); + System.out.println("Deleting " + f + " (" + fileSize(f.toPath()) + ")"); f.delete(); } } System.out.println("Deleting " + dir); dir.toFile().delete(); } + + private static String fileSize(Path path) throws IOException { + long sizeBytes = Files.size(path); + + if (sizeBytes > 1024*1024*1024) return round(sizeBytes / 1073741824.) + "Gb"; + if (sizeBytes > 1024*1024) return round(sizeBytes / 1048576.) + "Mb"; + if (sizeBytes > 1024) return round(sizeBytes / 1024.) 
+ "Kb"; + return sizeBytes + "b"; + } + + private static String round(double d) { + return String.format("%.2f", d); + } } diff --git a/marginalia_nu/src/test/java/nu/marginalia/wmsa/configuration/server/ServiceTest.java b/marginalia_nu/src/test/java/nu/marginalia/wmsa/configuration/server/ServiceTest.java deleted file mode 100644 index 124c7826..00000000 --- a/marginalia_nu/src/test/java/nu/marginalia/wmsa/configuration/server/ServiceTest.java +++ /dev/null @@ -1,79 +0,0 @@ -package nu.marginalia.wmsa.configuration.server; - -import com.zaxxer.hikari.HikariDataSource; -import lombok.SneakyThrows; -import nu.marginalia.util.TestUtil; -import nu.marginalia.wmsa.client.exception.RemoteException; -import nu.marginalia.wmsa.edge.assistant.EdgeAssistantService; -import nu.marginalia.wmsa.edge.assistant.client.AssistantClient; -import nu.marginalia.wmsa.edge.assistant.dict.DictionaryService; -import nu.marginalia.wmsa.edge.assistant.dict.SpellChecker; -import nu.marginalia.wmsa.edge.assistant.eval.MathParser; -import nu.marginalia.wmsa.edge.assistant.eval.Units; -import nu.marginalia.wmsa.edge.assistant.screenshot.ScreenshotService; -import org.junit.jupiter.api.Assertions; -import org.junit.jupiter.api.BeforeAll; -import org.junit.jupiter.api.Test; -import spark.Spark; - -import static nu.marginalia.util.TestUtil.getConnection; - -class ServiceTest { - static EdgeAssistantService service; - static AssistantClient client; - - private static HikariDataSource dataSource; - - static final int testPort = TestUtil.getPort(); - - @SneakyThrows - public static HikariDataSource provideConnection() { - return getConnection(); - } - - - @BeforeAll - public static void setUpClass() { - Spark.port(testPort); - System.setProperty("service-name", "test"); - - dataSource = provideConnection(); - dataSource.setKeepaliveTime(100); - dataSource.setIdleTimeout(100); - - client = new AssistantClient(); - client.setServiceRoute("127.0.0.1", testPort); - - service = new EdgeAssistantService("127.0.0.1", - testPort, - new Initialization(), null, - new DictionaryService(dataSource, new SpellChecker()), - new MathParser(), - new Units(new MathParser()), - new ScreenshotService(null, dataSource), null); - - Spark.awaitInitialization(); - } - - @Test - public void testDenyXPublic() { - try { - client.ping(Context.internal().treatAsPublic()).blockingSubscribe(); - Assertions.fail("Expected exception"); - } - catch (RemoteException ex) { - // - } - } - @Test - public void testAllowInternalNoXPublic() { - client.ping(Context.internal()).blockingSubscribe(); - } - - @Test - public void testAllowOnPublic() { - Assertions.assertEquals("EdgeAssistantService", client.who(Context.internal()).blockingFirst()); - Assertions.assertEquals("EdgeAssistantService", client.who(Context.internal().treatAsPublic()).blockingFirst()); - } - -} \ No newline at end of file diff --git a/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/assistant/AssistantTest.java b/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/assistant/AssistantTest.java deleted file mode 100644 index 88d1232a..00000000 --- a/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/assistant/AssistantTest.java +++ /dev/null @@ -1,145 +0,0 @@ -package nu.marginalia.wmsa.edge.assistant; - -import com.zaxxer.hikari.HikariDataSource; -import lombok.SneakyThrows; -import nu.marginalia.util.TestUtil; -import nu.marginalia.wmsa.client.exception.RemoteException; -import nu.marginalia.wmsa.configuration.server.Context; -import nu.marginalia.wmsa.configuration.server.Initialization; 
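/*
 * Note on the fileSize helper added to TestUtil above: Files.size() returns
 * bytes, so the conventional suffixes would be KB/MB/GB rather than the
 * bit-suggesting "Kb"/"Mb"/"Gb" used in the diff. A standalone sketch with
 * the same thresholds and byte-oriented labels (hypothetical method name):
 */
import java.util.Locale;

class FileSizeSketch {
    static String humanReadable(long sizeBytes) {
        if (sizeBytes > 1024L * 1024 * 1024) return round(sizeBytes / 1073741824.) + "GB";
        if (sizeBytes > 1024L * 1024)        return round(sizeBytes / 1048576.) + "MB";
        if (sizeBytes > 1024L)               return round(sizeBytes / 1024.) + "KB";
        return sizeBytes + "B";
    }

    static String round(double d) {
        return String.format(Locale.ROOT, "%.2f", d); // locale-stable decimal point
    }

    public static void main(String[] args) {
        System.out.println(humanReadable(512));       // 512B
        System.out.println(humanReadable(10_000));    // 9.77KB
        System.out.println(humanReadable(5_000_000)); // 4.77MB
    }
}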
-import nu.marginalia.wmsa.edge.assistant.client.AssistantClient; -import nu.marginalia.wmsa.edge.assistant.dict.DictionaryService; -import nu.marginalia.wmsa.edge.assistant.dict.SpellChecker; -import nu.marginalia.wmsa.edge.assistant.eval.MathParser; -import nu.marginalia.wmsa.edge.assistant.eval.Units; -import nu.marginalia.wmsa.edge.assistant.screenshot.ScreenshotService; -import nu.marginalia.wmsa.edge.search.svc.EdgeSearchUnitConversionService; -import org.junit.jupiter.api.*; -import org.junit.jupiter.api.parallel.Execution; -import org.junit.jupiter.api.parallel.ExecutionMode; -import org.junit.jupiter.api.parallel.ResourceAccessMode; -import org.junit.jupiter.api.parallel.ResourceLock; -import spark.Spark; - -import java.util.concurrent.ExecutionException; - -import static nu.marginalia.util.TestUtil.getConnection; -import static org.junit.jupiter.api.Assertions.assertEquals; -import static org.junit.jupiter.api.Assertions.assertTrue; - -@ResourceLock(value = "mariadb", mode = ResourceAccessMode.READ_WRITE) -@Execution(ExecutionMode.SAME_THREAD) -@Tag("db") -class AssistantTest { - static EdgeAssistantService service; - static AssistantClient client; - - private static HikariDataSource dataSource; - - static final int testPort = TestUtil.getPort(); - - @SneakyThrows - public static HikariDataSource provideConnection() { - return getConnection(); - } - - - @BeforeAll - public static void setUpClass() { - Spark.port(testPort); - System.setProperty("service-name", "test"); - - dataSource = provideConnection(); - dataSource.setKeepaliveTime(100); - dataSource.setIdleTimeout(100); - - client = new AssistantClient(); - client.setServiceRoute("127.0.0.1", testPort); - - service = new EdgeAssistantService("127.0.0.1", - testPort, - new Initialization(), null, - new DictionaryService(dataSource, new SpellChecker()), - new MathParser(), - new Units(new MathParser()), - new ScreenshotService(null, dataSource), null); - - Spark.awaitInitialization(); - } - - @BeforeEach - public void clearDb() { - } - - @SneakyThrows - @AfterAll - public static void tearDownAll() { - dataSource.close(); - Spark.awaitStop(); - } - - @Test - public void testSpellCheck() { - var result = client.spellCheck(Context.internal(), "plato").blockingFirst(); - System.out.println(result); - } - @Test - public void testDictionary() { - var result = client.dictionaryLookup(Context.internal(), "adiabatic").blockingFirst(); - System.out.println(result); - assertTrue(result.entries.size() > 1); - } - - @Test - public void testDictionaryNoQuery() { - var result = client.dictionaryLookup(Context.internal(), "vlofgren").blockingFirst(); - System.out.println(result); - assertTrue(result.entries.isEmpty()); - } - - @Test - public void testEncyclopediaNoQuery() { - var result = client.dictionaryLookup(Context.internal(), "vlofgren").blockingFirst(); - System.out.println(result); - assertTrue(result.entries.isEmpty()); - } - - @Test - public void testConvertUnitsWithParser() { - var conversion = new EdgeSearchUnitConversionService(client); - assertEquals("0.3 m", conversion.tryConversion(Context.internal(), "30 cm in m").get()); - assertEquals("500 m", conversion.tryConversion(Context.internal(), "0.5 km in m").get()); - assertEquals("500 m", conversion.tryConversion(Context.internal(), "0.1+0.4 km in m").get()); - assertTrue(conversion.tryConversion(Context.internal(), "0.5 km in F").isEmpty()); - assertTrue(conversion.tryConversion(Context.internal(), "plato").isEmpty()); - } - - @Test - public void testConvertUnits() { - 
assertEquals("5 m", client.unitConversion(Context.internal(), "500", "cm", "meters").blockingFirst()); - } - - @Test - public void testEvalmath() { - assertEquals("300", client.evalMath(Context.internal(), "3*10^2").blockingFirst()); - } - - @Test - public void testEvalWithParser() throws ExecutionException, InterruptedException { - var conversion = new EdgeSearchUnitConversionService(client); - assertEquals("305", conversion.tryEval(Context.internal(), "300+5").get()); - assertEquals("1.772", conversion.tryEval(Context.internal(), "sqrt(pi)").get()); - - } - - - @Test - public void testConvertUnitsWeirdError() { - try { - client.unitConversion(Context.internal(), "500", "kg", "meters").blockingFirst(); - Assertions.fail("Wanted exception"); - } - catch (RemoteException ex) { - - } - } -} \ No newline at end of file diff --git a/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/assistant/dict/WikiCleanerTest.java b/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/assistant/dict/WikiCleanerTest.java index ccea45ab..2535f206 100644 --- a/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/assistant/dict/WikiCleanerTest.java +++ b/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/assistant/dict/WikiCleanerTest.java @@ -6,21 +6,19 @@ import org.openzim.ZIMTypes.ZIMFile; import org.openzim.ZIMTypes.ZIMReader; import java.io.IOException; -import java.nio.file.Files; -import java.nio.file.Path; class WikiCleanerTest { @Test void cleanWikiJunk() throws IOException { - String str = new WikiCleaner().cleanWikiJunk("https://en.wikipedia.org/wiki/Scamander", new String(Files.readAllBytes(Path.of("/home/vlofgren/Work/wiki-cleaner/Scamander.wiki.html")))); - String str2 = new WikiCleaner().cleanWikiJunk("https://en.wikipedia.org/wiki/Plato", new String(Files.readAllBytes(Path.of("/home/vlofgren/Work/wiki-cleaner/Plato.wiki.html")))); - String str3 = new WikiCleaner().cleanWikiJunk("https://en.wikipedia.org/wiki/C++", new String(Files.readAllBytes(Path.of("/home/vlofgren/Work/wiki-cleaner/Cpp.wiki.html")))); - String str4 = new WikiCleaner().cleanWikiJunk("https://en.wikipedia.org/wiki/Memex", new String(Files.readAllBytes(Path.of("/home/vlofgren/Work/wiki-cleaner/Memex.wiki.html")))); - Files.writeString(Path.of("/home/vlofgren/Work/wiki-cleaner/Scamander.out.html"), str); - Files.writeString(Path.of("/home/vlofgren/Work/wiki-cleaner/Plato.out.html"), str2); - Files.writeString(Path.of("/home/vlofgren/Work/wiki-cleaner/Cpp.out.html"), str3); - Files.writeString(Path.of("/home/vlofgren/Work/wiki-cleaner/Memex.out.html"), str4); +// String str = new WikiCleaner().cleanWikiJunk("https://en.wikipedia.org/wiki/Scamander", new String(Files.readAllBytes(Path.of("/home/vlofgren/Work/wiki-cleaner/Scamander.wiki.html")))); +// String str2 = new WikiCleaner().cleanWikiJunk("https://en.wikipedia.org/wiki/Plato", new String(Files.readAllBytes(Path.of("/home/vlofgren/Work/wiki-cleaner/Plato.wiki.html")))); +// String str3 = new WikiCleaner().cleanWikiJunk("https://en.wikipedia.org/wiki/C++", new String(Files.readAllBytes(Path.of("/home/vlofgren/Work/wiki-cleaner/Cpp.wiki.html")))); +// String str4 = new WikiCleaner().cleanWikiJunk("https://en.wikipedia.org/wiki/Memex", new String(Files.readAllBytes(Path.of("/home/vlofgren/Work/wiki-cleaner/Memex.wiki.html")))); +// Files.writeString(Path.of("/home/vlofgren/Work/wiki-cleaner/Scamander.out.html"), str); +// Files.writeString(Path.of("/home/vlofgren/Work/wiki-cleaner/Plato.out.html"), str2); +// 
Files.writeString(Path.of("/home/vlofgren/Work/wiki-cleaner/Cpp.out.html"), str3); +// Files.writeString(Path.of("/home/vlofgren/Work/wiki-cleaner/Memex.out.html"), str4); } @Test @Disabled diff --git a/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/assistant/suggest/SuggestionsTest.java b/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/assistant/suggest/SuggestionsTest.java index 2acf9165..7184c8b9 100644 --- a/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/assistant/suggest/SuggestionsTest.java +++ b/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/assistant/suggest/SuggestionsTest.java @@ -5,6 +5,7 @@ import nu.marginalia.util.language.conf.LanguageModels; import nu.marginalia.wmsa.edge.assistant.dict.SpellChecker; import nu.marginalia.wmsa.edge.assistant.dict.TermFrequencyDict; import org.junit.jupiter.api.BeforeAll; +import org.junit.jupiter.api.Disabled; import org.junit.jupiter.api.Test; import java.nio.file.Path; @@ -21,6 +22,7 @@ class SuggestionsTest { } @Test + @Disabled void getSuggestions() { System.out.println(tryGetSuggestions("neop")); System.out.println(tryGetSuggestions("neopla")); diff --git a/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/converting/loader/SqlLoadDomainLinksTest.java b/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/converting/loader/SqlLoadDomainLinksTest.java index bc628b13..249bb160 100644 --- a/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/converting/loader/SqlLoadDomainLinksTest.java +++ b/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/converting/loader/SqlLoadDomainLinksTest.java @@ -6,11 +6,12 @@ import nu.marginalia.wmsa.edge.converting.interpreter.instruction.DomainLink; import nu.marginalia.wmsa.edge.model.EdgeDomain; import org.junit.jupiter.api.AfterEach; import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.Tag; import org.junit.jupiter.api.Test; import org.testcontainers.containers.MariaDBContainer; import org.testcontainers.junit.jupiter.Container; import org.testcontainers.junit.jupiter.Testcontainers; - +@Tag("slow") @Testcontainers class SqlLoadDomainLinksTest { @Container diff --git a/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/converting/loader/SqlLoadDomainsTest.java b/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/converting/loader/SqlLoadDomainsTest.java index 25dd18b4..c57c5706 100644 --- a/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/converting/loader/SqlLoadDomainsTest.java +++ b/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/converting/loader/SqlLoadDomainsTest.java @@ -2,13 +2,15 @@ package nu.marginalia.wmsa.edge.converting.loader; import nu.marginalia.util.TestUtil; import nu.marginalia.wmsa.edge.model.EdgeDomain; +import org.junit.jupiter.api.Tag; import org.junit.jupiter.api.Test; import org.testcontainers.containers.MariaDBContainer; import org.testcontainers.junit.jupiter.Container; import org.testcontainers.junit.jupiter.Testcontainers; -import static org.junit.jupiter.api.Assertions.*; +import static org.junit.jupiter.api.Assertions.assertTrue; +@Tag("slow") @Testcontainers class SqlLoadDomainsTest { @Container diff --git a/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/converting/loader/SqlLoadProcessedDocumentTest.java b/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/converting/loader/SqlLoadProcessedDocumentTest.java index 0dde33c9..54e4eccb 100644 --- a/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/converting/loader/SqlLoadProcessedDocumentTest.java +++ 
b/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/converting/loader/SqlLoadProcessedDocumentTest.java @@ -4,7 +4,7 @@ import com.zaxxer.hikari.HikariDataSource; import nu.marginalia.util.TestUtil; import nu.marginalia.wmsa.edge.converting.interpreter.instruction.LoadProcessedDocument; import nu.marginalia.wmsa.edge.converting.processor.logic.HtmlFeature; -import nu.marginalia.wmsa.edge.data.dao.EdgeDataStoreDaoImpl; +import nu.marginalia.wmsa.edge.dbcommon.EdgeDataStoreDaoImpl; import nu.marginalia.wmsa.edge.model.EdgeDomain; import nu.marginalia.wmsa.edge.model.EdgeUrl; import nu.marginalia.wmsa.edge.model.crawl.EdgeHtmlStandard; @@ -12,6 +12,7 @@ import nu.marginalia.wmsa.edge.model.crawl.EdgeUrlState; import nu.marginalia.wmsa.edge.model.id.EdgeIdArray; import org.junit.jupiter.api.AfterEach; import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.Tag; import org.junit.jupiter.api.Test; import org.testcontainers.containers.MariaDBContainer; import org.testcontainers.junit.jupiter.Container; @@ -24,6 +25,7 @@ import java.util.Set; import static org.junit.jupiter.api.Assertions.assertEquals; import static org.junit.jupiter.api.Assertions.assertTrue; +@Tag("slow") @Testcontainers class SqlLoadProcessedDocumentTest { @Container diff --git a/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/converting/loader/SqlLoadProcessedDomainTest.java b/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/converting/loader/SqlLoadProcessedDomainTest.java index eb66da92..000b0923 100644 --- a/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/converting/loader/SqlLoadProcessedDomainTest.java +++ b/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/converting/loader/SqlLoadProcessedDomainTest.java @@ -7,11 +7,13 @@ import nu.marginalia.wmsa.edge.model.EdgeDomain; import nu.marginalia.wmsa.edge.model.crawl.EdgeDomainIndexingState; import org.junit.jupiter.api.AfterEach; import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.Tag; import org.junit.jupiter.api.Test; import org.testcontainers.containers.MariaDBContainer; import org.testcontainers.junit.jupiter.Container; import org.testcontainers.junit.jupiter.Testcontainers; +@Tag("slow") @Testcontainers class SqlLoadProcessedDomainTest { @Container diff --git a/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/converting/loader/SqlLoadUrlsTest.java b/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/converting/loader/SqlLoadUrlsTest.java index 5afac733..84d8d586 100644 --- a/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/converting/loader/SqlLoadUrlsTest.java +++ b/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/converting/loader/SqlLoadUrlsTest.java @@ -6,6 +6,7 @@ import nu.marginalia.wmsa.edge.model.EdgeDomain; import nu.marginalia.wmsa.edge.model.EdgeUrl; import org.junit.jupiter.api.AfterEach; import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.Tag; import org.junit.jupiter.api.Test; import org.testcontainers.containers.MariaDBContainer; import org.testcontainers.junit.jupiter.Container; @@ -13,6 +14,7 @@ import org.testcontainers.junit.jupiter.Testcontainers; import java.net.URISyntaxException; +@Tag("slow") @Testcontainers class SqlLoadUrlsTest { @Container diff --git a/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/converting/processor/logic/PubDateSnifferTest.java b/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/converting/processor/logic/PubDateSnifferTest.java index f0c5f5fc..dd99d27e 100644 --- 
a/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/converting/processor/logic/PubDateSnifferTest.java +++ b/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/converting/processor/logic/PubDateSnifferTest.java @@ -1,5 +1,6 @@ package nu.marginalia.wmsa.edge.converting.processor.logic; +import nu.marginalia.wmsa.configuration.WmsaHome; import nu.marginalia.wmsa.edge.converting.processor.logic.pubdate.PubDateParser; import nu.marginalia.wmsa.edge.converting.processor.logic.pubdate.PubDateSniffer; import nu.marginalia.wmsa.edge.converting.processor.logic.pubdate.heuristic.PubDateHeuristicDOMParsingPass2; @@ -11,7 +12,6 @@ import org.junit.jupiter.api.Test; import java.io.IOException; import java.net.URISyntaxException; import java.nio.file.Files; -import java.nio.file.Path; import static org.junit.jupiter.api.Assertions.*; @@ -23,7 +23,7 @@ class PubDateSnifferTest { public void testGetYearFromText() { var ret = PubDateParser.dateFromHighestYearLookingSubstring("© 2005-2010 Bob Dobbs"); assertTrue(ret.isPresent()); - assertEquals(2010, ret.get().year()); + assertEquals(2007, ret.get().year()); ret = PubDateParser.dateFromHighestYearLookingSubstring("© 99 Bob Dobbs"); assertFalse(ret.isPresent()); @@ -117,14 +117,14 @@ class PubDateSnifferTest { public void testProblemCases() throws IOException, URISyntaxException { var ret = dateSniffer.getPubDate("", new EdgeUrl("https://www.example.com/"), - Jsoup.parse(Files.readString(Path.of("/home/vlofgren/Code/tmp-data/The Switch to Linux Begins .html"))), EdgeHtmlStandard.HTML5, true); + Jsoup.parse(Files.readString(WmsaHome.getHomePath().resolve("test-data/The Switch to Linux Begins .html"))), EdgeHtmlStandard.HTML5, true); assertFalse(ret.isEmpty()); assertEquals(2006, ret.year()); ret = dateSniffer.getPubDate("", new EdgeUrl("https://www.example.com/"), - Jsoup.parse(Files.readString(Path.of("/home/vlofgren/Code/tmp-data/Black Hat USA 2010 Understanding and Deploying DNSSEC by Paul Wouters and Patrick Nauber.html"))), EdgeHtmlStandard.XHTML, true); + Jsoup.parse(Files.readString(WmsaHome.getHomePath().resolve("test-data/Black Hat USA 2010 Understanding and Deploying DNSSEC by Paul Wouters and Patrick Nauber.html"))), EdgeHtmlStandard.XHTML, true); assertFalse(ret.isEmpty()); assertEquals(2010, ret.year()); diff --git a/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/converting/processor/logic/SummaryExtractorTest.java b/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/converting/processor/logic/SummaryExtractorTest.java index 47929f43..64942b5f 100644 --- a/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/converting/processor/logic/SummaryExtractorTest.java +++ b/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/converting/processor/logic/SummaryExtractorTest.java @@ -1,5 +1,6 @@ package nu.marginalia.wmsa.edge.converting.processor.logic; +import nu.marginalia.wmsa.configuration.WmsaHome; import org.jsoup.Jsoup; import org.junit.jupiter.api.Assertions; import org.junit.jupiter.api.BeforeEach; @@ -42,7 +43,7 @@ class SummaryExtractorTest { } @Test public void testSummaryFilter3() throws IOException { - var data = Path.of("/home/vlofgren/Code/tmp-data/url-327999153"); + var data = WmsaHome.getHomePath().resolve("test-data/url-327999153"); String html = Files.readString(data); var doc = Jsoup.parse(html); var filter = new SummaryExtractionFilter(); @@ -50,9 +51,10 @@ class SummaryExtractorTest { filter.getSummary(255); } + @Test public void testSummaryFilter2() throws IOException { - var data = Path.of("/home/vlofgren/Code/tmp-data/"); 
+ var data = WmsaHome.getHomePath().resolve("test-data/"); System.out.println("Running"); @@ -144,7 +146,7 @@ class SummaryExtractorTest { String index = readClassPathFile("html/work-set/index"); String[] files = index.split("\n"); - Map result = new HashMap(); + Map result = new HashMap<>(); for (String file : files) { Path p = Path.of("html/work-set/").resolve(file); diff --git a/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/converting/processor/logic/pubdate/PubDateTest.java b/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/converting/processor/logic/pubdate/PubDateTest.java new file mode 100644 index 00000000..d55c3bfc --- /dev/null +++ b/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/converting/processor/logic/pubdate/PubDateTest.java @@ -0,0 +1,18 @@ +package nu.marginalia.wmsa.edge.converting.processor.logic.pubdate; + +import org.junit.jupiter.api.Test; + +import static org.junit.jupiter.api.Assertions.assertEquals; + +class PubDateTest { + + @Test + void yearByte() { + for (int year = PubDate.MIN_YEAR; year < 2022; year++) { + var pdInstance = new PubDate(null, year); + assertEquals(year, PubDate.fromYearByte(pdInstance.yearByte())); + } + } + + +} \ No newline at end of file diff --git a/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/crawling/HttpFetcherTest.java b/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/crawling/HttpFetcherTest.java index a59726d6..653294f8 100644 --- a/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/crawling/HttpFetcherTest.java +++ b/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/crawling/HttpFetcherTest.java @@ -48,13 +48,6 @@ class HttpFetcherTest { System.out.println(str); } - @Test - void resolveRedirectRitEdu() throws URISyntaxException { - var fetcher = new HttpRedirectResolver("nu.marginalia.edge-crawler"); - var str = fetcher.probe(new EdgeUrl("http://www.rit.edu/cla/philosophy/Suits.html")).blockingFirst(); - System.out.println(str); - } - @Test void resolveRedirect2() throws URISyntaxException { var fetcher = new HttpRedirectResolver("nu.marginalia.edge-crawler"); diff --git a/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/crawling/SentenceExtractorTest.java b/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/crawling/SentenceExtractorTest.java index b8da0723..08dcef4c 100644 --- a/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/crawling/SentenceExtractorTest.java +++ b/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/crawling/SentenceExtractorTest.java @@ -11,11 +11,14 @@ import nu.marginalia.util.language.processing.model.KeywordMetadata; import nu.marginalia.util.language.processing.model.WordRep; import nu.marginalia.util.language.processing.model.WordSpan; import nu.marginalia.util.language.processing.model.tag.WordSeparator; +import nu.marginalia.wmsa.configuration.WmsaHome; import nu.marginalia.wmsa.edge.assistant.dict.TermFrequencyDict; import nu.marginalia.wmsa.edge.integration.wikipedia.WikipediaReader; import nu.marginalia.wmsa.edge.model.EdgeDomain; import org.jsoup.Jsoup; import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.Disabled; +import org.junit.jupiter.api.Tag; import org.junit.jupiter.api.Test; import java.io.IOException; @@ -24,6 +27,7 @@ import java.nio.file.Path; import java.util.*; import java.util.regex.Pattern; +@Tag("slow") class SentenceExtractorTest { SentenceExtractor newSe; SentenceExtractor legacySe; @@ -41,7 +45,7 @@ class SentenceExtractorTest { public static void main(String... 
args) throws IOException { final LanguageModels lm = TestLanguageModels.getLanguageModels(); - var data = Path.of("/home/vlofgren/Code/tmp-data/"); + var data = WmsaHome.getHomePath().resolve("test-data/"); System.out.println("Running"); @@ -55,7 +59,7 @@ class SentenceExtractorTest { var doc = Jsoup.parse(Files.readString(file.toPath())); long start = System.currentTimeMillis(); var dld = se.extractSentences(doc); - documentKeywordExtractor.extractKeywords(dld, new KeywordMetadata(0)); + documentKeywordExtractor.extractKeywords(dld, new KeywordMetadata()); total += (System.currentTimeMillis() - start); } System.out.println(total); @@ -65,7 +69,7 @@ class SentenceExtractorTest { @SneakyThrows @Test void testExtractSubject() { - var data = Path.of("/home/vlofgren/Code/tmp-data/"); + var data = WmsaHome.getHomePath().resolve("test-data/"); System.out.println("Running"); @@ -118,7 +122,7 @@ class SentenceExtractorTest { var newResult = newSe.extractSentences(Jsoup.parse(post.body)); - var newRes = documentKeywordExtractor.extractKeywords(newResult, new KeywordMetadata(0)); + var newRes = documentKeywordExtractor.extractKeywords(newResult, new KeywordMetadata()); System.out.println(newRes); }); reader.join(); @@ -130,7 +134,7 @@ class SentenceExtractorTest { } @Test void extractSentences() throws IOException { - var data = Path.of("/home/vlofgren/Code/tmp-data/"); + var data = WmsaHome.getHomePath().resolve("test-data/"); System.out.println("Running"); @@ -140,7 +144,7 @@ class SentenceExtractorTest { long st = System.currentTimeMillis(); for (var file : Objects.requireNonNull(data.toFile().listFiles())) { var newResult = newSe.extractSentences(Jsoup.parse(Files.readString(file.toPath()))); - var newRes = documentKeywordExtractor.extractKeywords(newResult, new KeywordMetadata(0)); + var newRes = documentKeywordExtractor.extractKeywords(newResult, new KeywordMetadata()); System.out.println(newRes); } System.out.println(System.currentTimeMillis() - st); @@ -149,12 +153,13 @@ class SentenceExtractorTest { @SneakyThrows @Test + @Disabled public void testSE() { var result = newSe.extractSentences( Jsoup.parse(Files.readString(Path.of("/home/vlofgren/man open (2) openat.html")))); var dict = new TermFrequencyDict(lm); - System.out.println(new DocumentKeywordExtractor(dict).extractKeywords(result, new KeywordMetadata(0))); + System.out.println(new DocumentKeywordExtractor(dict).extractKeywords(result, new KeywordMetadata())); // diff --git a/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/crawling/UrlBlocklistTest.java b/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/crawling/UrlBlocklistTest.java index 2987fde0..2460987a 100644 --- a/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/crawling/UrlBlocklistTest.java +++ b/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/crawling/UrlBlocklistTest.java @@ -15,7 +15,7 @@ class UrlBlocklistTest { void isUrlBlocked() throws URISyntaxException { UrlBlocklist blocklist = new UrlBlocklist(); assertTrue(blocklist.isUrlBlocked(new EdgeUrl("https://memex.marginalia.nu/ghc/ghc/blob/1b1067d14b656bbbfa7c47f156ec2700c9751549/compiler/main/UpdateCafInfos.hs"))); - assertTrue(blocklist.isUrlBlocked(new EdgeUrl("https://memex.marginalia.nu//gn/+/d62642c920e6a0d1756316d225a90fd6faa9e21e"))); +// assertTrue(blocklist.isUrlBlocked(new EdgeUrl("https://memex.marginalia.nu//gn/+/d62642c920e6a0d1756316d225a90fd6faa9e21e"))); assertTrue(blocklist.isUrlBlocked(new EdgeUrl("http://www.marginalia.nu/wp-content/uploads/test.jpg"))); 
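/*
 * Aside on the path changes in the tests above (PubDateSnifferTest,
 * SummaryExtractorTest, SentenceExtractorTest): fixtures that were hardcoded
 * to a developer's home directory now resolve against WmsaHome.getHomePath(),
 * so they can run on any machine with WMSA_HOME set. A minimal sketch of that
 * lookup pattern -- illustrative only, the real WmsaHome may differ:
 */
import java.nio.file.Path;

class HomePathSketch {
    static Path getHomePath() {
        String home = System.getenv("WMSA_HOME"); // assumed to point at the checkout
        if (home == null) throw new IllegalStateException("WMSA_HOME is not set");
        return Path.of(home);
    }

    public static void main(String[] args) {
        // e.g. resolves to $WMSA_HOME/test-data/url-327999153
        System.out.println(getHomePath().resolve("test-data/url-327999153"));
    }
}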
assertTrue(blocklist.isUrlBlocked(new EdgeUrl("http://yelenasimone.com/pdf/download-a-course-in-algebra.html"))); assertFalse(blocklist.isUrlBlocked(new EdgeUrl("http://yelenasimone.com/nope/x-a-course-in-algebra.html"))); diff --git a/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/index/model/EdgePageDocumentsMetadataTest.java b/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/index/model/EdgePageDocumentsMetadataTest.java new file mode 100644 index 00000000..9c0d9beb --- /dev/null +++ b/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/index/model/EdgePageDocumentsMetadataTest.java @@ -0,0 +1,63 @@ +package nu.marginalia.wmsa.edge.index.model; + + +import org.junit.jupiter.api.Test; + +import static org.junit.jupiter.api.Assertions.assertEquals; + +class EdgePageDocumentsMetadataTest { + + @Test + public void codecYear() { + var meta = new EdgePageDocumentsMetadata(0, 0, 192, 0, 0, (byte) 0); + long encoded = meta.encode(); + var decoded = new EdgePageDocumentsMetadata(encoded); + assertEquals(192, decoded.year()); + } + + @Test + public void codecTopology() { + var meta = new EdgePageDocumentsMetadata(0, 192, 0, 0, 0, (byte) 0); + long encoded = meta.encode(); + var decoded = new EdgePageDocumentsMetadata(encoded); + assertEquals(192, decoded.topology()); + } + + @Test + public void codecSets() { + var meta = new EdgePageDocumentsMetadata(0, 0, 0, 14, 0, (byte) 0); + long encoded = meta.encode(); + var decoded = new EdgePageDocumentsMetadata(encoded); + assertEquals(14, decoded.sets()); + } + + @Test + public void codecQuality() { + var meta = new EdgePageDocumentsMetadata(0, 0, 0, 0, 9, (byte) 0); + long encoded = meta.encode(); + var decoded = new EdgePageDocumentsMetadata(encoded); + assertEquals(9, decoded.quality()); + } + + @Test + public void codecFlags() { + var meta = new EdgePageDocumentsMetadata(0, 0, 0, 0, 0, (byte) 255); + long encoded = meta.encode(); + System.out.println(Long.toHexString(encoded)); + var decoded = new EdgePageDocumentsMetadata(encoded); + System.out.println(decoded); + assertEquals((byte) 255, decoded.flags()); + } + + @Test + public void encSize() { + assertEquals(100, new EdgePageDocumentsMetadata(0).withSize(145).size()); + assertEquals(100, EdgePageDocumentsMetadata.decodeSize(new EdgePageDocumentsMetadata(0).withSize(145).encode())); + + assertEquals(50, new EdgePageDocumentsMetadata(0).withSize(4).size()); + assertEquals(50, EdgePageDocumentsMetadata.decodeSize(new EdgePageDocumentsMetadata(0).withSize(4).encode())); + + assertEquals(50*255, EdgePageDocumentsMetadata.decodeSize(new EdgePageDocumentsMetadata(0).withSize(Integer.MAX_VALUE).encode())); + assertEquals(50*255, new EdgePageDocumentsMetadata(0).withSize(Integer.MAX_VALUE).size()); + } +} \ No newline at end of file diff --git a/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/index/postings/forward/ForwardIndexConverterTest.java b/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/index/postings/forward/ForwardIndexConverterTest.java new file mode 100644 index 00000000..a4b97a7a --- /dev/null +++ b/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/index/postings/forward/ForwardIndexConverterTest.java @@ -0,0 +1,122 @@ +package nu.marginalia.wmsa.edge.index.postings.forward; + +import lombok.SneakyThrows; +import nu.marginalia.util.array.LongArray; +import nu.marginalia.util.dict.DictionaryHashMap; +import nu.marginalia.util.test.TestUtil; +import nu.marginalia.wmsa.edge.index.lexicon.KeywordLexicon; +import 
nu.marginalia.wmsa.edge.index.lexicon.journal.KeywordLexiconJournal; +import nu.marginalia.wmsa.edge.index.postings.journal.model.SearchIndexJournalEntry; +import nu.marginalia.wmsa.edge.index.postings.journal.model.SearchIndexJournalEntryHeader; +import nu.marginalia.wmsa.edge.index.postings.journal.reader.SearchIndexJournalReaderSingleFile; +import nu.marginalia.wmsa.edge.index.postings.journal.writer.SearchIndexJournalWriterImpl; +import org.junit.jupiter.api.AfterEach; +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.Test; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.io.IOException; +import java.nio.file.Files; +import java.nio.file.Path; +import java.util.stream.IntStream; + +import static org.junit.jupiter.api.Assertions.assertEquals; + +class ForwardIndexConverterTest { + + KeywordLexicon keywordLexicon; + SearchIndexJournalWriterImpl writer; + + Path indexFile; + Path wordsFile1; + Path urlsFile1; + Path dictionaryFile; + + private final Logger logger = LoggerFactory.getLogger(getClass()); + + Path dataDir; + private Path wordsFile; + private Path docsFileId; + private Path docsFileData; + + int workSetSize = 512; + @BeforeEach + @SneakyThrows + void setUp() { + dictionaryFile = Files.createTempFile("tmp", ".dict"); + dictionaryFile.toFile().deleteOnExit(); + + keywordLexicon = new KeywordLexicon(new KeywordLexiconJournal(dictionaryFile.toFile()), new DictionaryHashMap(1L<<18)); + keywordLexicon.getOrInsert("0"); + + indexFile = Files.createTempFile("tmp", ".idx"); + indexFile.toFile().deleteOnExit(); + writer = new SearchIndexJournalWriterImpl(keywordLexicon, indexFile.toFile()); + + wordsFile1 = Files.createTempFile("words1", ".idx"); + urlsFile1 = Files.createTempFile("urls1", ".idx"); + + dataDir = Files.createTempDirectory(getClass().getSimpleName()); + + for (int i = 1; i < workSetSize; i++) { + createEntry(writer, keywordLexicon, i); + } + + + keywordLexicon.commitToDisk(); + Thread.sleep(1000); + writer.forceWrite(); + + + var reader = new SearchIndexJournalReaderSingleFile(LongArray.mmapRead(indexFile)); + + wordsFile = dataDir.resolve("words.dat"); + docsFileId = dataDir.resolve("docs-i.dat"); + docsFileData = dataDir.resolve("docs-d.dat"); + } + + @AfterEach + public void tearDown() { + TestUtil.clearTempDir(dataDir); + } + + public int[] getFactorsI(int id) { + return IntStream.rangeClosed(1, id).filter(v -> (id % v) == 0).toArray(); + } + + long createId(long url, long domain) { + return (domain << 32) | url; + } + public void createEntry(SearchIndexJournalWriterImpl writer, KeywordLexicon keywordLexicon, int id) { + int[] factors = getFactorsI(id); + var header = new SearchIndexJournalEntryHeader(factors.length, createId(id, id/20), id % 5); + + long[] data = new long[factors.length*2]; + for (int i = 0; i < factors.length; i++) { + data[2*i] = keywordLexicon.getOrInsert(Integer.toString(factors[i])); + data[2*i + 1] = -factors[i]; + } + + writer.put(header, new SearchIndexJournalEntry(data)); + } + + @Test + void testForwardIndex() throws IOException { + + Path tmpDir = Path.of("/tmp"); + + new ForwardIndexConverter(tmpDir, indexFile.toFile(), docsFileId, docsFileData).convert(); + + var forwardReader = new ForwardIndexReader(docsFileId, docsFileData); + + for (int i = 36; i < workSetSize; i++) { + assertEquals(i % 5, forwardReader.getDocMeta(i)); + assertEquals(i/20, forwardReader.getDomainId(i)); + } + + TestUtil.clearTempDir(dataDir); + } + + +} \ No newline at end of file diff --git 
a/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/index/postings/reverse/ReverseIndexConverterTest.java b/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/index/postings/reverse/ReverseIndexConverterTest.java new file mode 100644 index 00000000..4c210c54 --- /dev/null +++ b/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/index/postings/reverse/ReverseIndexConverterTest.java @@ -0,0 +1,131 @@ +package nu.marginalia.wmsa.edge.index.postings.reverse; + +import lombok.SneakyThrows; +import nu.marginalia.util.array.LongArray; +import nu.marginalia.util.array.buffer.LongQueryBuffer; +import nu.marginalia.util.dict.DictionaryHashMap; +import nu.marginalia.util.test.TestUtil; +import nu.marginalia.wmsa.edge.index.lexicon.KeywordLexicon; +import nu.marginalia.wmsa.edge.index.lexicon.journal.KeywordLexiconJournal; +import nu.marginalia.wmsa.edge.index.model.EdgePageDocumentsMetadata; +import nu.marginalia.wmsa.edge.index.postings.journal.model.SearchIndexJournalEntry; +import nu.marginalia.wmsa.edge.index.postings.journal.model.SearchIndexJournalEntryHeader; +import nu.marginalia.wmsa.edge.index.postings.journal.reader.SearchIndexJournalReaderSingleFile; +import nu.marginalia.wmsa.edge.index.postings.journal.writer.SearchIndexJournalWriterImpl; +import nu.marginalia.wmsa.edge.index.postings.reverse.query.ReverseIndexEntrySourceBehavior; +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.Test; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.io.IOException; +import java.nio.file.Files; +import java.nio.file.Path; +import java.util.stream.IntStream; +import java.util.stream.LongStream; + +import static org.junit.jupiter.api.Assertions.assertArrayEquals; + +class ReverseIndexConverterTest { + KeywordLexicon keywordLexicon; + SearchIndexJournalWriterImpl writer; + + Path indexFile; + Path wordsFile1; + Path urlsFile1; + Path dictionaryFile; + + private final Logger logger = LoggerFactory.getLogger(getClass()); + + @BeforeEach + @SneakyThrows + void setUp() { + dictionaryFile = Files.createTempFile("tmp", ".dict"); + dictionaryFile.toFile().deleteOnExit(); + + keywordLexicon = new KeywordLexicon(new KeywordLexiconJournal(dictionaryFile.toFile()), new DictionaryHashMap(1L<<16)); + keywordLexicon.getOrInsert("0"); + + indexFile = Files.createTempFile("tmp", ".idx"); + indexFile.toFile().deleteOnExit(); + writer = new SearchIndexJournalWriterImpl(keywordLexicon, indexFile.toFile()); + + wordsFile1 = Files.createTempFile("words1", ".idx"); + urlsFile1 = Files.createTempFile("urls1", ".idx"); + } + + public void createEntry(SearchIndexJournalWriterImpl writer, KeywordLexicon keywordLexicon, int id) { + int[] factors = IntStream.rangeClosed(1, id).filter(v -> (id % v) == 0).toArray(); + var header = new SearchIndexJournalEntryHeader(factors.length, id, EdgePageDocumentsMetadata.defaultValue()); + + long[] data = new long[factors.length*2]; + for (int i = 0; i < factors.length; i++) { + data[2*i] = keywordLexicon.getOrInsert(Integer.toString(factors[i])); + data[2*i + 1] = factors[i]; + } + + writer.put(header, new SearchIndexJournalEntry(data)); + } + + @Test + void testReverseIndex() throws IOException, InterruptedException { + for (int i = 1; i < 512; i++) { + createEntry(writer, keywordLexicon, i); + } + + + keywordLexicon.commitToDisk(); + Thread.sleep(1000); + writer.forceWrite(); + + + Path tmpDir = Path.of("/tmp"); + Path dataDir = Files.createTempDirectory(getClass().getSimpleName()); + + var wordsFile = dataDir.resolve("urls.dat"); + var 
docsFile = dataDir.resolve("docs.dat"); + var journalReader = new SearchIndexJournalReaderSingleFile(LongArray.mmapRead(indexFile)); + + new ReverseIndexConverter(tmpDir, journalReader, wordsFile, docsFile) + .convert(); + + var reverseIndexReader = new ReverseIndexReader(wordsFile, docsFile); + + System.out.println(reverseIndexReader.numDocuments(keywordLexicon.getReadOnly("1"))); + System.out.println(reverseIndexReader.numDocuments(keywordLexicon.getReadOnly("2"))); + System.out.println(reverseIndexReader.numDocuments(keywordLexicon.getReadOnly("3"))); + + System.out.println(reverseIndexReader.isWordInDoc(keywordLexicon.getReadOnly("1"), 1)); + System.out.println(reverseIndexReader.isWordInDoc(keywordLexicon.getReadOnly("2"), 1)); + System.out.println(reverseIndexReader.isWordInDoc(keywordLexicon.getReadOnly("1"), 2)); + System.out.println(reverseIndexReader.isWordInDoc(keywordLexicon.getReadOnly("2"), 2)); + System.out.println(reverseIndexReader.isWordInDoc(keywordLexicon.getReadOnly("1"), 3)); + System.out.println(reverseIndexReader.isWordInDoc(keywordLexicon.getReadOnly("2"), 3)); + + var buffer = new LongQueryBuffer(32); + reverseIndexReader.documents(keywordLexicon.getReadOnly("1"), ReverseIndexEntrySourceBehavior.DO_PREFER).read(buffer); + assertArrayEquals(LongStream.range(1, 17).toArray(), buffer.copyData()); + System.out.println(buffer); + + buffer.reset(); + reverseIndexReader.documents(keywordLexicon.getReadOnly("2"), ReverseIndexEntrySourceBehavior.DO_PREFER).read(buffer); + assertArrayEquals(LongStream.range(1, 17).map(v -> v*2).toArray(), buffer.copyData()); + System.out.println(buffer); + + buffer.reset(); + reverseIndexReader.documents(keywordLexicon.getReadOnly("3"), ReverseIndexEntrySourceBehavior.DO_PREFER).read(buffer); + assertArrayEquals(LongStream.range(1, 17).map(v -> v*3).toArray(), buffer.copyData()); + System.out.println(buffer); + + buffer.reset(); + var es = reverseIndexReader.documents(keywordLexicon.getReadOnly("7"), ReverseIndexEntrySourceBehavior.DO_PREFER); + do { + buffer.reset(); + es.read(buffer); + System.out.println(buffer); + } while (es.hasMore()); + + + TestUtil.clearTempDir(dataDir); + } +} \ No newline at end of file diff --git a/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/index/postings/reverse/ReverseIndexConverterTest2.java b/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/index/postings/reverse/ReverseIndexConverterTest2.java new file mode 100644 index 00000000..6efcbbd3 --- /dev/null +++ b/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/index/postings/reverse/ReverseIndexConverterTest2.java @@ -0,0 +1,161 @@ +package nu.marginalia.wmsa.edge.index.postings.reverse; + +import lombok.SneakyThrows; +import nu.marginalia.util.array.LongArray; +import nu.marginalia.util.array.buffer.LongQueryBuffer; +import nu.marginalia.util.dict.DictionaryHashMap; +import nu.marginalia.util.test.TestUtil; +import nu.marginalia.wmsa.edge.index.lexicon.KeywordLexicon; +import nu.marginalia.wmsa.edge.index.lexicon.journal.KeywordLexiconJournal; +import nu.marginalia.wmsa.edge.index.postings.journal.model.SearchIndexJournalEntry; +import nu.marginalia.wmsa.edge.index.postings.journal.model.SearchIndexJournalEntryHeader; +import nu.marginalia.wmsa.edge.index.postings.journal.reader.SearchIndexJournalReaderSingleFile; +import nu.marginalia.wmsa.edge.index.postings.journal.writer.SearchIndexJournalWriterImpl; +import nu.marginalia.wmsa.edge.index.postings.reverse.query.ReverseIndexEntrySourceBehavior; +import org.junit.jupiter.api.AfterEach; +import 
org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.Test; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.io.IOException; +import java.nio.file.Files; +import java.nio.file.Path; +import java.util.Arrays; +import java.util.stream.IntStream; +import java.util.stream.LongStream; + +class ReverseIndexConverterTest2 { + + KeywordLexicon keywordLexicon; + SearchIndexJournalWriterImpl writer; + + Path indexFile; + Path wordsFile1; + Path urlsFile1; + Path dictionaryFile; + + private final Logger logger = LoggerFactory.getLogger(getClass()); + + Path dataDir; + private Path wordsFile; + private Path docsFile; + + int workSetSize = 8192; + int workSetStart = 8000; + + @BeforeEach + @SneakyThrows + void setUp() { + dictionaryFile = Files.createTempFile("tmp", ".dict"); + dictionaryFile.toFile().deleteOnExit(); + + keywordLexicon = new KeywordLexicon(new KeywordLexiconJournal(dictionaryFile.toFile()), new DictionaryHashMap(1L<<18)); + keywordLexicon.getOrInsert("0"); + + indexFile = Files.createTempFile("tmp", ".idx"); + indexFile.toFile().deleteOnExit(); + writer = new SearchIndexJournalWriterImpl(keywordLexicon, indexFile.toFile()); + + wordsFile1 = Files.createTempFile("words1", ".idx"); + urlsFile1 = Files.createTempFile("urls1", ".idx"); + + dataDir = Files.createTempDirectory(getClass().getSimpleName()); + + for (int i = 1; i < workSetSize; i++) { + if (i < workSetStart) { + keywordLexicon.getOrInsert(Integer.toString(i)); + } + else { + createEntry(writer, keywordLexicon, i); + } + } + + keywordLexicon.commitToDisk(); + Thread.sleep(1000); + writer.forceWrite(); + + var reader = new SearchIndexJournalReaderSingleFile(LongArray.mmapRead(indexFile)); + + wordsFile = dataDir.resolve("words.dat"); + docsFile = dataDir.resolve("docs.dat"); + } + + @AfterEach + public void tearDown() { + TestUtil.clearTempDir(dataDir); + } + + public int[] getFactorsI(int id) { + return IntStream.rangeClosed(1, id-1).toArray(); + } + public long[] getFactorsL(int id) { + return LongStream.rangeClosed(1, id).filter(v -> (id % v) == 0).toArray(); + } + + long createId(long url, long domain) { + return (domain << 32) | url; + } + public void createEntry(SearchIndexJournalWriterImpl writer, KeywordLexicon keywordLexicon, int id) { + int[] factors = getFactorsI(id); + var header = new SearchIndexJournalEntryHeader(factors.length, createId(id, id/20), id % 5); + + long[] data = new long[factors.length*2]; + for (int i = 0; i < factors.length; i++) { + data[2*i] = keywordLexicon.getOrInsert(Integer.toString(factors[i])); + data[2*i + 1] = (i % 21 != 0) ? 
0 : -factors[i]; + } + + writer.put(header, new SearchIndexJournalEntry(data)); + } + + @Test + void testRev2() throws IOException { + + Path tmpDir = Path.of("/tmp"); + + new ReverseIndexConverter(tmpDir, new SearchIndexJournalReaderSingleFile(LongArray.mmapRead(indexFile)), wordsFile, docsFile).convert(); + + var reverseReader = new ReverseIndexReader(wordsFile, docsFile); + + for (int i = workSetStart; i < workSetSize; i++) { + + var es = reverseReader.documents(i, ReverseIndexEntrySourceBehavior.DO_PREFER); + LongQueryBuffer lqb = new LongQueryBuffer(100); + while (es.hasMore()) { + lqb.reset(); + es.read(lqb); + System.out.println(Arrays.toString(Arrays.copyOf(lqb.data, lqb.end))); + } + System.out.println("--"); + } + + TestUtil.clearTempDir(dataDir); + } + + + @Test + void testRevP() throws IOException { + + Path tmpDir = Path.of("/tmp"); + + new ReverseIndexConverter(tmpDir, new SearchIndexJournalReaderSingleFile(LongArray.mmapRead(indexFile), null, ReverseIndexPriorityParameters::filterPriorityRecord), wordsFile, docsFile).convert(); + + var reverseReader = new ReverseIndexReader(wordsFile, docsFile); + + for (int i = workSetStart; i < workSetSize; i++) { + + var es = reverseReader.documents(i, ReverseIndexEntrySourceBehavior.DO_PREFER); + LongQueryBuffer lqb = new LongQueryBuffer(100); + while (es.hasMore()) { + lqb.reset(); + es.read(lqb); + System.out.println(Arrays.toString(Arrays.copyOf(lqb.data, lqb.end))); + } + System.out.println("--"); + } + + TestUtil.clearTempDir(dataDir); + } + +} \ No newline at end of file diff --git a/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/index/reader/MicroBTreeCachedIndexTest.java b/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/index/reader/MicroBTreeCachedIndexTest.java deleted file mode 100644 index 0daeecbe..00000000 --- a/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/index/reader/MicroBTreeCachedIndexTest.java +++ /dev/null @@ -1,52 +0,0 @@ -package nu.marginalia.wmsa.edge.index.reader; - -import org.junit.jupiter.api.BeforeEach; -import org.junit.jupiter.api.Test; - -import static org.junit.jupiter.api.Assertions.assertEquals; - -class MicroBTreeCachedIndexTest { - MicroCache mc; - - @BeforeEach - public void setUp() { - mc = new MicroCache(8); - } - - @Test - public void testSunnyDay() { - mc.set(1, 2); - mc.set(2, 4); - mc.set(4, 8); - mc.set(8, 16); - assertEquals(2, mc.get(1)); - assertEquals(4, mc.get(2)); - assertEquals(8, mc.get(4)); - assertEquals(16, mc.get(8)); - assertEquals(MicroCache.BAD_VALUE, mc.get(16)); - } - - @Test - public void testRollOver() { - mc.set(1, 2); - mc.set(2, 4); - mc.set(4, 8); - mc.set(8, 16); - mc.set(16, 32); - mc.set(32, 64); - mc.set(64, 128); - mc.set(128, 256); - mc.set(256, 512); - - assertEquals(MicroCache.BAD_VALUE, mc.get(1)); - assertEquals(4, mc.get(2)); - assertEquals(8, mc.get(4)); - assertEquals(16, mc.get(8)); - assertEquals(32, mc.get(16)); - assertEquals(64, mc.get(32)); - assertEquals(128, mc.get(64)); - assertEquals(256, mc.get(128)); - assertEquals(512, mc.get(256)); - } - -} \ No newline at end of file diff --git a/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/index/service/DictionaryWriterTest.java b/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/index/service/DictionaryWriterTest.java deleted file mode 100644 index 48ee7c83..00000000 --- a/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/index/service/DictionaryWriterTest.java +++ /dev/null @@ -1,221 +0,0 @@ -package nu.marginalia.wmsa.edge.index.service; - -import lombok.SneakyThrows; -import 
diff --git a/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/index/service/DictionaryWriterTest.java b/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/index/service/DictionaryWriterTest.java
deleted file mode 100644
index 48ee7c83..00000000
--- a/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/index/service/DictionaryWriterTest.java
+++ /dev/null
@@ -1,221 +0,0 @@
-package nu.marginalia.wmsa.edge.index.service;
-
-import lombok.SneakyThrows;
-import nu.marginalia.util.dict.DictionaryHashMap;
-import nu.marginalia.wmsa.edge.index.conversion.SearchIndexConverter;
-import nu.marginalia.wmsa.edge.index.conversion.SearchIndexPartitioner;
-import nu.marginalia.wmsa.edge.index.lexicon.KeywordLexicon;
-import nu.marginalia.wmsa.edge.index.lexicon.KeywordLexiconReadOnlyView;
-import nu.marginalia.wmsa.edge.index.lexicon.journal.KeywordLexiconJournal;
-import nu.marginalia.wmsa.edge.index.model.IndexBlock;
-import org.junit.jupiter.api.Disabled;
-import org.junit.jupiter.api.Test;
-
-import java.io.File;
-import java.io.IOException;
-import java.nio.file.Files;
-import java.nio.file.Path;
-
-import static org.junit.jupiter.api.Assertions.assertEquals;
-import static org.junit.jupiter.api.Assertions.assertNotEquals;
-
-class DictionaryWriterTest {
-    /*
-    @Test @Disabled
-
-    public void analyze2() throws IOException {
-        System.out.println("Loading dictionary");
-        var dr = new DictionaryReader(null, new File("/home/vlofgren/dictionary.dat"));
-        System.out.println("Loading indices");
-        var reader = new SearchIndexReader(new SearchIndex("test", Path.of("/tmp"),
-                new File("/tmp/urls-0"),
-                new File("/tmp/words-0")),
-            new SearchIndex("test", Path.of("/tmp"),
-                new File("/tmp/urls-24"),
-                new File("/tmp/words-24")));
-        System.out.println("Gogo");
-        long hitsTotal = 0L;
-        try (var wr = new PrintWriter(new FileOutputStream("/home/vlofgren/words-count"))) {
-            hitsTotal = dr.stream().mapToLong(w -> {
-                long hits = reader.numHits(dr.get(w));
-                wr.printf("%08d %s\n", hits, w);
-                return hits;
-            }).sum();
-        }
-        System.out.println(hitsTotal);
-    }
-    */
-    @Test @Disabled @SneakyThrows
-    public void convert() {
-        new SearchIndexConverter(IndexBlock.Title, 0, Path.of("/tmp"),
-                new File("/home/vlofgren/page-index-0.dat"),
-                new File("/tmp/words-0"),
-                new File("/tmp/urls-0"),
-                new SearchIndexPartitioner(null),
-                val -> false);
-    }
-
-    KeywordLexiconJournal createJournal(File f) throws IOException {
-        return new KeywordLexiconJournal(f);
-    }
-
-    @SneakyThrows
-    @Test
-    @Disabled
-    void test() {
-        try (var dict = new KeywordLexicon(createJournal(Path.of("/home/vlofgren/Code/data/dictionary.dat").toFile()), new DictionaryHashMap(1L<<16))) {
-            wait();
-        }
-    }
-
-
-    @SneakyThrows
-    @Test
-    void getFold() {
-        var path = Files.createTempFile("dict", ".tmp");
-        try (var dict = new KeywordLexicon(createJournal(path.toFile()), new DictionaryHashMap(1L<<16))) {
-            dict.getOrInsert("hic");
-            dict.getOrInsert("hac");
-            dict.commitToDisk();
-            dict.getOrInsert("quae");
-            dict.getOrInsert("quis");
-            dict.getOrInsert("quem1");
-            dict.getOrInsert("quem2");
-            dict.getOrInsert("quem3");
-            dict.getOrInsert("quem4");
-            dict.getOrInsert("quem5");
-            dict.getOrInsert("quem6");
-            dict.getOrInsert("quem7");
-            dict.getOrInsert("quem8");
-            dict.getOrInsert("quem9");
-            dict.getOrInsert("quem10");
-            dict.getOrInsert("cuis");
-            dict.getOrInsert("haec_hic");
-            dict.getOrInsert("hoc_hac_cuis");
-            dict.commitToDisk();
-            assertNotEquals(0, dict.getOrInsert("hac"));
-            assertEquals(0, dict.getOrInsert("hic"));
-        }
-
-        try (var dict = new KeywordLexicon(createJournal(path.toFile()), new DictionaryHashMap(1L<<16))) {
-            assertNotEquals(0, dict.getOrInsert("hoc"));
-            assertEquals(0, dict.getOrInsert("hic"));
-        }
-
-        path.toFile().delete();
-    }
-
-    @SneakyThrows
-    @Test
-    void get() {
-        var path = Files.createTempFile("dict", ".tmp");
-        try (var dict = new KeywordLexicon(createJournal(path.toFile()), new DictionaryHashMap(1L<<16))) {
-            dict.getOrInsert("hic");
-            dict.getOrInsert("hac");
-            dict.getOrInsert("haec");
-            dict.getOrInsert("hoc");
-            dict.commitToDisk();
-            dict.getOrInsert("quae");
-            dict.getOrInsert("quis");
-            dict.getOrInsert("quem");
-            dict.getOrInsert("cuis");
-            dict.commitToDisk();
-            assertNotEquals(0, dict.getOrInsert("hac"));
-            assertEquals(0, dict.getOrInsert("hic"));
-        }
-
-        try (var dict = new KeywordLexicon(createJournal(path.toFile()), new DictionaryHashMap(1L<<16))) {
-            assertNotEquals(0, dict.getOrInsert("hoc"));
-            assertEquals(0, dict.getOrInsert("hic"));
-        }
-
-        path.toFile().delete();
-    }
-
-    @SneakyThrows
-    @Test
-    void getDoubleWrite() {
-        var path = Files.createTempFile("dict", ".tmp");
-
-        try (var dict = new KeywordLexicon(createJournal(path.toFile()), new DictionaryHashMap(1L<<16))) {
-            dict.commitToDisk();
-        }
-
-        try (var dict = new KeywordLexicon(createJournal(path.toFile()), new DictionaryHashMap(1L<<16))) {
-            dict.getOrInsert("hic");
-            dict.getOrInsert("hac");
-            dict.getOrInsert("haec");
-            dict.getOrInsert("hoc");
-            dict.getOrInsert("quae");
-            dict.getOrInsert("quis");
-            dict.getOrInsert("quem");
-            dict.getOrInsert("cuis");
-            dict.commitToDisk();
-            assertNotEquals(0, dict.getOrInsert("hac"));
-            assertEquals(0, dict.getOrInsert("hic"));
-        }
-
-        var dict = new KeywordLexiconReadOnlyView(new KeywordLexicon(createJournal(path.toFile()), new DictionaryHashMap(1L<<16)));
-
-        assertNotEquals(0, dict.get("hoc"));
-        assertEquals(0, dict.get("hic"));
-
-        path.toFile().delete();
-    }
-
-    @SneakyThrows
-    @Test
-    void getDoubleWrite2() {
-        var path = Files.createTempFile("dict", ".tmp");
-
-        try (var dict = new KeywordLexicon(createJournal(path.toFile()), new DictionaryHashMap(1L<<16))) {
-            dict.getOrInsert("hic");
-            dict.getOrInsert("hac");
-            dict.getOrInsert("haec");
-            dict.getOrInsert("hoc");
-            dict.getOrInsert("quae");
-            dict.getOrInsert("quis");
-            dict.getOrInsert("quem");
-            dict.getOrInsert("cuis");
-            dict.commitToDisk();
-            assertNotEquals(0, dict.getOrInsert("hac"));
-            assertEquals(0, dict.getOrInsert("hic"));
-        }
-
-        try (var dict = new KeywordLexicon(createJournal(path.toFile()), new DictionaryHashMap(1L<<16))) {
-            dict.getOrInsert("fe");
-            dict.getOrInsert("fi");
-            dict.getOrInsert("fo");
-            dict.getOrInsert("fum");
-            dict.commitToDisk();
-            assertNotEquals(0, dict.getOrInsert("hac"));
-            assertEquals(0, dict.getOrInsert("hic"));
-        }
-
-        try (var dict = new KeywordLexicon(createJournal(path.toFile()), new DictionaryHashMap(1L<<16))) {
-            dict.getOrInsert("bip");
-            dict.getOrInsert("bap");
-            dict.commitToDisk();
-        }
-
-
-        var dict = new KeywordLexiconReadOnlyView(new KeywordLexicon(createJournal(path.toFile()), new DictionaryHashMap(1L<<16)));
-
-        assertEquals(0, dict.get("hic"));
-        assertEquals(1, dict.get("hac"));
-        assertEquals(2, dict.get("haec"));
-        assertEquals(3, dict.get("hoc"));
-        assertEquals(4, dict.get("quae"));
-        assertEquals(5, dict.get("quis"));
-        assertEquals(6, dict.get("quem"));
-        assertEquals(7, dict.get("cuis"));
-        assertEquals(8, dict.get("fe"));
-        assertEquals(9, dict.get("fi"));
-        assertEquals(10, dict.get("fo"));
-        assertEquals(11, dict.get("fum"));
-        assertEquals(12, dict.get("bip"));
-        assertEquals(13, dict.get("bap"));
-        path.toFile().delete();
-    }
-
-}
\ No newline at end of file
diff --git a/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/index/service/EdgeIndexIntegrationTest.java b/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/index/service/EdgeIndexIntegrationTest.java
new file mode 100644
index 00000000..a59e4fd0
--- /dev/null
+++ b/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/index/service/EdgeIndexIntegrationTest.java
@@ -0,0 +1,200 @@
+package nu.marginalia.wmsa.edge.index.service;
+
+import com.google.inject.Guice;
+import com.google.inject.Inject;
+import nu.marginalia.wmsa.configuration.server.Initialization;
+import nu.marginalia.wmsa.edge.index.model.EdgePageDocumentsMetadata;
+import nu.marginalia.wmsa.edge.index.model.EdgePageWordFlags;
+import nu.marginalia.wmsa.edge.index.model.EdgePageWordMetadata;
+import nu.marginalia.wmsa.edge.index.model.QueryStrategy;
+import nu.marginalia.wmsa.edge.index.postings.SearchIndexControl;
+import nu.marginalia.wmsa.edge.index.postings.journal.model.SearchIndexJournalEntry;
+import nu.marginalia.wmsa.edge.index.postings.journal.model.SearchIndexJournalEntryHeader;
+import nu.marginalia.wmsa.edge.index.svc.EdgeIndexLexiconService;
+import nu.marginalia.wmsa.edge.index.svc.EdgeIndexOpsService;
+import nu.marginalia.wmsa.edge.index.svc.EdgeIndexQueryService;
+import nu.marginalia.wmsa.edge.index.svc.searchset.SearchSetIdentifier;
+import nu.marginalia.wmsa.edge.model.search.EdgeSearchResultItem;
+import nu.marginalia.wmsa.edge.model.search.EdgeSearchSpecification;
+import nu.marginalia.wmsa.edge.model.search.EdgeSearchSubquery;
+import nu.marginalia.wmsa.edge.model.search.domain.SpecificationLimit;
+import org.junit.jupiter.api.AfterEach;
+import org.junit.jupiter.api.Assertions;
+import org.junit.jupiter.api.BeforeEach;
+import org.junit.jupiter.api.Test;
+import org.junit.jupiter.api.parallel.Execution;
+import spark.Spark;
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.Collections;
+import java.util.EnumSet;
+import java.util.List;
+import java.util.stream.IntStream;
+
+import static org.junit.jupiter.api.parallel.ExecutionMode.SAME_THREAD;
+
+@Execution(SAME_THREAD)
+public class EdgeIndexIntegrationTest {
+
+    @Inject
+    Initialization initialization;
+    @Inject
+    EdgeIndexLexiconService lexiconService;
+    @Inject
+    EdgeIndexQueryService queryService;
+    @Inject
+    EdgeIndexOpsService opsService;
+
+    @Inject
+    SearchIndexControl searchIndexControl;
+
+    EdgeIndexIntegrationTestModule testModule;
+
+    @BeforeEach
+    public void setUp() throws IOException, InterruptedException {
+
+        testModule = new EdgeIndexIntegrationTestModule();
+        Guice.createInjector(testModule).injectMembers(this);
+
+        initialization.setReady();
+        searchIndexControl.initialize(initialization);
+    }
+
+    @AfterEach
+    public void tearDown() throws IOException {
+        testModule.cleanUp();
+
+        Spark.stop();
+    }
+
+    @Test
+    public void willItBlend() throws Exception {
+        for (int i = 1; i < 512; i++) {
+            loadData(i);
+        }
+        searchIndexControl.getIndexWriter(0).flushWords();
+        Thread.sleep(100);
+
+        opsService.reindexEndpoint(null, null);
+
+        var rsp = queryService.query(
+                EdgeSearchSpecification.builder()
+                        .timeoutMs(Integer.MAX_VALUE)
+                        .fetchSize(4000)
+                        .limitTotal(10)
+                        .limitByDomain(10)
+                        .queryStrategy(QueryStrategy.SENTENCE)
+                        .year(SpecificationLimit.none())
+                        .quality(SpecificationLimit.none())
+                        .size(SpecificationLimit.none())
+                        .domains(new ArrayList<>())
+                        .searchSetIdentifier(SearchSetIdentifier.NONE)
+                        .subqueries(List.of(new EdgeSearchSubquery(
+                                List.of("3", "5", "2"), List.of("4"), Collections.emptyList(), Collections.emptyList()
+                        ))).build());
+
+        Assertions.assertArrayEquals(
+                new int[] { 30, 90, 150, 210, 270, 330, 390, 450, 510 },
+                rsp.results
+                        .stream()
+                        .mapToInt(EdgeSearchResultItem::getUrlIdInt)
+                        .toArray());
+    }
+
+
+    @Test
+    public void testDomainQuery() throws Exception {
+        for (int i = 1; i < 512; i++) {
+            loadDataWithDomain(i/100, i);
+        }
+        searchIndexControl.getIndexWriter(0).flushWords();
+        Thread.sleep(100);
+
+        opsService.reindexEndpoint(null, null);
+
+        var rsp = queryService.query(
+                EdgeSearchSpecification.builder()
+                        .timeoutMs(Integer.MAX_VALUE)
+                        .fetchSize(4000)
+                        .limitTotal(10)
+                        .limitByDomain(10)
+                        .year(SpecificationLimit.none())
+                        .quality(SpecificationLimit.none())
+                        .size(SpecificationLimit.none())
+                        .queryStrategy(QueryStrategy.SENTENCE)
+                        .domains(List.of(2))
+                        .subqueries(List.of(new EdgeSearchSubquery(
+                                List.of("3", "5", "2"), List.of("4"), Collections.emptyList(), Collections.emptyList()
+                        ))).build());
+        Assertions.assertArrayEquals(
+                new int[] { 210, 270 },
+                rsp.results.stream().mapToInt(EdgeSearchResultItem::getUrlIdInt).toArray());
+    }
+
+    @Test
+    public void testYearQuery() throws Exception {
+        for (int i = 1; i < 512; i++) {
+            loadData(i);
+        }
+        searchIndexControl.getIndexWriter(0).flushWords();
+        Thread.sleep(100);
+
+        opsService.reindexEndpoint(null, null);
+
+        var rsp = queryService.query(
+                EdgeSearchSpecification.builder()
+                        .timeoutMs(Integer.MAX_VALUE)
+                        .fetchSize(4000)
+                        .limitTotal(10)
+                        .limitByDomain(10)
+                        .quality(SpecificationLimit.none())
+                        .year(SpecificationLimit.equals(1998))
+                        .size(SpecificationLimit.none())
+                        .queryStrategy(QueryStrategy.SENTENCE)
+                        .searchSetIdentifier(SearchSetIdentifier.NONE)
+                        .subqueries(List.of(new EdgeSearchSubquery(
+                                List.of("4"), Collections.emptyList(), Collections.emptyList(), Collections.emptyList()
+                        ))
+                ).build());
+
+        Assertions.assertArrayEquals(
+                new int[] { 12, 72, 132, 192, 252, 312, 372, 432, 492, 32 },
+                rsp.results.stream().mapToInt(EdgeSearchResultItem::getUrlIdInt).toArray());
+    }
+
+
+
+    public void loadData(int id) {
+        int[] factors = IntStream
+                .rangeClosed(1, id)
+                .filter(v -> (id % v) == 0)
+                .toArray();
+
+        long fullId = id | ((long) (32 - (id % 32)) << 32);
+
+        var header = new SearchIndexJournalEntryHeader(factors.length, fullId, new EdgePageDocumentsMetadata(0, 0, id % 5, id, id % 20, (byte) 0).encode());
+
+        long[] data = new long[factors.length*2];
+        for (int i = 0; i < factors.length; i++) {
+            data[2*i] = lexiconService.getOrInsertWord(Integer.toString(factors[i]));
+            data[2*i + 1] = new EdgePageWordMetadata(i, i, i, EnumSet.of(EdgePageWordFlags.Title)).encode();
+        }
+
+        lexiconService.putWords(0, header, new SearchIndexJournalEntry(data));
+    }
+
+    public void loadDataWithDomain(int domain, int id) {
+        int[] factors = IntStream.rangeClosed(1, id).filter(v -> (id % v) == 0).toArray();
+        var header = new SearchIndexJournalEntryHeader(factors.length, id | ((long) domain << 32), EdgePageDocumentsMetadata.defaultValue());
+
+        long[] data = new long[factors.length*2];
+        for (int i = 0; i < factors.length; i++) {
+            data[2*i] = lexiconService.getOrInsertWord(Integer.toString(factors[i]));
+            data[2*i + 1] = new EdgePageWordMetadata(i % 20, i, i, EnumSet.of(EdgePageWordFlags.Title)).encode();
+        }
+
+        lexiconService.putWords(0, header, new SearchIndexJournalEntry(data));
+    }
+
+}
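
A journal entry in these tests is a header plus a flat long[] in which even slots hold lexicon word ids and odd slots hold the encoded per-word metadata; loadData() above builds exactly that shape. A compact sketch of the layout, assuming the same even/odd convention (buildEntryData is a hypothetical helper, not part of the repository):

    // Interleave (wordId, wordMetadata) pairs into the journal's flat format.
    static long[] buildEntryData(long[] wordIds, long[] wordMeta) {
        long[] data = new long[wordIds.length * 2];
        for (int i = 0; i < wordIds.length; i++) {
            data[2*i]     = wordIds[i];   // even slot: lexicon id of the word
            data[2*i + 1] = wordMeta[i];  // odd slot: encoded word metadata
        }
        return data;
    }
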
diff --git a/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/index/service/EdgeIndexIntegrationTestModule.java b/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/index/service/EdgeIndexIntegrationTestModule.java
new file mode 100644
index 00000000..46d8228c
--- /dev/null
+++ b/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/index/service/EdgeIndexIntegrationTestModule.java
@@ -0,0 +1,66 @@
+package nu.marginalia.wmsa.edge.index.service;
+
+import com.google.inject.AbstractModule;
+import com.google.inject.name.Names;
+import nu.marginalia.util.test.TestUtil;
+import nu.marginalia.wmsa.edge.index.IndexServicesFactory;
+import nu.marginalia.wmsa.edge.index.svc.EdgeIndexSearchSetsService;
+import nu.marginalia.wmsa.edge.index.svc.searchset.SearchSetAny;
+import nu.marginalia.wmsa.edge.index.svc.searchset.SearchSetIdentifier;
+import org.mockito.Mockito;
+
+import java.io.IOException;
+import java.nio.file.Files;
+import java.nio.file.Path;
+import java.util.Random;
+
+import static org.mockito.Mockito.when;
+
+public class EdgeIndexIntegrationTestModule extends AbstractModule {
+    Path workDir;
+    Path slowDir;
+    Path fastDir;
+
+    Random random = new Random();
+
+    public EdgeIndexIntegrationTestModule() throws IOException {
+        workDir = Files.createTempDirectory(EdgeIndexIntegrationTest.class.getSimpleName());
+        slowDir = workDir.resolve("slow");
+        fastDir = workDir.resolve("fast");
+
+        Files.createDirectory(slowDir);
+        Files.createDirectory(fastDir);
+    }
+
+    public void cleanUp() {
+        TestUtil.clearTempDir(workDir);
+    }
+
+    @Override
+    protected void configure() {
+
+        try {
+            bind(IndexServicesFactory.class).toInstance(new IndexServicesFactory(Path.of("/tmp"),
+                    slowDir, fastDir,
+                    1L<<24,
+                    null
+            ));
+
+            EdgeIndexSearchSetsService setsServiceMock = Mockito.mock(EdgeIndexSearchSetsService.class);
+            when(setsServiceMock.getSearchSetByName(SearchSetIdentifier.NONE)).thenReturn(new SearchSetAny());
+
+            bind(EdgeIndexSearchSetsService.class).toInstance(setsServiceMock);
+
+            bind(String.class).annotatedWith(Names.named("service-host")).toInstance("127.0.0.1");
+            bind(Integer.class).annotatedWith(Names.named("service-port")).toProvider(this::randomPort);
+        } catch (IOException e) {
+            throw new RuntimeException(e);
+        }
+
+
+    }
+
+    private int randomPort() {
+        return random.nextInt(10000, 30000);
+    }
+}
diff --git a/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/index/service/MultimapFileTest.java b/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/index/service/MultimapFileTest.java
deleted file mode 100644
index 99785031..00000000
--- a/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/index/service/MultimapFileTest.java
+++ /dev/null
@@ -1,255 +0,0 @@
-package nu.marginalia.wmsa.edge.index.service;
-
-import lombok.SneakyThrows;
-import nu.marginalia.util.multimap.MultimapFileLong;
-import org.apache.commons.lang3.ArrayUtils;
-import org.junit.jupiter.api.AfterEach;
-import org.junit.jupiter.api.BeforeEach;
-import org.junit.jupiter.api.Test;
-
-import java.io.File;
-import java.io.IOException;
-import java.io.RandomAccessFile;
-import java.nio.ByteBuffer;
-import java.nio.channels.FileChannel;
-import java.nio.file.Files;
-import java.nio.file.Path;
-
-import static org.junit.jupiter.api.Assertions.assertEquals;
-import static org.junit.jupiter.api.Assertions.assertTrue;
-
-class MultimapFileTest {
-    File tmp;
-    File tmp2;
-
-    @BeforeEach @SneakyThrows
-    public void setUp() {
-
-        tmp = Files.createTempFile("test", "test").toFile();
-        tmp2 = Files.createTempFile("test", "test").toFile();
-
-    }
-    @AfterEach
-    public void tearDown() {
-        tmp.delete();
-        tmp2.delete();
-    }
-
-    @SneakyThrows
-    @Test
-    void transfer() {
-        ByteBuffer buf = ByteBuffer.allocateDirect(77);
-        try (var source = MultimapFileLong.forOutput(tmp.toPath(), 1024);
-             var dest = new MultimapFileLong(tmp, FileChannel.MapMode.READ_WRITE, 1024, 8)
-        ) {
-            for (int i = 0; i < 1024; i++) {
-                source.put(i, i);
-            }
-            source.force();
-            dest.transferFromFileChannel(new RandomAccessFile(tmp, "r").getChannel(), 11, 55, 100);
-            for (int i = 0; i < 45; i++) {
-                System.out.println("source=" + (11+i) + ", dest = " + dest.get(11+i));
-                assertEquals(55+i, dest.get(11+i));
-            }
-        }
-    }
-
-    @SneakyThrows
-    @Test
-    void put() {
-        var file = new MultimapFileLong(new RandomAccessFile(tmp, "rw"), FileChannel.MapMode.READ_WRITE, 32, 8);
-        for (int i = 0; i < 32; i++) {
-            file.put(i, i);
-        }
-        for (int i = 0; i < 32; i++) {
-            assertEquals(i, file.get(i));
-        }
-    }
-
-    @SneakyThrows
-    @Test
-    void read() {
-        var file = new MultimapFileLong(new RandomAccessFile(tmp, "rw"), FileChannel.MapMode.READ_WRITE, 32, 8);
-        for (int i = 0; i < 32; i++) {
-            file.put(i, i);
-        }
-
-        for (int i = 0; i < 32-6; i++) {
-            long[] vals = new long[6];
-            file.read(vals, i);
-            for (int j = 0; j < 6; j++) {
-                assertEquals(i+j, vals[j]);
-            }
-        }
-
-    }
-
-    @Test
-    void write() throws IOException {
-        var file = new MultimapFileLong(new RandomAccessFile(tmp, "rw"), FileChannel.MapMode.READ_WRITE, 32, 8);
-
-        for (int i = 0; i < 32-6; i++) {
-            file.write(new long[] { 0,1,2,3,4,5}, i);
-            for (int j = 0; j < 6; j++) {
-                assertEquals(j, file.get(i+j));
-            }
-        }
-
-    }
-
-    @Test
-    void testQuickSort() throws IOException {
-        var file = new MultimapFileLong(new RandomAccessFile(tmp, "rw"), FileChannel.MapMode.READ_WRITE, 128, 8);
-        var sorter = file.createSorter(Path.of("/tmp"), 16, 2);
-
-        for (int start = 0; start < 8; start+=2) {
-            System.out.println("~");
-            for (int end = start; end < 128; end+=2) {
-                for (int i = 0; i < 128; i+=2) {
-                    file.put(i, -i/2);
-                    file.put(i+1, i/2);
-                }
-                sorter.quickSortLH(start, end);
-                for (int i = start+2; i < end; i+=2) {
-
-                    System.out.println("**" + i);
-                    System.out.println(file.get(i-2));
-                    System.out.println(file.get(i-1));
-                    System.out.println(file.get(i));
-                    System.out.println(file.get(i+1));
-
-                    assertTrue(file.get(i-2) <= file.get(i));
-                    assertEquals(file.get(i+1), -file.get(i));
-                }
-                System.out.println("~");
-            }
-        }
-
-    }
-
-    @Test
-    void testSort() throws IOException {
-        var file = new MultimapFileLong(new RandomAccessFile(tmp, "rw"), FileChannel.MapMode.READ_WRITE, 128, 8);
-        var sorter = file.createSorter(Path.of("/tmp"), 1024, 2);
-
-        long[] values = new long[65536];
-        for (int i = 0; i < values.length; i++) {
-            values[i] = i;
-        }
-        ArrayUtils.shuffle(values);
-
-        int start = 6;
-        System.out.println(start);
-        for (int end = start+2; end < values.length; end+=100) {
-
-            for (long i = 0; i < end+1; i+=2) {
-                file.put(i, values[(int)i/2]);
-                file.put(i+1, i/2);
-            }
-
-
-            file.put(start-2, 100000);
-            file.put(end, 1);
-            sorter.sortRange(start, end);
-
-            for (int i = start+2; i < end; i+=2) {
-                assertTrue(file.get(i-2) < file.get(i));
-            }
-
-            assertEquals(100000, file.get(start-2));
-            assertEquals(1, file.get(end));
-        }
-
-    }
-
-    @Test
-    void testInsertionSort() throws IOException {
-        var file = new MultimapFileLong(new RandomAccessFile(tmp, "rw"), FileChannel.MapMode.READ_WRITE, 128, 8);
-        var sorter = file.createSorter(Path.of("/tmp"), 16, 2);
-
-        for (int start = 2; start < 8; start+=2) {
-            for (int end = start+2; end < 126; end+=2) {
-                for (int i = 0; i < 128; i+=2) {
-                    file.put(i, -(128-i/2));
-                    file.put(i+1, (128-i/2));
-                }
-                file.put(0, 0xFFFF_FFFFL);
-                file.put(end, 0x7FFF_FFFFL);
-                sorter.insertionSort(start, (end - start)/2);
-                assertEquals(0xFFFF_FFFFL, file.get(0));
-                assertEquals(file.get(end), 0x7FFF_FFFFL);
-                for (int i = start+2; i < end; i+=2) {
-                    assertTrue(file.get(i-2) <= file.get(i));
-                    assertEquals(file.get(i+1), -file.get(i));
-                }
-            }
-        }
-    }
-
-    @Test
-    void testMergeSort() throws IOException {
-        var file = new MultimapFileLong(new RandomAccessFile(tmp, "rw"), FileChannel.MapMode.READ_WRITE, 128, 8);
-        var sorter = file.createSorter(Path.of("/tmp"), 16, 2);
-
-        for (int start = 0; start < 512; start+=18) {
-            System.out.println(start);
-            for (int end = start+2; end < 8192; end+=68) {
-                for (int i = 0; i < 8192; i+=2) {
-                    file.put(i, -i/2);
-                    file.put(i+1, i/2);
-                }
-                sorter.mergeSort(start, end-start);
-
-                assertEquals(file.get(start+1), -file.get(start));
-                for (int i = start+2; i < end; i+=2) {
-//                    System.out.println(file.get(i-2) + "," + file.get(i));
-                    assertTrue(file.get(i-2) <= file.get(i));
-
-//                    System.out.println(file.get(i+1) + ":" + -file.get(i));
-                    assertEquals(file.get(i+1), -file.get(i));
-                }
-            }
-        }
-    }
-
-    @Test
-    void sortInternal() throws IOException {
-        var file = new MultimapFileLong(new RandomAccessFile(tmp, "rw"), FileChannel.MapMode.READ_WRITE, 32, 8);
-        var sorter = file.createSorter(Path.of("/tmp"), 16, 1);
-        var searcher = file.createSearcher();
-        for (int i = 0; i < 32; i++) {
-            file.put(i, 32-i);
-        }
-
-        sorter.sortRange( 2, 16);
-
-        for (int i = 2+1; i < 16; i++) {
-            assertTrue(file.get(i) > file.get(i-1));
-            assertTrue(searcher.binarySearchTest(file.get(i), 2, 16));
-        }
-    }
-
-    @Test
-    void sortExternal() throws IOException {
-        var file = new MultimapFileLong(new RandomAccessFile(tmp, "rw"), FileChannel.MapMode.READ_WRITE, 32, 8);
-        var sorter = file.createSorter(Path.of("/tmp"), 2, 1);
-        var searcher = file.createSearcher();
-
-        for (int i = 0; i < 32; i++) {
-            file.put(i, 32-i);
-        }
-
-        sorter.sortRange( 2, 16);
-        file.force();
-
-        for (int i = 2+1; i < 16; i++) {
-            assertTrue(file.get(i) > file.get(i-1));
-            assertTrue(searcher.binarySearchTest(file.get(i), 2, 16));
-        }
-    }
-
-
-    @Test
-    void close() {
-    }
-}
\ No newline at end of file
diff --git a/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/index/service/SearchIndexJournalWriterTest.java b/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/index/service/SearchIndexJournalWriterTest.java
deleted file mode 100644
index aaa072e1..00000000
--- a/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/index/service/SearchIndexJournalWriterTest.java
+++ /dev/null
@@ -1,163 +0,0 @@
-package nu.marginalia.wmsa.edge.index.service;
-
-import lombok.SneakyThrows;
-import nu.marginalia.util.dict.DictionaryHashMap;
-import nu.marginalia.util.multimap.MultimapFileLong;
-import nu.marginalia.wmsa.edge.index.conversion.SearchIndexConverter;
-import nu.marginalia.wmsa.edge.index.conversion.SearchIndexPartitioner;
-import nu.marginalia.wmsa.edge.index.journal.SearchIndexJournalReader;
-import nu.marginalia.wmsa.edge.index.journal.SearchIndexJournalWriterImpl;
-import nu.marginalia.wmsa.edge.index.journal.model.SearchIndexJournalEntry;
-import nu.marginalia.wmsa.edge.index.journal.model.SearchIndexJournalEntryHeader;
-import nu.marginalia.wmsa.edge.index.lexicon.KeywordLexicon;
-import nu.marginalia.wmsa.edge.index.lexicon.journal.KeywordLexiconJournal;
-import nu.marginalia.wmsa.edge.index.model.IndexBlock;
-import nu.marginalia.wmsa.edge.index.reader.SearchIndex;
-import org.junit.jupiter.api.AfterEach;
-import org.junit.jupiter.api.BeforeEach;
-import org.junit.jupiter.api.Test;
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
-
-import java.io.IOException;
-import java.nio.file.Files;
-import java.nio.file.Path;
-import java.util.List;
-
-class SearchIndexJournalWriterTest {
-    KeywordLexicon keywordLexicon;
-    SearchIndexJournalWriterImpl writer;
-
-    Path indexFile;
-    Path wordsFile1;
-    Path urlsFile1;
-    Path dictionaryFile;
-
-    private final Logger logger = LoggerFactory.getLogger(getClass());
-
-    @BeforeEach @SneakyThrows
-    void setUp() {
-        dictionaryFile = Files.createTempFile("tmp", ".dict");
-        dictionaryFile.toFile().deleteOnExit();
-
-        keywordLexicon = new KeywordLexicon(new KeywordLexiconJournal(dictionaryFile.toFile()), new DictionaryHashMap(1L<<16));
-
-        indexFile = Files.createTempFile("tmp", ".idx");
-        indexFile.toFile().deleteOnExit();
-        writer = new SearchIndexJournalWriterImpl(keywordLexicon, indexFile.toFile());
-
-        wordsFile1 = Files.createTempFile("words1", ".idx");
-        urlsFile1 = Files.createTempFile("urls1", ".idx");
-    }
-
-    @SneakyThrows
-    @AfterEach
-    void tearDown() {
-        keywordLexicon.close();
-        writer.close();
-        indexFile.toFile().delete();
-        dictionaryFile.toFile().delete();
-        urlsFile1.toFile().delete();
-        wordsFile1.toFile().delete();
-    }
-
-    @Test
-    void put() throws IOException, InterruptedException {
-
-        for (int i = 0; i < 512; i++) {
-            if (i % 2 == 0) {
-                writer.put(new SearchIndexJournalEntryHeader(4, i, IndexBlock.Words_1),
-                        new SearchIndexJournalEntry(new long[]{keywordLexicon.getOrInsert("one"),
-                                0x000000,
-                                keywordLexicon.getOrInsert("two"),
-                                0xFFFFFF}));
-            }
-            else {
-                writer.put(new SearchIndexJournalEntryHeader(2, i, IndexBlock.Words_1),
-                        new SearchIndexJournalEntry(new long[]{keywordLexicon.getOrInsert("one"),
-                                0x000000}));
-            }
-        }
-        keywordLexicon.commitToDisk();
-        Thread.sleep(1000);
-        writer.forceWrite();
-
-        var reader = new SearchIndexJournalReader(MultimapFileLong.forReading(indexFile));
-
-        for (var entry : reader) {
-            logger.info("{}, {} {}", entry, entry.urlId(), entry.domainId());
-            for (var record : entry.readEntry()) {
-                logger.info("{}", record);
-            }
-        }
-
-        new SearchIndexConverter(IndexBlock.Words_1, 7, Path.of("/tmp"),
-                indexFile.toFile(),
-                wordsFile1.toFile(),
-                urlsFile1.toFile(),
-                new SearchIndexPartitioner(null), (url) -> false)
-                .convert();
-
-        MultimapFileLong mmf = MultimapFileLong.forReading(urlsFile1);
-        for (int i = 0; i < 1056; i++) {
-            System.out.println(i + ":" + mmf.get(i));
-        }
-        try (var idx = new SearchIndex("test", urlsFile1.toFile(), wordsFile1.toFile())) {
-            for (String s : List.of("one", "two", "3")) {
-                System.out.println("***" + s);
-                var range = idx.rangeForWord(keywordLexicon.getOrInsert(s));
-                System.out.println(range);
-
-                System.out.println(1 + "? " + range.hasUrl(1));
-                System.out.println(2 + "? " + range.hasUrl(2));
-
-                var source = range.asEntrySource();
-                System.out.println(source);
-
-            }
-
-        } catch (Exception e) {
-            throw new RuntimeException(e);
-        }
-
-    }
-
-    @Test
-    void testWeirdScenario() throws IOException, InterruptedException {
-        long[] vals = new long[]{3818531806586L, 1696527885824L, 3818531806586L, 1679348016640L, 3818531806611L, 1168242909952L, 3818531806611L, 1168242909952L, 4316748027839L, 549761847552L, 47240643248522L, 285873040601600L, 51101820141195L, 1099517497600L, 51101820141295L, 549762863360L};
-
-        for (int v = 0; v < vals.length / 2; v++) {
-            writer.put(new SearchIndexJournalEntryHeader(4, vals[v * 2], IndexBlock.Words_1),
-                    new SearchIndexJournalEntry(new long[]{keywordLexicon.getOrInsert("one"), vals[v * 2 + 1]}));
-        }
-
-        keywordLexicon.commitToDisk();
-        Thread.sleep(1000);
-        writer.forceWrite();
-
-        var reader = new SearchIndexJournalReader(MultimapFileLong.forReading(indexFile));
-
-        for (var entry : reader) {
-            logger.info("{}, {} {}", entry, entry.urlId(), entry.domainId());
-            for (var record : entry.readEntry()) {
-                logger.info("{}", record);
-            }
-        }
-
-        new SearchIndexConverter(IndexBlock.Words_1, 7, Path.of("/tmp"),
-                indexFile.toFile(),
-                wordsFile1.toFile(),
-                urlsFile1.toFile(),
-                new SearchIndexPartitioner(null), (url) -> false)
-                .convert();
-
-        try (var idx = new SearchIndex("test", urlsFile1.toFile(), wordsFile1.toFile())) {
-            var range = idx.rangeForWord(keywordLexicon.getOrInsert("one"));
-            long[] buffer = new long[128];
-
-        }
-        catch (Exception ex) { ex.printStackTrace(); }
-
-    }
-
-}
\ No newline at end of file
diff --git a/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/index/service/util/ByteFolderTest.java b/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/index/service/util/ByteFolderTest.java
deleted file mode 100644
index 2fc21ac1..00000000
--- a/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/index/service/util/ByteFolderTest.java
+++ /dev/null
@@ -1,35 +0,0 @@
-package nu.marginalia.wmsa.edge.index.service.util;
-
-import nu.marginalia.util.ByteFolder;
-import org.junit.jupiter.api.Test;
-
-import static nu.marginalia.util.ByteFolder.decodeBytes;
-import static org.junit.jupiter.api.Assertions.*;
-
-class ByteFolderTest {
-
-    @Test
-    void foldBytes() {
-        ByteFolder folder = new ByteFolder();
-        // Edge cases
-        assertArrayEquals(new byte[]{1,0}, folder.foldBytes(0,0));
-        assertArrayEquals(new int[]{Integer.MAX_VALUE-1,Integer.MAX_VALUE}, decodeBytes(folder.foldBytes(Integer.MAX_VALUE-1,Integer.MAX_VALUE)));
-        assertArrayEquals(new int[]{128, 1}, decodeBytes(folder.foldBytes(128,1)));
-
-        // 1 byte boundary
-        for (int i = 0; i < 512; i++) {
-            for (int j = 0; j < 512; j++) {
-                assertArrayEquals(new int[]{i,j}, decodeBytes(folder.foldBytes(i,j)), "Discrepancy @ " + i + " ," + j );
-            }
-        }
-
-        // Scattershot
-        for (int i = 0; i < 1_000_000; i++) {
-            int p = (int) (Integer.MAX_VALUE * Math.random());
-            int q = (int) (Integer.MAX_VALUE * Math.random());
-            assertArrayEquals(new int[]{q,p}, decodeBytes(folder.foldBytes(q,p)), "Discrepancy @ " + q + " ," + p );
-        }
-
-    }
-
-}
\ No newline at end of file
diff --git a/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/index/service/util/RandomWriteFunnelTest.java b/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/index/service/util/RandomWriteFunnelTest.java
index 8e58b117..deb5c992 100644
--- a/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/index/service/util/RandomWriteFunnelTest.java
+++ b/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/index/service/util/RandomWriteFunnelTest.java
@@ -15,7 +15,7 @@ class RandomWriteFunnelTest {
     @Test
     public void test() {
         new File("/tmp/test.bin").delete();
-        try (var funnel = new RandomWriteFunnel(Path.of("/tmp"), 10_000, 5001);
+        try (var funnel = new RandomWriteFunnel(Path.of("/tmp"), 5001);
             var out = new RandomAccessFile("/tmp/test.bin", "rw")) {
             for (int i = 10_000-1; i >= 0; i--) {
                 System.out.println(i);
@@ -40,7 +40,7 @@ class RandomWriteFunnelTest {
     public void testSparse() {
         new File("/tmp/test.bin").delete();
        for (int j = 1; j <= 20; j++) {
-            try (var funnel = new RandomWriteFunnel(Path.of("/tmp"), 10, j);
+            try (var funnel = new RandomWriteFunnel(Path.of("/tmp"), j);
                 var out = new RandomAccessFile("/tmp/test.bin", "rw")) {
                 for (int i = 10 - 1; i >= 0; i -= 2) {
                     funnel.put(i, 10 - i);
@@ -73,7 +73,7 @@ class RandomWriteFunnelTest {
     public void testYuge() {
         new File("/tmp/test.bin").delete();
        for (int j = 1; j <= 20; j++) {
-            try (var funnel = new RandomWriteFunnel(Path.of("/tmp"), 10, j);
+            try (var funnel = new RandomWriteFunnel(Path.of("/tmp"), j);
                 var out = new RandomAccessFile("/tmp/test.bin", "rw")) {
                 for (int i = 10 - 1; i >= 0; i -= 2) {
                     funnel.put(i, Long.MAX_VALUE - i);
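
The RandomWriteFunnel hunks above drop the up-front item-count argument, so construction now takes only a scratch directory and a buffer size. A usage sketch against the new signature, as far as the updated call sites show it (the final write call is an assumption about the rest of the class, which this diff does not touch):

    try (var funnel = new RandomWriteFunnel(Path.of("/tmp"), 5001);
         var out = new RandomAccessFile("/tmp/test.bin", "rw")) {
        funnel.put(9, 1);    // out-of-order (position, value) writes are buffered...
        funnel.put(0, 10);
        funnel.write(out.getChannel());  // ...and assumed flushed sequentially at the end
    }
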
diff --git a/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/index/svc/query/IndexQueryTest.java b/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/index/svc/query/IndexQueryTest.java
deleted file mode 100644
index 7290a01a..00000000
--- a/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/index/svc/query/IndexQueryTest.java
+++ /dev/null
@@ -1,218 +0,0 @@
-package nu.marginalia.wmsa.edge.index.svc.query;
-
-import nu.marginalia.util.btree.BTreeQueryBuffer;
-import nu.marginalia.util.btree.BTreeWriter;
-import nu.marginalia.util.multimap.MultimapFileLong;
-import nu.marginalia.wmsa.edge.index.conversion.SearchIndexConverter;
-import nu.marginalia.wmsa.edge.index.reader.SearchIndexURLRange;
-import nu.marginalia.wmsa.edge.index.svc.query.types.filter.QueryFilterAnyOf;
-import nu.marginalia.wmsa.edge.index.svc.query.types.filter.QueryFilterBTreeRangeReject;
-import nu.marginalia.wmsa.edge.index.svc.query.types.filter.QueryFilterBTreeRangeRetain;
-import org.junit.jupiter.api.AfterAll;
-import org.junit.jupiter.api.BeforeAll;
-import org.junit.jupiter.api.Test;
-
-import java.io.IOException;
-import java.nio.file.Files;
-import java.nio.file.Path;
-import java.util.Arrays;
-import java.util.List;
-import java.util.stream.LongStream;
-
-import static org.junit.jupiter.api.Assertions.*;
-
-class IndexQueryTest {
-    static Path file;
-
-    static long threesOffset;
-    static long fivesOffset;
-    static long sevensOffset;
-    static long smallSeventeenOffset;
-
-    // sz should be large enough to ensure the tree has multiple layers to shake out bugs
-    static int sz = 128*512*512*2;
-
-    static MultimapFileLong mmf;
-    @BeforeAll
-    static void setUpAll() throws IOException {
-        file = Files.createTempFile(IndexQueryTest.class.getSimpleName(), ".dat");
-
-        try (var mmf = MultimapFileLong.forOutput(file, 10_000_000)) {
-            var bTreeWriter = new BTreeWriter(mmf, SearchIndexConverter.urlsBTreeContext);
-
-            threesOffset = 0;
-            long written = 0;
-            written = bTreeWriter.write(0, sz / 2, w -> {
-                for (int i = 0; i < sz; i+=2) {
-                    w.put(i, 3L*(i/2));
-                    w.put(i+1, i/2);
-                }
-            });
-
-            fivesOffset += written;
-            sevensOffset += written;
-            smallSeventeenOffset += written;
-
-            written = bTreeWriter.write(fivesOffset, sz/2, w -> {
-                for (int i = 0; i < sz; i+=2) {
-                    w.put(i, 5L*(i/2));
-                    w.put(i+1, (i/2));
-                }
-            });
-
-            sevensOffset += written;
-            smallSeventeenOffset += written;
-
-            written = bTreeWriter.write(sevensOffset, sz / 2, w -> {
-                for (int i = 0; i < sz; i+=2) {
-                    w.put(i, 7L*(i/2));
-                    w.put(i+1, (i/2));
-                }
-            });
-
-            smallSeventeenOffset += written;
-
-            written = bTreeWriter.write(smallSeventeenOffset, 100, w -> {
-                for (int i = 0; i < 200; i+=2) {
-                    w.put(i, 17L*(i/2));
-                    w.put(i+1, (i/2));
-                }
-            });
-        }
-
-        mmf = MultimapFileLong.forReading(file);
-
-
-    }
-
-    public SearchIndexURLRange threesRange() {
-        return new SearchIndexURLRange(mmf, threesOffset);
-    }
-    public SearchIndexURLRange fivesRange() {
-        return new SearchIndexURLRange(mmf, fivesOffset);
-    }
-    public SearchIndexURLRange sevensRange() {
-        return new SearchIndexURLRange(mmf, sevensOffset);
-    }
-    public SearchIndexURLRange seventeensRange() {
-        return new SearchIndexURLRange(mmf, smallSeventeenOffset);
-    }
-
-    @AfterAll
-    static void tearDownAll() throws IOException {
-        mmf.close();
-        Files.deleteIfExists(file);
-    }
-
-    @Test
-    public void testMergeRanges() {
-        BTreeQueryBuffer buffer = new BTreeQueryBuffer(300);
-
-        IndexQuery query = new IndexQuery(List.of(seventeensRange().asEntrySource(), threesRange().asEntrySource()));
-
-        /** Read from 17s range */
-
-        // 17s range is shorter and should read fully in one go
-
-        query.getMoreResults(buffer);
-        assertFalse(buffer.isEmpty());
-        assertArrayEquals(LongStream.range(0, 100).map(l -> l*17).toArray(), buffer.copyData());
-
-        /** Read from 3s range */
-
-        assertTrue(query.hasMore());
-        query.getMoreResults(buffer);
-        assertArrayEquals(LongStream.range(0, 150).map(l -> l*3).toArray(), buffer.copyData());
-
-        /** Ensure 3s range is not flagged as finished */
-
-        assertFalse(buffer.isEmpty());
-        assertTrue(query.hasMore());
-    }
-
-    @Test
-    public void test() {
-        BTreeQueryBuffer buffer = new BTreeQueryBuffer(300);
-
-        IndexQuery query = new IndexQuery(List.of(threesRange().asPrefixSource(102, 200)));
-
-        /** Read from 3s range */
-
-        query.getMoreResults(buffer);
-        System.out.println(Arrays.toString(buffer.copyData()));
-        assertFalse(buffer.isEmpty());
-        assertArrayEquals(LongStream.range(100, 200).filter(v -> (v % 3) == 0).toArray(), buffer.copyData());
-
-    }
-
-    @Test
-    public void testInclude() {
-        BTreeQueryBuffer buffer = new BTreeQueryBuffer(300);
-
-        /** Set up filters */
-        var es = threesRange().asEntrySource();
-        es.skip(10000);
-        IndexQuery query = new IndexQuery(List.of(es));
-
-        query.addInclusionFilter(new QueryFilterBTreeRangeRetain(fivesRange()));
-        query.addInclusionFilter(new QueryFilterBTreeRangeRetain(sevensRange()));
-
-        /** Do it */
-        query.getMoreResults(buffer);
-        assertArrayEquals(LongStream.range(10000, 10150)
-                .map(l -> l*3)
-                .filter(l -> (l % 5) == 0)
-                .filter(l -> (l % 7) == 0)
-                .toArray(), buffer.copyData());
-    }
-
-    @Test
-    public void testIncludeReject() {
-        BTreeQueryBuffer buffer = new BTreeQueryBuffer(300);
-
-        /** Set up filters */
-        var es = threesRange().asEntrySource();
-        es.skip(10000);
-        IndexQuery query = new IndexQuery(List.of(es));
-
-        query.addInclusionFilter(new QueryFilterBTreeRangeRetain(fivesRange()));
-        query.addInclusionFilter(new QueryFilterBTreeRangeReject(sevensRange()));
-
-        /** Do it */
-        query.getMoreResults(buffer);
-        assertArrayEquals(LongStream.range(10000, 10150)
-                .map(l -> l*3)
-                .filter(l -> (l % 5) == 0)
-                .filter(l -> (l % 7) != 0)
-                .toArray(), buffer.copyData());
-    }
-
-
-    @Test
-    public void testIncludeEither() {
-        BTreeQueryBuffer buffer = new BTreeQueryBuffer(300);
-
-        /** Set up filters */
-        var es = threesRange().asEntrySource();
-        es.skip(10000);
-        IndexQuery query = new IndexQuery(List.of(es));
-        query.addInclusionFilter(new QueryFilterAnyOf(
-                List.of(new QueryFilterBTreeRangeRetain(fivesRange()),
-                        new QueryFilterBTreeRangeRetain(sevensRange()))));
-
-        /** Do it */
-        query.getMoreResults(buffer);
-        assertArrayEquals(LongStream.range(10000, 10150)
-                .map(l -> l*3)
-                .filter(l -> (l % 5) == 0 || (l % 7) == 0)
-                .toArray(), buffer.copyData());
-    }
-
-    @Test
-    public void testLoadMeta() {
-        long[] data = new long[] { 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13 };
-        threesRange().getMetadata(data);
-        System.out.println(Arrays.toString(data));
-
-    }
-}
\ No newline at end of file
assertEquals("endless.horse", domain.domain.domain); + assertEquals("horse", domain.domain.getTld()); } @Test @@ -30,6 +40,7 @@ class EdgeDomainTest { assertEquals("http", domain.proto); assertEquals("", domain.domain.subDomain); assertEquals("uj.edu.pl", domain.domain.domain); + assertEquals("edu.pl", domain.domain.getTld()); } @@ -40,6 +51,7 @@ class EdgeDomainTest { assertEquals("www", domain.domain.subDomain); assertEquals("marginalia.nu", domain.domain.domain); assertEquals("http://www.marginalia.nu/", domain.toString()); + assertEquals("nu", domain.domain.getTld()); } @Test @@ -49,6 +61,7 @@ class EdgeDomainTest { assertEquals("http", domain.proto); assertEquals("", domain.domain.subDomain); assertEquals("http://marginalia.co.uk/", domain.toString()); + assertEquals("co.uk", domain.domain.getTld()); } @Test @@ -58,6 +71,7 @@ class EdgeDomainTest { assertEquals("http", domain.proto); assertEquals("", domain.domain.subDomain); assertEquals("http://withcandour.co.uk/", domain.toString()); + assertEquals("co.uk", domain.domain.getTld()); } @Test @@ -75,6 +89,7 @@ class EdgeDomainTest { assertEquals("http", domain.proto); assertEquals("abcf.de", domain.domain.domain); assertEquals("www.marginalia", domain.domain.subDomain); + assertEquals("de", domain.domain.getTld()); } @Test @@ -84,6 +99,7 @@ class EdgeDomainTest { assertEquals("", domain.domain.subDomain); assertEquals("marginalia.nu", domain.domain.domain); assertEquals("http://marginalia.nu/", domain.toString()); + assertEquals("nu", domain.domain.getTld()); } @Test @@ -93,6 +109,7 @@ class EdgeDomainTest { assertEquals("", domain.domain.subDomain); assertEquals("127.0.0.1", domain.domain.domain); assertEquals("https://127.0.0.1:8080/", domain.toString()); + assertEquals("IP", domain.domain.getTld()); } @Test @@ -102,5 +119,6 @@ class EdgeDomainTest { assertEquals("", domain.domain.subDomain); assertEquals("192.168.1.32", domain.domain.domain); assertEquals("https://192.168.1.32/", domain.toString()); + assertEquals("IP", domain.domain.getTld()); } } \ No newline at end of file diff --git a/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/model/crawl/EdgePageWordMetadataTest.java b/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/model/crawl/EdgePageWordMetadataTest.java index ecb182f4..f3cbfa77 100644 --- a/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/model/crawl/EdgePageWordMetadataTest.java +++ b/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/model/crawl/EdgePageWordMetadataTest.java @@ -12,16 +12,16 @@ class EdgePageWordMetadataTest { @Test public void codecTest() { - verifyCodec("Vanilla case", new EdgePageWordMetadata(32, 0x7f0f0000, 5, 1, EnumSet.allOf(EdgePageWordFlags.class))); - verifyCodec("Position high", new EdgePageWordMetadata(32, 0xff0f0000, 5, 1, EnumSet.allOf(EdgePageWordFlags.class))); - verifyCodec("No flags", new EdgePageWordMetadata(32, 0xff0f0000, 5, 1, EnumSet.noneOf(EdgePageWordFlags.class))); - System.out.println(new EdgePageWordMetadata(32, 0x7f0f0005, 5, 1, EnumSet.allOf(EdgePageWordFlags.class))); - System.out.println(new EdgePageWordMetadata(32, 0xff0f0013, 5, 1, EnumSet.noneOf(EdgePageWordFlags.class))); + verifyCodec("Vanilla case", new EdgePageWordMetadata(32, 0x7f0f0000, 1, EnumSet.allOf(EdgePageWordFlags.class))); + verifyCodec("Position high", new EdgePageWordMetadata(32, 0xff0f0000, 1, EnumSet.allOf(EdgePageWordFlags.class))); + verifyCodec("No flags", new EdgePageWordMetadata(32, 0xff0f0000, 1, EnumSet.noneOf(EdgePageWordFlags.class))); + System.out.println(new 
diff --git a/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/model/crawl/EdgePageWordMetadataTest.java b/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/model/crawl/EdgePageWordMetadataTest.java
index ecb182f4..f3cbfa77 100644
--- a/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/model/crawl/EdgePageWordMetadataTest.java
+++ b/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/model/crawl/EdgePageWordMetadataTest.java
@@ -12,16 +12,16 @@ class EdgePageWordMetadataTest {
 
     @Test
     public void codecTest() {
-        verifyCodec("Vanilla case", new EdgePageWordMetadata(32, 0x7f0f0000, 5, 1, EnumSet.allOf(EdgePageWordFlags.class)));
-        verifyCodec("Position high", new EdgePageWordMetadata(32, 0xff0f0000, 5, 1, EnumSet.allOf(EdgePageWordFlags.class)));
-        verifyCodec("No flags", new EdgePageWordMetadata(32, 0xff0f0000, 5, 1, EnumSet.noneOf(EdgePageWordFlags.class)));
-        System.out.println(new EdgePageWordMetadata(32, 0x7f0f0005, 5, 1, EnumSet.allOf(EdgePageWordFlags.class)));
-        System.out.println(new EdgePageWordMetadata(32, 0xff0f0013, 5, 1, EnumSet.noneOf(EdgePageWordFlags.class)));
+        verifyCodec("Vanilla case", new EdgePageWordMetadata(32, 0x7f0f0000, 1, EnumSet.allOf(EdgePageWordFlags.class)));
+        verifyCodec("Position high", new EdgePageWordMetadata(32, 0xff0f0000, 1, EnumSet.allOf(EdgePageWordFlags.class)));
+        verifyCodec("No flags", new EdgePageWordMetadata(32, 0xff0f0000, 1, EnumSet.noneOf(EdgePageWordFlags.class)));
+        System.out.println(new EdgePageWordMetadata(32, 0x7f0f0005, 1, EnumSet.allOf(EdgePageWordFlags.class)));
+        System.out.println(new EdgePageWordMetadata(32, 0xff0f0013, 1, EnumSet.noneOf(EdgePageWordFlags.class)));
     }
 
     @Test
     public void testClampTfIdfLow() {
-        var original = new EdgePageWordMetadata(0x8000FFFF, 0, 5, 1, EnumSet.noneOf(EdgePageWordFlags.class));
+        var original = new EdgePageWordMetadata(0x8000FFFF, 0, 1, EnumSet.noneOf(EdgePageWordFlags.class));
         var encoded = new EdgePageWordMetadata(original.encode());
 
         assertEquals(original.positions(), encoded.positions());
@@ -30,7 +30,7 @@ class EdgePageWordMetadataTest {
 
     @Test
     public void testClampTfIdfHigh() {
-        var original = new EdgePageWordMetadata(0x7000FFFF, 0, 5, 1, EnumSet.noneOf(EdgePageWordFlags.class));
+        var original = new EdgePageWordMetadata(0x7000FFFF, 0, 1, EnumSet.noneOf(EdgePageWordFlags.class));
         var encoded = new EdgePageWordMetadata(original.encode());
 
         assertEquals(original.positions(), encoded.positions());
@@ -39,7 +39,7 @@ class EdgePageWordMetadataTest {
 
     @Test
     public void testClampCountLow() {
-        var original = new EdgePageWordMetadata(40, 0, 5, -1, EnumSet.noneOf(EdgePageWordFlags.class));
+        var original = new EdgePageWordMetadata(40, 0, -1, EnumSet.noneOf(EdgePageWordFlags.class));
         var encoded = new EdgePageWordMetadata(original.encode());
 
         assertEquals(original.positions(), encoded.positions());
@@ -48,7 +48,7 @@ class EdgePageWordMetadataTest {
 
     @Test
     public void testClampCountHigh() {
-        var original = new EdgePageWordMetadata(40, 0, 5, 17, EnumSet.noneOf(EdgePageWordFlags.class));
+        var original = new EdgePageWordMetadata(40, 0, 17, EnumSet.noneOf(EdgePageWordFlags.class));
         var encoded = new EdgePageWordMetadata(original.encode());
 
         assertEquals(original.positions(), encoded.positions());
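
The EdgePageWordMetadata hunks above remove one constructor argument, going from five fields to four; encode() still folds everything into a single long, and the clamp tests pin that out-of-range tf-idf and count values saturate rather than overflow into neighboring fields. A schematic of this style of codec with invented field widths (the real layout is defined by EdgePageWordMetadata, not here):

    // Illustrative only: pack count (4 bits), flags (8 bits), clamped tf-idf
    // (16 bits) and a positions mask (32 bits) into one long, low to high.
    static long encode(int tfIdf, int positions, int count, int flags) {
        long clampedTfIdf = Math.max(0, Math.min(0xFFFF, tfIdf)); // saturate, as the clamp tests expect
        long clampedCount = Math.max(0, Math.min(0xF, count));
        return clampedCount
             | ((long) (flags & 0xFF) << 4)
             | (clampedTfIdf << 12)
             | (Integer.toUnsignedLong(positions) << 28);
    }
    static int positionsOf(long meta) { return (int) (meta >>> 28); }
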
diff --git a/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/search/query/QueryFactoryTest.java b/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/search/query/QueryFactoryTest.java
new file mode 100644
index 00000000..987c258d
--- /dev/null
+++ b/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/search/query/QueryFactoryTest.java
@@ -0,0 +1,127 @@
+package nu.marginalia.wmsa.edge.search.query;
+
+import nu.marginalia.wmsa.configuration.WmsaHome;
+import nu.marginalia.wmsa.edge.assistant.dict.NGramBloomFilter;
+import nu.marginalia.wmsa.edge.assistant.dict.TermFrequencyDict;
+import nu.marginalia.wmsa.edge.model.search.EdgeSearchSpecification;
+import nu.marginalia.wmsa.edge.model.search.domain.SpecificationLimitType;
+import nu.marginalia.wmsa.edge.search.command.SearchJsParameter;
+import nu.marginalia.wmsa.edge.search.model.EdgeSearchProfile;
+import nu.marginalia.wmsa.edge.search.query.model.EdgeUserSearchParameters;
+import nu.marginalia.wmsa.edge.search.valuation.SearchResultValuator;
+import org.junit.jupiter.api.BeforeAll;
+import org.junit.jupiter.api.Test;
+
+import java.io.IOException;
+import java.util.List;
+
+import static org.junit.jupiter.api.Assertions.assertEquals;
+
+public class QueryFactoryTest {
+
+    static QueryFactory queryFactory;
+
+    @BeforeAll
+    public static void setUpAll() throws IOException {
+
+        var lm = WmsaHome.getLanguageModels();
+        var tfd = new TermFrequencyDict(lm);
+
+        queryFactory = new QueryFactory(lm,
+                tfd,
+                new EnglishDictionary(tfd),
+                new NGramBloomFilter(lm),
+                new SearchResultValuator(tfd),
+                null
+        );
+    }
+
+    public EdgeSearchSpecification parseAndGetSpecs(String query) {
+        return queryFactory.createQuery(
+                new EdgeUserSearchParameters(query, EdgeSearchProfile.CORPO, SearchJsParameter.DEFAULT)
+        ).specs;
+    }
+
+    @Test
+    public void testParseNoSpecials() {
+        var year = parseAndGetSpecs("in the year 2000").year;
+        var size = parseAndGetSpecs("in the year 2000").size;
+        var quality = parseAndGetSpecs("in the year 2000").quality;
+
+        assertEquals(SpecificationLimitType.NONE, year.type());
+        assertEquals(SpecificationLimitType.NONE, size.type());
+        assertEquals(SpecificationLimitType.NONE, quality.type());
+    }
+
+
+    @Test
+    public void testParseYearEq() {
+        var year = parseAndGetSpecs("year=2000").year;
+        assertEquals(SpecificationLimitType.EQUALS, year.type());
+        assertEquals(2000, year.value());
+    }
+
+    @Test
+    public void testParseYearLt() {
+        var year = parseAndGetSpecs("year<2000").year;
+        assertEquals(SpecificationLimitType.LESS_THAN, year.type());
+        assertEquals(2000, year.value());
+    }
+
+    @Test
+    public void testParseYearGt() {
+        var year = parseAndGetSpecs("year>2000").year;
+        assertEquals(SpecificationLimitType.GREATER_THAN, year.type());
+        assertEquals(2000, year.value());
+    }
+
+    @Test
+    public void testParseSizeEq() {
+        var size = parseAndGetSpecs("size=2000").size;
+        assertEquals(SpecificationLimitType.EQUALS, size.type());
+        assertEquals(2000, size.value());
+    }
+
+    @Test
+    public void testParseSizeLt() {
+        var size = parseAndGetSpecs("size<2000").size;
+        assertEquals(SpecificationLimitType.LESS_THAN, size.type());
+        assertEquals(2000, size.value());
+    }
+
+    @Test
+    public void testParseSizeGt() {
+        var size = parseAndGetSpecs("size>2000").size;
+        assertEquals(SpecificationLimitType.GREATER_THAN, size.type());
+        assertEquals(2000, size.value());
+    }
+
+
+    @Test
+    public void testParseQualityEq() {
+        var quality = parseAndGetSpecs("q=2000").quality;
+        assertEquals(SpecificationLimitType.EQUALS, quality.type());
+        assertEquals(2000, quality.value());
+    }
+
+    @Test
+    public void testParseQualityLt() {
+        var quality = parseAndGetSpecs("q<2000").quality;
+        assertEquals(SpecificationLimitType.LESS_THAN, quality.type());
+        assertEquals(2000, quality.value());
+    }
+
+    @Test
+    public void testParseQualityGt() {
+        var quality = parseAndGetSpecs("q>2000").quality;
+        assertEquals(SpecificationLimitType.GREATER_THAN, quality.type());
+        assertEquals(2000, quality.value());
+    }
+
+    @Test
+    public void testPriorityTerm() {
+        var subquery = parseAndGetSpecs("physics ?tld:edu").subqueries.iterator().next();
+        assertEquals(List.of("tld:edu"), subquery.searchTermsPriority);
+        assertEquals(List.of("physics"), subquery.searchTermsInclude);
+    }
+}
\ No newline at end of file
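
QueryFactoryTest pins down the year/size/q operators: "=", "<" and ">" map to EQUALS, LESS_THAN and GREATER_THAN, and their absence maps to NONE. A minimal sketch of such an operator parse; this is a hypothetical helper, not the project's parser, and the lessThan/greaterThan factory names are assumed to mirror the equals()/none() factories seen in this diff:

    // Sketch: map "year>2000"-style terms onto a SpecificationLimit.
    static SpecificationLimit parseLimit(String term) {
        if (term.contains("=")) return SpecificationLimit.equals(Integer.parseInt(term.substring(term.indexOf('=') + 1)));
        if (term.contains("<")) return SpecificationLimit.lessThan(Integer.parseInt(term.substring(term.indexOf('<') + 1)));
        if (term.contains(">")) return SpecificationLimit.greaterThan(Integer.parseInt(term.substring(term.indexOf('>') + 1)));
        return SpecificationLimit.none();
    }
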
diff --git a/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/search/query/QueryParserTest.java b/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/search/query/QueryParserTest.java
index 7defb7d7..734415e9 100644
--- a/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/search/query/QueryParserTest.java
+++ b/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/search/query/QueryParserTest.java
@@ -4,21 +4,23 @@ import nu.marginalia.util.TestLanguageModels;
 import nu.marginalia.util.language.conf.LanguageModels;
 import nu.marginalia.wmsa.edge.assistant.dict.NGramBloomFilter;
 import nu.marginalia.wmsa.edge.assistant.dict.TermFrequencyDict;
-import org.junit.jupiter.api.BeforeEach;
+import org.junit.jupiter.api.BeforeAll;
 import org.junit.jupiter.api.Test;
 
 import java.io.IOException;
 import java.util.stream.Collectors;
 
+import static org.junit.jupiter.api.Assertions.assertEquals;
+
 class QueryParserTest {
-    private QueryParser parser;
+    private static QueryParser parser;
     private static TermFrequencyDict dict;
     private static EnglishDictionary englishDictionary;
     private static NGramBloomFilter nGramBloomFilter;
     private static final LanguageModels lm = TestLanguageModels.getLanguageModels();
 
-    @BeforeEach
-    public void setUp() throws IOException {
+    @BeforeAll
+    public static void setUp() throws IOException {
         dict = new TermFrequencyDict(lm);
         nGramBloomFilter = new NGramBloomFilter(lm);
         englishDictionary = new EnglishDictionary(dict);
@@ -28,8 +30,35 @@ class QueryParserTest {
 
     @Test
     public void testAdviceString() {
-        System.out.println(parser.parse("alcibiades (plato) \"my query\" -cars"));
-        System.out.println(parser.parse("universals plato"));
+        var ret = parser.parse("alcibiades (plato) \"my query\" -cars");
+        assertEquals(4, ret.size());
+
+        var alcibiades = ret.get(0);
+        assertEquals(TokenType.LITERAL_TERM, alcibiades.type);
+        assertEquals("alcibiades", alcibiades.str);
+        assertEquals("alcibiades", alcibiades.displayStr);
+
+        var plato = ret.get(1);
+        assertEquals(TokenType.ADVICE_TERM, plato.type);
+        assertEquals("plato", plato.str);
+        assertEquals("(plato)", plato.displayStr);
+
+        var my_query = ret.get(2);
+        assertEquals(TokenType.QUOT_TERM, my_query.type);
+        assertEquals("my_query", my_query.str);
+        assertEquals("\"my query\"", my_query.displayStr);
+
+        var not_cars = ret.get(3);
+        assertEquals(TokenType.EXCLUDE_TERM, not_cars.type);
+        assertEquals("cars", not_cars.str);
+        assertEquals("-cars", not_cars.displayStr);
+    }
+
+    @Test
+    public void testParseYear() {
+        System.out.println(parser.parse("year>2000"));
+        System.out.println(parser.parse("year=2000"));
+        System.out.println(parser.parse("year<2000"));
     }
 
     @Test
diff --git a/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/search/query/QueryVariantsTest.java b/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/search/query/QueryVariantsTest.java
index 706986b9..f6d74999 100644
--- a/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/search/query/QueryVariantsTest.java
+++ b/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/search/query/QueryVariantsTest.java
@@ -5,18 +5,18 @@ import nu.marginalia.util.language.conf.LanguageModels;
 import nu.marginalia.util.language.processing.SentenceExtractor;
 import nu.marginalia.wmsa.edge.assistant.dict.NGramBloomFilter;
 import nu.marginalia.wmsa.edge.assistant.dict.TermFrequencyDict;
-import org.junit.jupiter.api.BeforeEach;
+import org.junit.jupiter.api.BeforeAll;
 import org.junit.jupiter.api.Test;
 
 import java.io.IOException;
 
 class QueryVariantsTest {
-    QueryVariants variants;
-    QueryParser parser;
-    SentenceExtractor se;
+    static QueryVariants variants;
+    static QueryParser parser;
+    static SentenceExtractor se;
 
-    @BeforeEach
-    public void setUp() throws IOException {
+    @BeforeAll
+    public static void setUp() throws IOException {
         LanguageModels lm = TestLanguageModels.getLanguageModels();
 
         se = new SentenceExtractor(lm);
@@ -33,6 +33,13 @@ class QueryVariantsTest {
         testCase("Omelet recipe");
     }
 
+    @Test
+    void queryNegation() {
+        System.out.println(se.extractSentence("salt lake -city"));
+        testCase("salt lake -city");
+    }
+
+
     @Test
     void getQueryVariants() {
         System.out.println(se.extractSentence("we are alone"));
@@ -60,7 +67,7 @@ class QueryVariantsTest {
     }
 
     private void testCase(String input) {
-        var tokens = variants.getQueryVariants(parser.extractBasicTokens(input));
+        var tokens = variants.getQueryVariants(parser.parse(input));
         System.out.println(tokens);
     }
 }
\ No newline at end of file
diff --git a/protocol/def/index.proto b/protocol/def/index.proto
index 53c82cf6..2a774819 100644
--- a/protocol/def/index.proto
+++ b/protocol/def/index.proto
@@ -8,7 +8,8 @@ message IndexPutKeywordsReq {
     int32 domain = 1;
     int32 url = 2;
     int32 index = 3;
-    repeated WordSet wordSet = 4;
+    int64 metadata = 4;
+    repeated WordSet wordSet = 5;
 
     message WordSet {
         int32 index = 1;