From 88908c203db0913865facb5c7fc873d533cb4c82 Mon Sep 17 00:00:00 2001 From: vlofgren Date: Wed, 15 Jun 2022 16:34:03 +0200 Subject: [PATCH] Refactoring conversion --- .../wmsa/edge/EdgeSearchE2ETest.java | 23 +++- marginalia_nu/src/e2e/resources/init.sh | 2 +- .../nu/marginalia/util/btree/BTreeWriter.java | 16 ++- .../marginalia/util/btree/WriteCallback.java | 4 +- .../util/btree/model/BTreeHeader.java | 3 +- .../marginalia/util/hash/LongPairHashMap.java | 47 ++++---- .../util/multimap/MultimapFileLong.java | 11 +- .../multimap/MultimapFileLongOffsetSlice.java | 70 ++++++++++++ .../util/multimap/MultimapFileLongSlice.java | 29 +++++ .../util/multimap/MultimapSearcher.java | 4 +- .../util/multimap/MultimapSorter.java | 4 +- .../loader/SqlLoadProcessedDocument.java | 2 +- .../edge/data/dao/EdgeDataStoreDaoImpl.java | 12 +- .../index/{radix => }/EdgeIndexBucket.java | 10 +- .../wmsa/edge/index/EdgeIndexControl.java | 9 +- .../wmsa/edge/index/EdgeIndexService.java | 6 +- .../wmsa/edge/index/IndexServicesFactory.java | 12 +- .../ConversionUnnecessaryException.java | 2 +- .../SearchEngineRanking.java | 2 +- .../SearchIndexConverter.java | 108 +++++++----------- .../SearchIndexDao.java | 2 +- .../SearchIndexPartitioner.java | 4 +- .../SearchIndexPreconverter.java | 3 +- .../words/WordIndexLengthsTable.java | 10 ++ .../words/WordIndexOffsetsTable.java | 67 +++++++++++ .../conversion/words/WordIndexTables.java | 56 +++++++++ .../conversion/words/WordsTableWriter.java | 75 ++++++++++++ .../index => journal}/SearchIndexWriter.java | 2 +- .../SearchIndexWriterImpl.java | 2 +- .../IndexWordsTable.java} | 90 +++++++++------ .../index => reader}/SearchIndex.java | 6 +- .../index => reader}/SearchIndexReader.java | 10 +- .../{service => reader}/SearchIndexes.java | 8 +- .../query/IndexQueryBuilder.java | 4 +- .../query/IndexSearchBudget.java | 2 +- .../{service => reader}/query/Query.java | 2 +- .../wmsa/edge/index/service/SearchOrder.java | 6 - .../index/wordstable/IndexWordsTable.java | 48 -------- .../index/wordstable/WordsTableWriter.java | 85 -------------- .../model/search/EdgeSearchSpecification.java | 4 +- .../wmsa/edge/search/EdgeSearchOperator.java | 3 +- .../wmsa/edge/search/EdgeSearchProfile.java | 17 ++- .../command/commands/SiteSearchCommand.java | 4 +- .../edge/search/model/DomainInformation.java | 1 - .../wmsa/edge/search/query/QueryFactory.java | 1 - .../siteinfo/DomainInformationService.java | 3 +- .../wmsa/edge/tools/IndexMergerMain.java | 4 +- .../templates/edge/site-info-gmi.hdb | 1 - .../resources/templates/edge/site-info.hdb | 1 - .../util/btree/BTreeWriterTest.java | 26 ++--- .../util/hash/LongPairHashMapTest.java | 4 +- .../index/service/DictionaryWriterTest.java | 4 +- .../index/service/EdgeIndexClientTest.java | 6 +- .../index/service/SearchIndexWriterTest.java | 12 +- .../edge/search/query/QueryVariantsTest.java | 5 +- 55 files changed, 574 insertions(+), 380 deletions(-) create mode 100644 marginalia_nu/src/main/java/nu/marginalia/util/multimap/MultimapFileLongOffsetSlice.java create mode 100644 marginalia_nu/src/main/java/nu/marginalia/util/multimap/MultimapFileLongSlice.java rename marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/{radix => }/EdgeIndexBucket.java (93%) rename marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/{service/index => conversion}/ConversionUnnecessaryException.java (80%) rename marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/{service => conversion}/SearchEngineRanking.java (97%) rename marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/{service/index => conversion}/SearchIndexConverter.java (75%) rename marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/{service => conversion}/SearchIndexDao.java (98%) rename marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/{service/query => conversion}/SearchIndexPartitioner.java (96%) rename marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/{service/index => conversion}/SearchIndexPreconverter.java (97%) create mode 100644 marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/conversion/words/WordIndexLengthsTable.java create mode 100644 marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/conversion/words/WordIndexOffsetsTable.java create mode 100644 marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/conversion/words/WordIndexTables.java create mode 100644 marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/conversion/words/WordsTableWriter.java rename marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/{service/index => journal}/SearchIndexWriter.java (88%) rename marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/{service/index => journal}/SearchIndexWriterImpl.java (98%) rename marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/{service/index/wordstable/BtreeWordsTable.java => reader/IndexWordsTable.java} (58%) rename marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/{service/index => reader}/SearchIndex.java (93%) rename marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/{service/index => reader}/SearchIndexReader.java (96%) rename marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/{service => reader}/SearchIndexes.java (93%) rename marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/{service => reader}/query/IndexQueryBuilder.java (97%) rename marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/{service => reader}/query/IndexSearchBudget.java (87%) rename marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/{service => reader}/query/Query.java (73%) delete mode 100644 marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/service/SearchOrder.java delete mode 100644 marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/service/index/wordstable/IndexWordsTable.java delete mode 100644 marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/service/index/wordstable/WordsTableWriter.java diff --git a/marginalia_nu/src/e2e/java/nu/marginalia/wmsa/edge/EdgeSearchE2ETest.java b/marginalia_nu/src/e2e/java/nu/marginalia/wmsa/edge/EdgeSearchE2ETest.java index af43e462..08408de2 100644 --- a/marginalia_nu/src/e2e/java/nu/marginalia/wmsa/edge/EdgeSearchE2ETest.java +++ b/marginalia_nu/src/e2e/java/nu/marginalia/wmsa/edge/EdgeSearchE2ETest.java @@ -28,6 +28,7 @@ import java.util.ArrayList; import java.util.List; import static nu.marginalia.wmsa.configuration.ServiceDescriptor.*; +import static org.junit.jupiter.api.Assertions.assertEquals; @Tag("e2e") @Testcontainers @@ -156,6 +157,16 @@ public class EdgeSearchE2ETest extends E2ETestBase { return wikipediaFiles.toString(); } + private List getTitlesFromSearchResults(String html) { + List ret = new ArrayList<>(); + + for (var title : Jsoup.parse(html).select(".card.search-result > h2")) { + ret.add(title.text()); + } + + return ret; + } + @Test public void testFrontPage() throws IOException { var driver = chrome.getWebDriver(); @@ -173,8 +184,9 @@ public class EdgeSearchE2ETest extends E2ETestBase { driver.get("http://proxyNginx/search?query=bird&profile=corpo"); System.out.println(driver.getTitle()); - System.out.println(driver.findElement(new By.ByXPath("//*")).getAttribute("outerHTML")); + var html = driver.findElement(new By.ByXPath("//*")).getAttribute("outerHTML"); + assertEquals(List.of("Bird"), getTitlesFromSearchResults(html)); Files.move(driver.getScreenshotAs(OutputType.FILE).toPath(), screenshotFilename("query")); } @@ -187,20 +199,23 @@ public class EdgeSearchE2ETest extends E2ETestBase { System.out.println(driver.getTitle()); System.out.println(driver.findElement(new By.ByXPath("//*")).getAttribute("outerHTML")); - Files.move(driver.getScreenshotAs(OutputType.FILE).toPath(), screenshotFilename("site-info")); } + @Test public void testSiteSearch() throws IOException { var driver = chrome.getWebDriver(); driver.get("http://proxyNginx/search?query=site:wikipedia.local%20frog"); System.out.println(driver.getTitle()); - System.out.println(driver.findElement(new By.ByXPath("//*")).getAttribute("outerHTML")); + var html = driver.findElement(new By.ByXPath("//*")).getAttribute("outerHTML"); + + assertEquals(List.of("Frog", "Binomial nomenclature", "Amphibian", "Mantis"), getTitlesFromSearchResults(html)); Files.move(driver.getScreenshotAs(OutputType.FILE).toPath(), screenshotFilename("site-search")); } + @Test public void testBrowse() throws IOException { var driver = chrome.getWebDriver(); @@ -209,7 +224,6 @@ public class EdgeSearchE2ETest extends E2ETestBase { System.out.println(driver.getTitle()); System.out.println(driver.findElement(new By.ByXPath("//*")).getAttribute("outerHTML")); - Files.move(driver.getScreenshotAs(OutputType.FILE).toPath(), screenshotFilename("browse")); } @Test @@ -220,7 +234,6 @@ public class EdgeSearchE2ETest extends E2ETestBase { System.out.println(driver.getTitle()); System.out.println(driver.findElement(new By.ByXPath("//*")).getAttribute("outerHTML")); - Files.move(driver.getScreenshotAs(OutputType.FILE).toPath(), screenshotFilename("define")); } @Test diff --git a/marginalia_nu/src/e2e/resources/init.sh b/marginalia_nu/src/e2e/resources/init.sh index 5409f787..50dbd406 100644 --- a/marginalia_nu/src/e2e/resources/init.sh +++ b/marginalia_nu/src/e2e/resources/init.sh @@ -69,4 +69,4 @@ memex memex dating dating EOF -WMSA_HOME=${HOME} java -Dsmall-ram=TRUE -Dservice-host=0.0.0.0 -jar /WMSA.jar start $1 \ No newline at end of file +WMSA_HOME=${HOME} java -server -Xmx2G -Dsmall-ram=TRUE -Dservice-host=0.0.0.0 -jar /WMSA.jar start $1 \ No newline at end of file diff --git a/marginalia_nu/src/main/java/nu/marginalia/util/btree/BTreeWriter.java b/marginalia_nu/src/main/java/nu/marginalia/util/btree/BTreeWriter.java index 28ac4914..b43faca7 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/util/btree/BTreeWriter.java +++ b/marginalia_nu/src/main/java/nu/marginalia/util/btree/BTreeWriter.java @@ -3,6 +3,7 @@ package nu.marginalia.util.btree; import nu.marginalia.util.btree.model.BTreeContext; import nu.marginalia.util.btree.model.BTreeHeader; import nu.marginalia.util.multimap.MultimapFileLong; +import nu.marginalia.util.multimap.MultimapFileLongSlice; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -12,9 +13,9 @@ import java.io.IOException; public class BTreeWriter { private final Logger logger = LoggerFactory.getLogger(BTreeWriter.class); private final BTreeContext ctx; - private final MultimapFileLong map; + private final MultimapFileLongSlice map; - public BTreeWriter(MultimapFileLong map, BTreeContext ctx) { + public BTreeWriter(MultimapFileLongSlice map, BTreeContext ctx) { this.map = map; this.ctx = ctx; } @@ -31,13 +32,18 @@ public class BTreeWriter { return size; } - public long write(long offset, int numEntries, WriteCallback writeIndex) + /** Construct a BTree with numEntries entries at offset in the associated map + * + * @return The size of the written data + */ + public long write(long offset, int numEntries, WriteCallback writeIndexCallback) throws IOException { - var header = makeHeader(offset, numEntries); + BTreeHeader header = makeHeader(offset, numEntries); header.write(map, offset); - writeIndex.write(header.dataOffsetLongs()); + + writeIndexCallback.write(map.atOffset(header.dataOffsetLongs())); if (header.layers() < 1) { return ctx.calculateSize(numEntries); diff --git a/marginalia_nu/src/main/java/nu/marginalia/util/btree/WriteCallback.java b/marginalia_nu/src/main/java/nu/marginalia/util/btree/WriteCallback.java index 70bd8132..a6225db1 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/util/btree/WriteCallback.java +++ b/marginalia_nu/src/main/java/nu/marginalia/util/btree/WriteCallback.java @@ -1,7 +1,9 @@ package nu.marginalia.util.btree; +import nu.marginalia.util.multimap.MultimapFileLongSlice; + import java.io.IOException; public interface WriteCallback { - void write(long offset) throws IOException; + void write(MultimapFileLongSlice slice) throws IOException; } diff --git a/marginalia_nu/src/main/java/nu/marginalia/util/btree/model/BTreeHeader.java b/marginalia_nu/src/main/java/nu/marginalia/util/btree/model/BTreeHeader.java index 4951f5b8..8d68b424 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/util/btree/model/BTreeHeader.java +++ b/marginalia_nu/src/main/java/nu/marginalia/util/btree/model/BTreeHeader.java @@ -1,6 +1,7 @@ package nu.marginalia.util.btree.model; import nu.marginalia.util.multimap.MultimapFileLong; +import nu.marginalia.util.multimap.MultimapFileLongSlice; public record BTreeHeader(int layers, int numEntries, long indexOffsetLongs, long dataOffsetLongs) { public BTreeHeader { @@ -28,7 +29,7 @@ public record BTreeHeader(int layers, int numEntries, long indexOffsetLongs, lon return padding; } - public void write(MultimapFileLong dest, long offset) { + public void write(MultimapFileLongSlice dest, long offset) { dest.put(offset, ((long) layers << 32L) | ((long)numEntries & 0xFFFF_FFFFL)); dest.put(offset+1, indexOffsetLongs); dest.put(offset+2, dataOffsetLongs); diff --git a/marginalia_nu/src/main/java/nu/marginalia/util/hash/LongPairHashMap.java b/marginalia_nu/src/main/java/nu/marginalia/util/hash/LongPairHashMap.java index 6f8912a9..d1e056b9 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/util/hash/LongPairHashMap.java +++ b/marginalia_nu/src/main/java/nu/marginalia/util/hash/LongPairHashMap.java @@ -1,9 +1,7 @@ package nu.marginalia.util.hash; -import io.prometheus.client.Gauge; import lombok.EqualsAndHashCode; import lombok.Getter; -import nu.marginalia.wmsa.edge.index.service.index.wordstable.IndexWordsTable; import nu.marginalia.util.multimap.MultimapFileLong; import nu.marginalia.util.PrimeUtil; import org.slf4j.Logger; @@ -17,9 +15,7 @@ import static java.lang.Math.round; */ public class LongPairHashMap { private static final Logger logger = LoggerFactory.getLogger(LongPairHashMap.class); - private static final Gauge probe_count_metrics - = Gauge.build("wmsa_wordfile_hash_map_probe_count", "Probing Count") - .register(); + private static final long MAGIC_WORD = 0xE00E00E00E0E0E0EL; // it's the data police private final long hashTableSize; private final MultimapFileLong data; @@ -27,26 +23,37 @@ public class LongPairHashMap { private int sz = 0; private static final int HEADER_SIZE = 2; - public LongPairHashMap(MultimapFileLong data, long size) { + private LongPairHashMap(MultimapFileLong data, long hashTableSize, long maxProbeLength) { this.data = data; - // Actually use a prime size for Donald Knuth reasons - hashTableSize = PrimeUtil.nextPrime(size, 1); - maxProbeLength = hashTableSize / 2; + this.hashTableSize = hashTableSize; + this.maxProbeLength = maxProbeLength; + } - logger.debug("Table size = " + hashTableSize); + public static LongPairHashMap createNew(MultimapFileLong data, long size) { + var tableSize = PrimeUtil.nextPrime(size, 1); + var ret = new LongPairHashMap(data, tableSize, tableSize/2); - data.put(0, IndexWordsTable.Strategy.HASH.ordinal()); - data.put(1, hashTableSize); - for (int i = 2; i < hashTableSize; i++) { + data.put(0, MAGIC_WORD); + data.put(1, tableSize); + + for (int i = 2; i < tableSize; i++) { data.put(HEADER_SIZE + 2L*i, 0); } - } - public LongPairHashMap(MultimapFileLong data) { - this.data = data; - hashTableSize = data.get(1); - maxProbeLength = hashTableSize / 10; - logger.debug("Table size = " + hashTableSize); + return ret; + } + + public static LongPairHashMap loadExisting(MultimapFileLong data) { + long key = data.get(0); + + if (key != MAGIC_WORD) { + logger.warn("LongPairHashMap lacks magic word, could this be garbage data?"); + } + + var hashTableSize = data.get(1); + var maxProbeLength = hashTableSize / 10; + + return new LongPairHashMap(data, hashTableSize, maxProbeLength); } public int size() { @@ -91,8 +98,6 @@ public class LongPairHashMap { final var val = getCell(idx); if (!val.isSet()) { - probe_count_metrics.set(j); - return setValue(data, idx); } else if (val.getKey() == data.getKey()) { diff --git a/marginalia_nu/src/main/java/nu/marginalia/util/multimap/MultimapFileLong.java b/marginalia_nu/src/main/java/nu/marginalia/util/multimap/MultimapFileLong.java index dca8248e..f381a977 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/util/multimap/MultimapFileLong.java +++ b/marginalia_nu/src/main/java/nu/marginalia/util/multimap/MultimapFileLong.java @@ -21,7 +21,7 @@ import static java.nio.channels.FileChannel.MapMode.READ_WRITE; import static nu.marginalia.util.FileSizeUtil.readableSize; -public class MultimapFileLong implements AutoCloseable { +public class MultimapFileLong implements AutoCloseable, MultimapFileLongSlice { private final ArrayList buffers = new ArrayList<>(); private final ArrayList mappedByteBuffers = new ArrayList<>(); @@ -196,10 +196,12 @@ public class MultimapFileLong implements AutoCloseable { } } + @Override public long size() { return fileLength; } + @Override public void put(long idx, long val) { if (idx >= mappedSize) grow(idx); @@ -214,6 +216,7 @@ public class MultimapFileLong implements AutoCloseable { } } + @Override public long get(long idx) { if (idx >= mappedSize) grow(idx); @@ -229,10 +232,12 @@ public class MultimapFileLong implements AutoCloseable { } + @Override public void read(long[] vals, long idx) { read(vals, vals.length, idx); } + @Override public void read(long[] vals, int n, long idx) { if (idx+n >= mappedSize) { grow(idx+n); @@ -257,10 +262,12 @@ public class MultimapFileLong implements AutoCloseable { } + @Override public void write(long[] vals, long idx) { write(vals, vals.length, idx); } + @Override public void write(long[] vals, int n, long idx) { if (idx+n >= mappedSize) { grow(idx+n); @@ -285,6 +292,7 @@ public class MultimapFileLong implements AutoCloseable { } + @Override public void write(LongBuffer vals, long idx) { int n = vals.limit() - vals.position(); if (idx+n >= mappedSize) { @@ -310,6 +318,7 @@ public class MultimapFileLong implements AutoCloseable { } + @Override public void transferFromFileChannel(FileChannel sourceChannel, long destOffset, long sourceStart, long sourceEnd) throws IOException { int length = (int)(sourceEnd - sourceStart); diff --git a/marginalia_nu/src/main/java/nu/marginalia/util/multimap/MultimapFileLongOffsetSlice.java b/marginalia_nu/src/main/java/nu/marginalia/util/multimap/MultimapFileLongOffsetSlice.java new file mode 100644 index 00000000..c2630ddc --- /dev/null +++ b/marginalia_nu/src/main/java/nu/marginalia/util/multimap/MultimapFileLongOffsetSlice.java @@ -0,0 +1,70 @@ +package nu.marginalia.util.multimap; + +import java.io.IOException; +import java.nio.LongBuffer; +import java.nio.channels.FileChannel; + +public class MultimapFileLongOffsetSlice implements MultimapFileLongSlice { + private final long off; + private final MultimapFileLongSlice map; + + public MultimapFileLongOffsetSlice(MultimapFileLongSlice map, long off) { + this.off = off; + this.map = map; + } + + @Override + public long size() { + return map.size() - off; + } + + @Override + public void put(long idx, long val) { + map.put(off+idx, val); + } + + @Override + public long get(long idx) { + return map.get(off+idx); + } + + @Override + public void read(long[] vals, long idx) { + map.read(vals, idx+off); + } + + @Override + public void read(long[] vals, int n, long idx) { + map.read(vals, n, idx+off); + } + + @Override + public void write(long[] vals, long idx) { + map.write(vals, idx+off); + } + + @Override + public void write(long[] vals, int n, long idx) { + map.write(vals, n, idx+off); + } + + @Override + public void write(LongBuffer vals, long idx) { + map.write(vals, idx+off); + } + + @Override + public void transferFromFileChannel(FileChannel sourceChannel, long destOffset, long sourceStart, long sourceEnd) + throws IOException { + map.transferFromFileChannel(sourceChannel, destOffset + off, sourceStart, sourceEnd); + } + + @Override + public MultimapFileLongSlice atOffset(long off) { + // If we don't override this, the default implementation would build a pyramid of + // MultimapFileLongSlice(MultimapFileLongSlice(MultimapFileLongSlice(MultimapFileLongSlice(MultimapFileLongSlice(...))) + // if this is called iteratively (e.g. to walk over a file) + + return new MultimapFileLongOffsetSlice(map, this.off + off); + } +} diff --git a/marginalia_nu/src/main/java/nu/marginalia/util/multimap/MultimapFileLongSlice.java b/marginalia_nu/src/main/java/nu/marginalia/util/multimap/MultimapFileLongSlice.java new file mode 100644 index 00000000..abf29f51 --- /dev/null +++ b/marginalia_nu/src/main/java/nu/marginalia/util/multimap/MultimapFileLongSlice.java @@ -0,0 +1,29 @@ +package nu.marginalia.util.multimap; + +import java.io.IOException; +import java.nio.LongBuffer; +import java.nio.channels.FileChannel; + +public interface MultimapFileLongSlice { + long size(); + + void put(long idx, long val); + + long get(long idx); + + void read(long[] vals, long idx); + + void read(long[] vals, int n, long idx); + + void write(long[] vals, long idx); + + void write(long[] vals, int n, long idx); + + void write(LongBuffer vals, long idx); + + void transferFromFileChannel(FileChannel sourceChannel, long destOffset, long sourceStart, long sourceEnd) throws IOException; + + default MultimapFileLongSlice atOffset(long off) { + return new MultimapFileLongOffsetSlice(this, off); + } +} diff --git a/marginalia_nu/src/main/java/nu/marginalia/util/multimap/MultimapSearcher.java b/marginalia_nu/src/main/java/nu/marginalia/util/multimap/MultimapSearcher.java index c961ac0e..005888d8 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/util/multimap/MultimapSearcher.java +++ b/marginalia_nu/src/main/java/nu/marginalia/util/multimap/MultimapSearcher.java @@ -4,9 +4,9 @@ import lombok.experimental.Delegate; public class MultimapSearcher { @Delegate - private final MultimapFileLong mmf; + private final MultimapFileLongSlice mmf; - public MultimapSearcher(MultimapFileLong mmf) { + public MultimapSearcher(MultimapFileLongSlice mmf) { this.mmf = mmf; } diff --git a/marginalia_nu/src/main/java/nu/marginalia/util/multimap/MultimapSorter.java b/marginalia_nu/src/main/java/nu/marginalia/util/multimap/MultimapSorter.java index 6ca4f64f..61dd04c4 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/util/multimap/MultimapSorter.java +++ b/marginalia_nu/src/main/java/nu/marginalia/util/multimap/MultimapSorter.java @@ -13,10 +13,10 @@ import static nu.marginalia.util.multimap.MultimapFileLong.WORD_SIZE; public class MultimapSorter { private final Path tmpFileDir; private final int internalSortLimit; - private final MultimapFileLong multimapFileLong; + private final MultimapFileLongSlice multimapFileLong; private final long[] buffer; - public MultimapSorter(MultimapFileLong multimapFileLong, Path tmpFileDir, int internalSortLimit) { + public MultimapSorter(MultimapFileLongSlice multimapFileLong, Path tmpFileDir, int internalSortLimit) { this.multimapFileLong = multimapFileLong; this.tmpFileDir = tmpFileDir; this.internalSortLimit = internalSortLimit; diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/loader/SqlLoadProcessedDocument.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/loader/SqlLoadProcessedDocument.java index 85b6c3fe..e2e25fff 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/loader/SqlLoadProcessedDocument.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/loader/SqlLoadProcessedDocument.java @@ -48,7 +48,7 @@ public class SqlLoadProcessedDocument { IN STATE VARCHAR(32)) BEGIN UPDATE EC_URL SET VISITED=1, STATE=STATE WHERE ID=URL_ID; - DELETE FROM PAGE_DATA WHERE ID=URL_ID; + DELETE FROM EC_PAGE_DATA WHERE ID=URL_ID; END """); diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/data/dao/EdgeDataStoreDaoImpl.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/data/dao/EdgeDataStoreDaoImpl.java index 233ffd3a..30ea2256 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/data/dao/EdgeDataStoreDaoImpl.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/data/dao/EdgeDataStoreDaoImpl.java @@ -135,7 +135,7 @@ public class EdgeDataStoreDaoImpl implements EdgeDataStoreDao { final Set domains = new HashSet<>(count*3); final String q = """ - SELECT EC_DOMAIN.ID AS NEIGHBOR_ID, URL_PART, COUNT(*) AS CNT + SELECT EC_DOMAIN.ID AS NEIGHBOR_ID, DOMAIN_NAME, COUNT(*) AS CNT FROM EC_DOMAIN_NEIGHBORS INNER JOIN EC_DOMAIN ON NEIGHBOR_ID=EC_DOMAIN.ID INNER JOIN DOMAIN_METADATA ON EC_DOMAIN.ID=DOMAIN_METADATA.ID @@ -169,7 +169,7 @@ public class EdgeDataStoreDaoImpl implements EdgeDataStoreDao { if (domains.size() < count/2) { final String q2 = """ - SELECT EC_DOMAIN.ID, URL_PART + SELECT EC_DOMAIN.ID, DOMAIN_NAME FROM EC_DOMAIN INNER JOIN DOMAIN_METADATA ON EC_DOMAIN.ID=DOMAIN_METADATA.ID INNER JOIN EC_DOMAIN_LINK B ON DEST_DOMAIN_ID=EC_DOMAIN.ID @@ -199,11 +199,11 @@ public class EdgeDataStoreDaoImpl implements EdgeDataStoreDao { if (domains.size() < count/2) { final String q3 = """ - SELECT EC_DOMAIN.ID, URL_PART - FROM EC_DOMAIN - INNER JOIN DOMAIN_METADATA ON EC_DOMAIN.ID=DOMAIN_METADATA.ID + SELECT EC_DOMAIN.ID, DOMAIN_NAME + FROM EC_DOMAIN + INNER JOIN DOMAIN_METADATA ON EC_DOMAIN.ID=DOMAIN_METADATA.ID INNER JOIN EC_DOMAIN_LINK B ON B.SOURCE_DOMAIN_ID=EC_DOMAIN.ID - INNER JOIN EC_DOMAIN_LINK O ON O.DEST_DOMAIN_ID=EC_DOMAIN.ID + INNER JOIN EC_DOMAIN_LINK O ON O.DEST_DOMAIN_ID=EC_DOMAIN.ID WHERE B.DEST_DOMAIN_ID=? AND STATE<2 AND KNOWN_URLS<1000 diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/radix/EdgeIndexBucket.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/EdgeIndexBucket.java similarity index 93% rename from marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/radix/EdgeIndexBucket.java rename to marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/EdgeIndexBucket.java index 2e8fdcd2..05bcfe75 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/radix/EdgeIndexBucket.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/EdgeIndexBucket.java @@ -1,13 +1,13 @@ -package nu.marginalia.wmsa.edge.index.radix; +package nu.marginalia.wmsa.edge.index; import nu.marginalia.wmsa.edge.index.EdgeIndexControl; import nu.marginalia.wmsa.edge.index.IndexServicesFactory; import nu.marginalia.wmsa.edge.index.model.EdgeIndexSearchTerms; import nu.marginalia.wmsa.edge.index.model.IndexBlock; -import nu.marginalia.wmsa.edge.index.service.index.SearchIndexReader; -import nu.marginalia.wmsa.edge.index.service.index.SearchIndexWriter; -import nu.marginalia.wmsa.edge.index.service.query.IndexSearchBudget; -import nu.marginalia.wmsa.edge.index.service.query.Query; +import nu.marginalia.wmsa.edge.index.reader.SearchIndexReader; +import nu.marginalia.wmsa.edge.index.journal.SearchIndexWriter; +import nu.marginalia.wmsa.edge.index.reader.query.IndexSearchBudget; +import nu.marginalia.wmsa.edge.index.reader.query.Query; import org.jetbrains.annotations.NotNull; import org.slf4j.Logger; import org.slf4j.LoggerFactory; diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/EdgeIndexControl.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/EdgeIndexControl.java index b590af55..ab7c73fe 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/EdgeIndexControl.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/EdgeIndexControl.java @@ -3,7 +3,9 @@ package nu.marginalia.wmsa.edge.index; import com.google.inject.Inject; import nu.marginalia.wmsa.edge.index.model.IndexBlock; -import nu.marginalia.wmsa.edge.index.service.index.ConversionUnnecessaryException; +import nu.marginalia.wmsa.edge.index.conversion.ConversionUnnecessaryException; + +import java.io.IOException; public class EdgeIndexControl { @@ -27,7 +29,10 @@ public class EdgeIndexControl { System.gc(); } catch (ConversionUnnecessaryException unnecessary) { - + // swallow quietly + } + catch (IOException e) { + e.printStackTrace(); } } diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/EdgeIndexService.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/EdgeIndexService.java index a04a4c83..de6276a8 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/EdgeIndexService.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/EdgeIndexService.java @@ -15,9 +15,9 @@ import nu.marginalia.wmsa.configuration.server.Initialization; import nu.marginalia.wmsa.configuration.server.MetricsServer; import nu.marginalia.wmsa.configuration.server.Service; import nu.marginalia.wmsa.edge.index.model.*; -import nu.marginalia.wmsa.edge.index.service.SearchIndexes; -import nu.marginalia.wmsa.edge.index.service.index.SearchIndexWriterImpl; -import nu.marginalia.wmsa.edge.index.service.query.IndexSearchBudget; +import nu.marginalia.wmsa.edge.index.reader.SearchIndexes; +import nu.marginalia.wmsa.edge.index.journal.SearchIndexWriterImpl; +import nu.marginalia.wmsa.edge.index.reader.query.IndexSearchBudget; import nu.marginalia.util.dict.DictionaryHashMap; import nu.marginalia.wmsa.edge.model.*; import nu.marginalia.wmsa.edge.model.crawl.EdgePageWordSet; diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/IndexServicesFactory.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/IndexServicesFactory.java index 4d0c18e9..61e64b41 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/IndexServicesFactory.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/IndexServicesFactory.java @@ -5,12 +5,16 @@ import com.google.inject.Singleton; import com.google.inject.name.Named; import lombok.SneakyThrows; import nu.marginalia.wmsa.edge.data.dao.task.EdgeDomainBlacklist; +import nu.marginalia.wmsa.edge.index.conversion.ConversionUnnecessaryException; +import nu.marginalia.wmsa.edge.index.conversion.SearchIndexConverter; +import nu.marginalia.wmsa.edge.index.conversion.SearchIndexPreconverter; +import nu.marginalia.wmsa.edge.index.journal.SearchIndexWriterImpl; import nu.marginalia.wmsa.edge.index.model.IndexBlock; -import nu.marginalia.wmsa.edge.index.radix.EdgeIndexBucket; import nu.marginalia.wmsa.edge.index.dictionary.DictionaryReader; import nu.marginalia.wmsa.edge.index.dictionary.DictionaryWriter; -import nu.marginalia.wmsa.edge.index.service.index.*; -import nu.marginalia.wmsa.edge.index.service.query.SearchIndexPartitioner; +import nu.marginalia.wmsa.edge.index.reader.SearchIndex; +import nu.marginalia.wmsa.edge.index.reader.SearchIndexReader; +import nu.marginalia.wmsa.edge.index.conversion.SearchIndexPartitioner; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -89,7 +93,7 @@ public class IndexServicesFactory { } - public SearchIndexConverter getIndexConverter(int id, IndexBlock block) throws ConversionUnnecessaryException { + public SearchIndexConverter getIndexConverter(int id, IndexBlock block) throws ConversionUnnecessaryException, IOException { return new SearchIndexConverter(block, id, tmpFileDir, preconverterOutputFile.get(id), indexWriteWordsFile.get(id, block.id), diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/service/index/ConversionUnnecessaryException.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/conversion/ConversionUnnecessaryException.java similarity index 80% rename from marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/service/index/ConversionUnnecessaryException.java rename to marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/conversion/ConversionUnnecessaryException.java index fd7f529f..2242f476 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/service/index/ConversionUnnecessaryException.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/conversion/ConversionUnnecessaryException.java @@ -1,4 +1,4 @@ -package nu.marginalia.wmsa.edge.index.service.index; +package nu.marginalia.wmsa.edge.index.conversion; public class ConversionUnnecessaryException extends Exception { public ConversionUnnecessaryException() { diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/service/SearchEngineRanking.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/conversion/SearchEngineRanking.java similarity index 97% rename from marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/service/SearchEngineRanking.java rename to marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/conversion/SearchEngineRanking.java index abaced82..220a9708 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/service/SearchEngineRanking.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/conversion/SearchEngineRanking.java @@ -1,4 +1,4 @@ -package nu.marginalia.wmsa.edge.index.service; +package nu.marginalia.wmsa.edge.index.conversion; import gnu.trove.list.TIntList; import gnu.trove.map.hash.TIntIntHashMap; diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/service/index/SearchIndexConverter.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/conversion/SearchIndexConverter.java similarity index 75% rename from marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/service/index/SearchIndexConverter.java rename to marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/conversion/SearchIndexConverter.java index c9b69386..0827b4e7 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/service/index/SearchIndexConverter.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/conversion/SearchIndexConverter.java @@ -1,4 +1,4 @@ -package nu.marginalia.wmsa.edge.index.service.index; +package nu.marginalia.wmsa.edge.index.conversion; import com.google.inject.Inject; import com.google.inject.name.Named; @@ -6,9 +6,10 @@ import gnu.trove.set.hash.TIntHashSet; import lombok.RequiredArgsConstructor; import lombok.SneakyThrows; import nu.marginalia.wmsa.edge.data.dao.task.EdgeDomainBlacklist; +import nu.marginalia.wmsa.edge.index.conversion.words.WordIndexOffsetsTable; +import nu.marginalia.wmsa.edge.index.journal.SearchIndexWriterImpl; import nu.marginalia.wmsa.edge.index.model.IndexBlock; -import nu.marginalia.wmsa.edge.index.service.index.wordstable.WordsTableWriter; -import nu.marginalia.wmsa.edge.index.service.query.SearchIndexPartitioner; +import nu.marginalia.wmsa.edge.index.conversion.words.WordsTableWriter; import nu.marginalia.util.btree.BTreeWriter; import nu.marginalia.util.btree.model.BTreeContext; import nu.marginalia.util.multimap.MultimapFileLong; @@ -32,18 +33,24 @@ public class SearchIndexConverter { private final long fileLength; private final long urlsFileSize; + private final Path tmpFileDir; + private final FileChannel urlsTmpFileChannel; private final int wordCount; private final MultimapFileLong urlsTmpFileMap; private final Logger logger = LoggerFactory.getLogger(getClass()); private final IndexBlock block; private final int bucketId; - @org.jetbrains.annotations.NotNull + + private final File urlsFile; private final SearchIndexPartitioner partitioner; private final TIntHashSet spamDomains; private final MultimapSorter urlTmpFileSorter; + private final static int internalSortLimit = + Boolean.getBoolean("small-ram") ? 1024*1024 : 1024*1024*256; + @SneakyThrows public static long wordCount(File inputFile) { try (RandomAccessFile raf = new RandomAccessFile(inputFile, "r")) { @@ -52,7 +59,6 @@ public class SearchIndexConverter { } } - @SneakyThrows @Inject public SearchIndexConverter(IndexBlock block, int bucketId, @Named("tmp-file-dir") Path tmpFileDir, @@ -61,13 +67,15 @@ public class SearchIndexConverter { @Named("edge-index-write-urls-file") File outputFileUrls, SearchIndexPartitioner partitioner, EdgeDomainBlacklist blacklist) - throws ConversionUnnecessaryException + throws ConversionUnnecessaryException, IOException { this.block = block; this.bucketId = bucketId; - urlsFile = outputFileUrls; + this.tmpFileDir = tmpFileDir; + this.urlsFile = outputFileUrls; this.partitioner = partitioner; this.spamDomains = blacklist.getSpamDomains(); + logger.info("Converting {} ({}) {}", block.id, block, inputFile); Files.deleteIfExists(outputFileWords.toPath()); @@ -89,18 +97,16 @@ public class SearchIndexConverter { urlsFileSize = getUrlsSize(buffer, inputChannel); var tmpUrlsFile = Files.createTempFile(tmpFileDir, "urls-sorted", ".dat"); - - var urlsTmpFileRaf = new RandomAccessFile(tmpUrlsFile.toFile(), "rw"); urlsTmpFileChannel = urlsTmpFileRaf.getChannel(); urlsTmpFileMap = new MultimapFileLong(urlsTmpFileRaf, FileChannel.MapMode.READ_WRITE, urlsFileSize, 8*1024*1024, false); - urlTmpFileSorter = urlsTmpFileMap.createSorter(tmpFileDir, 1024*1024*256); + urlTmpFileSorter = urlsTmpFileMap.createSorter(tmpFileDir, internalSortLimit); logger.info("Creating word index table {} for block {} ({})", outputFileWords, block.id, block); - long[] wordIndexTable = createWordIndexTable(outputFileWords, inputChannel); + WordIndexOffsetsTable wordIndexTable = createWordIndexTable(outputFileWords, inputChannel); logger.info("Creating word urls table {} for block {} ({})", outputFileUrls, block.id, block); - createUrlTable(tmpFileDir, buffer, raf, wordIndexTable); + createUrlTable(buffer, raf, wordIndexTable); Files.delete(tmpUrlsFile); raf.close(); @@ -140,99 +146,69 @@ public class SearchIndexConverter { return reader.size; } - private void createUrlTable(Path tmpFileDir, ByteBuffer buffer, RandomAccessFile raf, long[] wordIndexTable) throws IOException { - logger.debug("Table size = {}", wordIndexTable.length); - int[] wordIndex = new int[wordIndexTable.length]; + private void createUrlTable(ByteBuffer buffer, RandomAccessFile raf, WordIndexOffsetsTable wordOffsetsTable) throws IOException { + logger.info("Table size = {}", wordOffsetsTable.length()); + raf.seek(FILE_HEADER_SIZE); var channel = raf.getChannel(); try (RandomWriteFunnel rwf = new RandomWriteFunnel(tmpFileDir, urlsFileSize, 10_000_000)) { - var reader = new IndexReader(buffer, channel) { + int[] wordWriteOffset = new int[wordOffsetsTable.length()]; + + new IndexReader(buffer, channel) { @Override public void eachWord(long urlId, int wordId) throws IOException { - if (wordId >= wordIndex.length) + if (wordId >= wordWriteOffset.length) return; - if (wordId != 0) { - if (!(wordIndexTable[wordId - 1] + wordIndex[wordId] <= wordIndexTable[wordId])) { - logger.error("Crazy state: wordId={}, index={}, lower={}, upper={}", - wordId, - wordIndex[wordId], - wordIndexTable[wordId - 1], - wordIndexTable[wordId]); - throw new IllegalStateException(); - } - } if (wordId > 0) { - rwf.put(wordIndexTable[wordId - 1] + wordIndex[wordId]++, translateUrl(urlId)); + rwf.put(wordOffsetsTable.get(wordId - 1) + wordWriteOffset[wordId]++, translateUrl(urlId)); } else { - rwf.put(wordIndex[wordId]++, translateUrl(urlId)); + rwf.put(wordWriteOffset[wordId]++, translateUrl(urlId)); } } - }; - - reader.read(); + }.read(); rwf.write(urlsTmpFileChannel); } urlsTmpFileChannel.force(false); + logger.info("URL TMP Table: {} Mb", channel.position()/(1024*1024)); - logger.debug("URL TMP Table: {} Mb", channel.position()/(1024*1024)); + if (wordOffsetsTable.length() > 0) { + logger.info("Sorting urls table"); + + wordOffsetsTable.forEach(urlTmpFileSorter::sort); - if (wordIndexTable.length > 0) { - logger.debug("Sorting urls table"); - sortUrls(wordIndexTable); urlsTmpFileMap.force(); } else { logger.warn("urls table empty -- nothing to sort"); } - - long idx = 0; - + logger.info("Writing BTree"); try (var urlsFileMap = MultimapFileLong.forOutput(urlsFile.toPath(), 1024)) { var writer = new BTreeWriter(urlsFileMap, urlsBTreeContext); - if (wordIndexTable[0] != 0) { - int start = 0; - int end = (int) wordIndexTable[0]; + wordOffsetsTable.fold((accumulatorIdx, start, length) -> { + // Note: The return value is accumulated into accumulatorIdx! - idx += writer.write(idx, (int) wordIndexTable[0], - offset -> urlsFileMap.transferFromFileChannel(urlsTmpFileChannel, offset, start, end)); - } + return writer.write(accumulatorIdx, length, + slice -> slice.transferFromFileChannel(urlsTmpFileChannel, 0, start, start + length)); + }); - for (int i = 1; i < wordIndexTable.length; i++) { - if (wordIndexTable[i] != wordIndexTable[i - 1]) { - long start = wordIndexTable[i-1]; - long end = wordIndexTable[i]; - - idx += writer.write(idx, (int) (end-start), - offset -> urlsFileMap.transferFromFileChannel(urlsTmpFileChannel, offset, start, end)); - } - } } catch (Exception e) { - e.printStackTrace(); + logger.error("Error while writing BTree", e); } } - @SneakyThrows - private void sortUrls(long[] wordIndices) { - urlTmpFileSorter.sort( 0, (int) wordIndices[0]); - - for (int i = 1; i < wordIndices.length; i++) { - urlTmpFileSorter.sort(wordIndices[i-1], (int) (wordIndices[i] - wordIndices[i-1])); - } - } - - private long[] createWordIndexTable(File outputFileWords, FileChannel inputChannel) throws Exception { + private WordIndexOffsetsTable createWordIndexTable(File outputFileWords, FileChannel inputChannel) throws IOException { inputChannel.position(FILE_HEADER_SIZE); logger.debug("Table size = {}", wordCount); WordsTableWriter wordsTableWriter = new WordsTableWriter(wordCount); - ByteBuffer buffer = ByteBuffer.allocateDirect(8*SearchIndexWriterImpl.MAX_BLOCK_SIZE); + ByteBuffer buffer = ByteBuffer.allocateDirect(8* SearchIndexWriterImpl.MAX_BLOCK_SIZE); logger.debug("Reading words"); diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/service/SearchIndexDao.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/conversion/SearchIndexDao.java similarity index 98% rename from marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/service/SearchIndexDao.java rename to marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/conversion/SearchIndexDao.java index a12b249e..fcf6d175 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/service/SearchIndexDao.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/conversion/SearchIndexDao.java @@ -1,4 +1,4 @@ -package nu.marginalia.wmsa.edge.index.service; +package nu.marginalia.wmsa.edge.index.conversion; import com.google.inject.Inject; import com.google.inject.Singleton; diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/service/query/SearchIndexPartitioner.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/conversion/SearchIndexPartitioner.java similarity index 96% rename from marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/service/query/SearchIndexPartitioner.java rename to marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/conversion/SearchIndexPartitioner.java index cf281116..bf5a1d74 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/service/query/SearchIndexPartitioner.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/conversion/SearchIndexPartitioner.java @@ -1,11 +1,9 @@ -package nu.marginalia.wmsa.edge.index.service.query; +package nu.marginalia.wmsa.edge.index.conversion; import com.google.inject.Inject; import com.google.inject.Singleton; import gnu.trove.set.hash.TIntHashSet; import lombok.SneakyThrows; -import nu.marginalia.wmsa.edge.index.service.SearchEngineRanking; -import nu.marginalia.wmsa.edge.index.service.SearchIndexDao; import org.slf4j.Logger; import org.slf4j.LoggerFactory; diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/service/index/SearchIndexPreconverter.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/conversion/SearchIndexPreconverter.java similarity index 97% rename from marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/service/index/SearchIndexPreconverter.java rename to marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/conversion/SearchIndexPreconverter.java index 5149b546..9e851025 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/service/index/SearchIndexPreconverter.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/conversion/SearchIndexPreconverter.java @@ -1,10 +1,9 @@ -package nu.marginalia.wmsa.edge.index.service.index; +package nu.marginalia.wmsa.edge.index.conversion; import com.google.inject.Inject; import gnu.trove.set.hash.TIntHashSet; import lombok.SneakyThrows; import nu.marginalia.wmsa.edge.data.dao.task.EdgeDomainBlacklist; -import nu.marginalia.wmsa.edge.index.service.query.SearchIndexPartitioner; import org.slf4j.Logger; import org.slf4j.LoggerFactory; diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/conversion/words/WordIndexLengthsTable.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/conversion/words/WordIndexLengthsTable.java new file mode 100644 index 00000000..464e9388 --- /dev/null +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/conversion/words/WordIndexLengthsTable.java @@ -0,0 +1,10 @@ +package nu.marginalia.wmsa.edge.index.conversion.words; + +public class WordIndexLengthsTable { + final long[] table; + + public WordIndexLengthsTable(int size) { + this.table = new long[size]; + } + public void increment(int idx) { table[idx]++; } +} diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/conversion/words/WordIndexOffsetsTable.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/conversion/words/WordIndexOffsetsTable.java new file mode 100644 index 00000000..29b88509 --- /dev/null +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/conversion/words/WordIndexOffsetsTable.java @@ -0,0 +1,67 @@ +package nu.marginalia.wmsa.edge.index.conversion.words; + +import java.io.IOException; + +public class WordIndexOffsetsTable { + final long[] table; + public final int numberOfUsedWords; + + public WordIndexOffsetsTable(long[] table, int numberOfUsedWords) { + + this.table = table; + this.numberOfUsedWords = numberOfUsedWords; + } + + public int length() { + return table.length; + } + + public void forEach(OffsetTableEntryConsumer o) throws IOException { + if (table[0] > 0) { + o.accept(0, (int) table[0]); + } + + for (int i = 1; i < table.length; i++) { + long start = table[i-1]; + int length = (int) (table[i] - start); + + if (length != 0) { + o.accept(start, length); + } + } + } + + /** + * Fold over each span in the file, left to right + */ + public long fold(OffsetTableEntryFoldConsumer o) throws IOException { + long total = 0; + + if (table[0] > 0) { + total = o.accept(total,0, (int) table[0]); + } + + for (int i = 1; i < table.length; i++) { + long start = table[i-1]; + int length = (int) (table[i] - start); + + if (length != 0) { + total += o.accept(total, start, length); + } + } + + return total; + } + + public long get(int i) { + return table[i]; + } + + public interface OffsetTableEntryConsumer { + void accept(long start, int length) throws IOException; + } + + public interface OffsetTableEntryFoldConsumer { + long accept(long accumulator, long start, int length) throws IOException; + } +} diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/conversion/words/WordIndexTables.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/conversion/words/WordIndexTables.java new file mode 100644 index 00000000..2056948b --- /dev/null +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/conversion/words/WordIndexTables.java @@ -0,0 +1,56 @@ +package nu.marginalia.wmsa.edge.index.conversion.words; + +/** Contains a stateful table of word index offsets, initially in lengths mode + * where the table contains how many postings exist for each word; then in offsets + * mode, where the lengths are converted into the necessary offsets for each block + * of document data. + * + * Caveat! This uses the same underlying array to conserve space. + * + */ +public class WordIndexTables { + private WordIndexLengthsTable lengthsTable; + private WordIndexOffsetsTable offsetsTable; + + private boolean converted = false; + + public WordIndexTables(int size) { + lengthsTable = new WordIndexLengthsTable(size); + } + + public WordIndexLengthsTable lengths() { + if (converted) throw new IllegalStateException("Table has been converted"); + + return lengthsTable; + } + + public WordIndexOffsetsTable offsets() { + if (!converted) throw new IllegalStateException("Table has not been converted"); + + return offsetsTable; + } + + public void convert() { + if (converted) throw new IllegalStateException("Table has been converted"); + + // Go from lengths to offsets, i.e. + // BEFORE: 1, 2, 1, 3, 0, 2 + // AFTER: 1, 3, 4, 7, 7, 9 + + long[] table = lengthsTable.table; + int numberOfUsedWords = 0; + + if (table[0] != 0) numberOfUsedWords = 1; + + for (int i = 1; i < table.length; i++) { + if (table[i] != 0) { + numberOfUsedWords++; + } + table[i] += table[i-1]; + } + + lengthsTable = null; + offsetsTable = new WordIndexOffsetsTable(table, numberOfUsedWords); + converted = true; + } +} diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/conversion/words/WordsTableWriter.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/conversion/words/WordsTableWriter.java new file mode 100644 index 00000000..7f762ff3 --- /dev/null +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/conversion/words/WordsTableWriter.java @@ -0,0 +1,75 @@ +package nu.marginalia.wmsa.edge.index.conversion.words; + +import nu.marginalia.util.btree.BTreeWriter; +import nu.marginalia.util.btree.model.BTreeContext; +import nu.marginalia.util.multimap.MultimapFileLong; +import nu.marginalia.util.multimap.MultimapFileLongSlice; +import nu.marginalia.wmsa.edge.index.reader.IndexWordsTable; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.io.*; + +import static nu.marginalia.wmsa.edge.index.conversion.SearchIndexConverter.urlsBTreeContext; + +public class WordsTableWriter { + private final WordIndexTables table; + private final Logger logger = LoggerFactory.getLogger(getClass()); + + public static final BTreeContext wordsBTreeContext = new BTreeContext(7, 2, 0x0000_0000_FFFF_FFFFL, 8); + + public WordsTableWriter(int length) { + table = new WordIndexTables(length); + } + + public void acceptWord(int wordId) { + table.lengths().increment(wordId); + } + + public WordIndexOffsetsTable getTable() { + return table.offsets(); + } + + public void write(File file) throws IOException { + table.convert(); + + logger.info("Writing table - {} max", table.offsets().numberOfUsedWords); + + final int tableSize = table.offsets().numberOfUsedWords; + + try (var mmf = MultimapFileLong.forOutput(file.toPath(), tableSize/8L)) { + mmf.put(0, IndexWordsTable.Strategy.BTREE.ordinal()); + long offset = 1; + + var writer = new BTreeWriter(mmf, wordsBTreeContext); + + writer.write(offset, tableSize, this::writeBTreeBlock); + } + } + + private void writeBTreeBlock(MultimapFileLongSlice mapSlice) { + long urlFileOffset = 0; + int idx = 0; + + var offsetTable = table.offsets().table; + + if (offsetTable[0] != 0) { + int length = (int) offsetTable[0]; + mapSlice.put(idx++, (long)length<<32); + mapSlice.put(idx++, 0); + + urlFileOffset += (urlsBTreeContext.calculateSize(length)); + } + + for (int i = 1; i < offsetTable.length; i++) { + final int length = (int)(offsetTable[i] - offsetTable[i-1]); + + if (length > 0) { + mapSlice.put(idx++, (long)length << 32 | i); + mapSlice.put(idx++, urlFileOffset); + + urlFileOffset += (urlsBTreeContext.calculateSize(length)); + } + } + } +} diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/service/index/SearchIndexWriter.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/journal/SearchIndexWriter.java similarity index 88% rename from marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/service/index/SearchIndexWriter.java rename to marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/journal/SearchIndexWriter.java index ca5d70b3..11fc186a 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/service/index/SearchIndexWriter.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/journal/SearchIndexWriter.java @@ -1,4 +1,4 @@ -package nu.marginalia.wmsa.edge.index.service.index; +package nu.marginalia.wmsa.edge.index.journal; import nu.marginalia.wmsa.edge.index.model.IndexBlock; import nu.marginalia.wmsa.edge.model.EdgeDomain; diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/service/index/SearchIndexWriterImpl.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/journal/SearchIndexWriterImpl.java similarity index 98% rename from marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/service/index/SearchIndexWriterImpl.java rename to marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/journal/SearchIndexWriterImpl.java index d434042d..cf76ada2 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/service/index/SearchIndexWriterImpl.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/journal/SearchIndexWriterImpl.java @@ -1,4 +1,4 @@ -package nu.marginalia.wmsa.edge.index.service.index; +package nu.marginalia.wmsa.edge.index.journal; import io.reactivex.rxjava3.disposables.Disposable; import io.reactivex.rxjava3.schedulers.Schedulers; diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/service/index/wordstable/BtreeWordsTable.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/reader/IndexWordsTable.java similarity index 58% rename from marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/service/index/wordstable/BtreeWordsTable.java rename to marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/reader/IndexWordsTable.java index 0a6a70c0..2bde1aa7 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/service/index/wordstable/BtreeWordsTable.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/reader/IndexWordsTable.java @@ -1,36 +1,80 @@ -package nu.marginalia.wmsa.edge.index.service.index.wordstable; +package nu.marginalia.wmsa.edge.index.reader; import com.upserve.uppend.blobs.NativeIO; import nu.marginalia.util.btree.BTreeReader; import nu.marginalia.util.btree.model.BTreeHeader; import nu.marginalia.util.multimap.MultimapFileLong; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; +import java.io.IOException; +import java.io.RandomAccessFile; +import java.nio.channels.FileChannel; import java.util.function.LongConsumer; -import static nu.marginalia.wmsa.edge.index.service.index.wordstable.WordsTableWriter.wordsBTreeContext; +import static nu.marginalia.wmsa.edge.index.conversion.words.WordsTableWriter.wordsBTreeContext; -public class BtreeWordsTable extends IndexWordsTable{ - private final MultimapFileLong words; - private final BTreeReader reader; - private final BTreeHeader header; - private final int HEADER_OFFSET = 1; +public class IndexWordsTable implements AutoCloseable { + protected final MultimapFileLong words; + protected final BTreeReader reader; + protected final BTreeHeader header; + protected final int HEADER_OFFSET = 1; + final Logger logger = LoggerFactory.getLogger(getClass()); - public BtreeWordsTable(MultimapFileLong words) { + private static final int BUFFER_SIZE = 1024*1024*64; + + public IndexWordsTable(MultimapFileLong words) { this.words = words; - reader = new BTreeReader(words, wordsBTreeContext); header = reader.getHeader(HEADER_OFFSET); madvise(); } - private void madvise() { + public static IndexWordsTable ofFile(RandomAccessFile file) throws IOException { + var wordsFile = openWordsFile(file); + long signature = wordsFile.get(0); + + if (signature == Strategy.BTREE.ordinal()) { + return new IndexWordsTable(wordsFile); + } + + throw new IllegalArgumentException("Unknown signature " + signature); + } + + private static MultimapFileLong openWordsFile(RandomAccessFile wordsFile) throws IOException { + return new MultimapFileLong(wordsFile, + FileChannel.MapMode.READ_ONLY, wordsFile.length(), BUFFER_SIZE, false); + } + + public long positionForWord(int wordId) { + + long offset = reader.offsetForEntry(header, wordId); + if (offset < 0) { + return -1L; + } + + return words.get(offset+1); + } + + public int wordLength(int wordId) { + + long offset = reader.offsetForEntry(header, wordId); + if (offset < 0) { + return -1; + } + + return (int)(words.get(offset) >> 32); + } + + protected void madvise() { words.advice(NativeIO.Advice.Random); words.advice0(NativeIO.Advice.WillNeed); var h = reader.getHeader(HEADER_OFFSET); int length = (int)(h.dataOffsetLongs() - h.indexOffsetLongs()); + words.adviceRange(NativeIO.Advice.WillNeed, h.indexOffsetLongs(), length); words.pokeRange(h.indexOffsetLongs(), length); } @@ -58,31 +102,13 @@ public class BtreeWordsTable extends IndexWordsTable{ } } - @Override - public long positionForWord(int wordId) { - - long offset = reader.offsetForEntry(header, wordId); - if (offset < 0) { - return -1L; - } - - return words.get(offset+1); - } - - @Override - public int wordLength(int wordId) { - - long offset = reader.offsetForEntry(header, wordId); - if (offset < 0) { - return -1; - } - - return (int)(words.get(offset) >> 32); - } - @Override public void close() throws Exception { words.close(); } + public enum Strategy { + BTREE + } + } diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/service/index/SearchIndex.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/reader/SearchIndex.java similarity index 93% rename from marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/service/index/SearchIndex.java rename to marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/reader/SearchIndex.java index 17e62437..042f8f54 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/service/index/SearchIndex.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/reader/SearchIndex.java @@ -1,20 +1,18 @@ -package nu.marginalia.wmsa.edge.index.service.index; +package nu.marginalia.wmsa.edge.index.reader; import com.google.inject.Inject; import com.google.inject.name.Named; import com.upserve.uppend.blobs.NativeIO; import io.reactivex.rxjava3.schedulers.Schedulers; -import nu.marginalia.wmsa.edge.index.service.index.wordstable.IndexWordsTable; +import nu.marginalia.wmsa.edge.index.conversion.SearchIndexConverter; import nu.marginalia.util.btree.BTreeReader; import nu.marginalia.util.multimap.MultimapFileLong; -import org.eclipse.jetty.util.thread.ThreadPool; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import java.io.File; import java.io.IOException; import java.io.RandomAccessFile; -import java.util.concurrent.ForkJoinPool; import java.util.stream.LongStream; public class SearchIndex implements AutoCloseable { diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/service/index/SearchIndexReader.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/reader/SearchIndexReader.java similarity index 96% rename from marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/service/index/SearchIndexReader.java rename to marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/reader/SearchIndexReader.java index 7baeb8ae..8e7fea81 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/service/index/SearchIndexReader.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/reader/SearchIndexReader.java @@ -1,13 +1,13 @@ -package nu.marginalia.wmsa.edge.index.service.index; +package nu.marginalia.wmsa.edge.index.reader; import com.google.common.cache.Cache; import com.google.common.cache.CacheBuilder; import com.google.inject.Inject; import lombok.SneakyThrows; import nu.marginalia.wmsa.edge.index.model.IndexBlock; -import nu.marginalia.wmsa.edge.index.service.query.IndexQueryBuilder; -import nu.marginalia.wmsa.edge.index.service.query.IndexSearchBudget; -import nu.marginalia.wmsa.edge.index.service.query.Query; +import nu.marginalia.wmsa.edge.index.reader.query.IndexQueryBuilder; +import nu.marginalia.wmsa.edge.index.reader.query.IndexSearchBudget; +import nu.marginalia.wmsa.edge.index.reader.query.Query; import org.apache.commons.lang3.tuple.Pair; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -105,10 +105,8 @@ public class SearchIndexReader implements AutoCloseable { .mapToLong(idx -> idx.numUrls(word)) .sum() ); - } - public IndexBlock getBlockForResult(int searchTerm, long urlId) { for (var block : indicesBySearchOrder) { var index = indices.get(block); diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/service/SearchIndexes.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/reader/SearchIndexes.java similarity index 93% rename from marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/service/SearchIndexes.java rename to marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/reader/SearchIndexes.java index dea842c6..863c0c65 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/service/SearchIndexes.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/reader/SearchIndexes.java @@ -1,13 +1,13 @@ -package nu.marginalia.wmsa.edge.index.service; +package nu.marginalia.wmsa.edge.index.reader; import com.google.inject.Inject; import com.google.inject.Singleton; import nu.marginalia.wmsa.configuration.server.Initialization; import nu.marginalia.wmsa.edge.index.IndexServicesFactory; -import nu.marginalia.wmsa.edge.index.radix.EdgeIndexBucket; +import nu.marginalia.wmsa.edge.index.conversion.SearchIndexPartitioner; +import nu.marginalia.wmsa.edge.index.EdgeIndexBucket; import nu.marginalia.wmsa.edge.index.dictionary.DictionaryReader; -import nu.marginalia.wmsa.edge.index.service.index.SearchIndexWriterImpl; -import nu.marginalia.wmsa.edge.index.service.query.SearchIndexPartitioner; +import nu.marginalia.wmsa.edge.index.journal.SearchIndexWriterImpl; import org.slf4j.Logger; import org.slf4j.LoggerFactory; diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/service/query/IndexQueryBuilder.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/reader/query/IndexQueryBuilder.java similarity index 97% rename from marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/service/query/IndexQueryBuilder.java rename to marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/reader/query/IndexQueryBuilder.java index be217057..6f54dd2d 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/service/query/IndexQueryBuilder.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/reader/query/IndexQueryBuilder.java @@ -1,7 +1,7 @@ -package nu.marginalia.wmsa.edge.index.service.query; +package nu.marginalia.wmsa.edge.index.reader.query; import com.google.common.collect.Streams; -import nu.marginalia.wmsa.edge.index.service.index.SearchIndex; +import nu.marginalia.wmsa.edge.index.reader.SearchIndex; import java.util.Collection; import java.util.List; diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/service/query/IndexSearchBudget.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/reader/query/IndexSearchBudget.java similarity index 87% rename from marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/service/query/IndexSearchBudget.java rename to marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/reader/query/IndexSearchBudget.java index 2ec30e65..3608f70a 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/service/query/IndexSearchBudget.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/reader/query/IndexSearchBudget.java @@ -1,4 +1,4 @@ -package nu.marginalia.wmsa.edge.index.service.query; +package nu.marginalia.wmsa.edge.index.reader.query; public class IndexSearchBudget { diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/service/query/Query.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/reader/query/Query.java similarity index 73% rename from marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/service/query/Query.java rename to marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/reader/query/Query.java index 09f7701b..5f343d54 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/service/query/Query.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/reader/query/Query.java @@ -1,4 +1,4 @@ -package nu.marginalia.wmsa.edge.index.service.query; +package nu.marginalia.wmsa.edge.index.reader.query; import java.util.stream.LongStream; diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/service/SearchOrder.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/service/SearchOrder.java deleted file mode 100644 index d1c9f10a..00000000 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/service/SearchOrder.java +++ /dev/null @@ -1,6 +0,0 @@ -package nu.marginalia.wmsa.edge.index.service; - -public enum SearchOrder { - ASCENDING, - REVERSED -} diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/service/index/wordstable/IndexWordsTable.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/service/index/wordstable/IndexWordsTable.java deleted file mode 100644 index 5b557db1..00000000 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/service/index/wordstable/IndexWordsTable.java +++ /dev/null @@ -1,48 +0,0 @@ -package nu.marginalia.wmsa.edge.index.service.index.wordstable; - -import nu.marginalia.util.multimap.MultimapFileLong; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -import java.io.IOException; -import java.io.RandomAccessFile; -import java.nio.channels.FileChannel; -import java.util.function.LongConsumer; - -public abstract class IndexWordsTable implements AutoCloseable { - final Logger logger = LoggerFactory.getLogger(getClass()); - - private static final int BUFFER_SIZE = 1024*1024*64; - - public static IndexWordsTable ofFile(RandomAccessFile file) throws IOException { - var wordsFile = openWordsFile(file); - long signature = wordsFile.get(0); - - if (signature == Strategy.BTREE.ordinal()) { - return new BtreeWordsTable(wordsFile); - } - throw new IllegalArgumentException("Unknown signature " + signature); - } - - private static MultimapFileLong openWordsFile(RandomAccessFile wordsFile) throws IOException { - return new MultimapFileLong(wordsFile, - FileChannel.MapMode.READ_ONLY, wordsFile.length(), BUFFER_SIZE, false); - } - - public abstract long positionForWord(int wordId); - - public abstract int wordLength(int wordId); - public abstract void forEachWordsOffset(LongConsumer offsetConsumer); - - @Override - public void close() throws Exception { - - } - - public record TableWordRange(long start, long end) {} - - public enum Strategy { - FLAT, HASH, BTREE_OLD, BTREE - } - -} diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/service/index/wordstable/WordsTableWriter.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/service/index/wordstable/WordsTableWriter.java deleted file mode 100644 index 3097dd47..00000000 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/service/index/wordstable/WordsTableWriter.java +++ /dev/null @@ -1,85 +0,0 @@ -package nu.marginalia.wmsa.edge.index.service.index.wordstable; - -import nu.marginalia.util.btree.BTreeWriter; -import nu.marginalia.util.btree.model.BTreeContext; -import nu.marginalia.util.multimap.MultimapFileLong; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -import java.io.*; - -import static nu.marginalia.wmsa.edge.index.service.index.SearchIndexConverter.urlsBTreeContext; - -public class WordsTableWriter { - private final long[] table; - private final Logger logger = LoggerFactory.getLogger(getClass()); - - public static final BTreeContext wordsBTreeContext = new BTreeContext(7, 2, 0x0000_0000_FFFF_FFFFL, 8); - - public WordsTableWriter(int length) { - table = new long[length]; - } - - public void acceptWord(int wordId) { - if (wordId >= table.length) { - logger.warn("Invalid word-id {}", wordId); - } - else { - table[wordId]++; - } - } - - public long[] getTable() { - return table; - } - public void write(File file) throws Exception { - - int tableSize = 0; - - if (table[0] != 0) tableSize = 1; - - for (int i = 1; i < table.length; i++) { - if (table[i] != 0) { - tableSize++; - } - table[i] += table[i-1]; - } - - logger.info("Writing table {} words {} max", tableSize, table.length); - - writeBtreeWordsFile(file, table, tableSize); - - } - - private void writeBtreeWordsFile(File outputFileWords, long[] table, int tableSize) throws Exception { - try (var mmf = MultimapFileLong.forOutput(outputFileWords.toPath(), tableSize/8L)) { - mmf.put(0, IndexWordsTable.Strategy.BTREE.ordinal()); - long offset = 1; - - var writer = new BTreeWriter(mmf, wordsBTreeContext); - - writer.write(offset, tableSize, (idx) -> { - long urlFileOffset = 0; - - if (table[0] != 0) { - int length = (int) table[0]; - mmf.put(idx++, (long)length<<32); - mmf.put(idx++, 0); - - urlFileOffset += (urlsBTreeContext.calculateSize(length)); - } - - for (int i = 1; i < table.length; i++) { - if (table[i] != table[i - 1]) { - int length = (int)(table[i] - table[i-1]); - mmf.put(idx++, (long)length << 32 | i); - mmf.put(idx++, urlFileOffset); - - urlFileOffset += (urlsBTreeContext.calculateSize(length)); - } - } - }); - } - } - -} diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/model/search/EdgeSearchSpecification.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/model/search/EdgeSearchSpecification.java index 0063efd9..02c7197a 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/model/search/EdgeSearchSpecification.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/model/search/EdgeSearchSpecification.java @@ -2,7 +2,6 @@ package nu.marginalia.wmsa.edge.model.search; import lombok.*; import nu.marginalia.wmsa.edge.index.model.IndexBlock; -import nu.marginalia.wmsa.edge.index.service.SearchOrder; import java.util.Arrays; import java.util.Collections; @@ -21,14 +20,13 @@ public class EdgeSearchSpecification { public final int limitTotal; public final String humanQuery; - public final SearchOrder searchOrder; public boolean stagger; public boolean experimental; public static EdgeSearchSpecification justIncludes(String... words) { return new EdgeSearchSpecification( IntStream.range(0, DYNAMIC_BUCKET_LENGTH+1).boxed().toList(), - Collections.singletonList(new EdgeSearchSubquery(Arrays.asList(words), Collections.emptyList(), IndexBlock.Title)), 10, 10, 10, "", SearchOrder.ASCENDING, false, false); + Collections.singletonList(new EdgeSearchSubquery(Arrays.asList(words), Collections.emptyList(), IndexBlock.Title)), 10, 10, 10, "", false, false); } } diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/EdgeSearchOperator.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/EdgeSearchOperator.java index 10675cc5..66004dee 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/EdgeSearchOperator.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/EdgeSearchOperator.java @@ -11,7 +11,6 @@ import nu.marginalia.wmsa.edge.assistant.dict.WikiArticles; import nu.marginalia.wmsa.edge.data.dao.EdgeDataStoreDao; import nu.marginalia.wmsa.edge.index.client.EdgeIndexClient; import nu.marginalia.wmsa.edge.index.model.IndexBlock; -import nu.marginalia.wmsa.edge.index.service.SearchOrder; import nu.marginalia.wmsa.edge.model.*; import nu.marginalia.wmsa.edge.model.search.*; import nu.marginalia.wmsa.edge.search.model.DecoratedSearchResultSet; @@ -136,7 +135,7 @@ public class EdgeSearchOperator { sqs.add(new EdgeSearchSubquery(Arrays.asList(termsInclude), Collections.emptyList(), block)); - EdgeSearchSpecification specs = new EdgeSearchSpecification(profile.buckets, sqs, 100, limitPerDomain, limitTotal, "", SearchOrder.ASCENDING, EdgeSearchProfile.YOLO.equals(profile), false); + EdgeSearchSpecification specs = new EdgeSearchSpecification(profile.buckets, sqs, 100, limitPerDomain, limitTotal, "", EdgeSearchProfile.YOLO.equals(profile), false); return performQuery(ctx, new EdgeSearchQuery(specs), true); } diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/EdgeSearchProfile.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/EdgeSearchProfile.java index 05fcaa04..212d09ab 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/EdgeSearchProfile.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/EdgeSearchProfile.java @@ -1,7 +1,6 @@ package nu.marginalia.wmsa.edge.search; import nu.marginalia.wmsa.edge.index.model.IndexBlock; -import nu.marginalia.wmsa.edge.index.service.SearchOrder; import java.util.Arrays; import java.util.Collections; @@ -9,27 +8,27 @@ import java.util.List; import java.util.stream.Collectors; public enum EdgeSearchProfile { - DEFAULT("default", SearchOrder.ASCENDING, + DEFAULT("default", Collections.emptyList(), List.of(IndexBlock.TitleKeywords, IndexBlock.Title, IndexBlock.Top, IndexBlock.Middle, IndexBlock.Low, IndexBlock.Link, IndexBlock.Words, IndexBlock.NamesWords), 0, 1), - MODERN("modern", SearchOrder.ASCENDING, + MODERN("modern", Collections.emptyList(), List.of(IndexBlock.TitleKeywords, IndexBlock.Title, IndexBlock.Top, IndexBlock.Middle, IndexBlock.Low, IndexBlock.Link, IndexBlock.Words, IndexBlock.NamesWords), 2), - CORPO("corpo", SearchOrder.ASCENDING, + CORPO("corpo", Collections.emptyList(), List.of(IndexBlock.TitleKeywords, IndexBlock.Title, IndexBlock.Top, IndexBlock.Middle, IndexBlock.Low, IndexBlock.Link, IndexBlock.Words, IndexBlock.NamesWords), 4, 5, 6, 7), - YOLO("yolo", SearchOrder.ASCENDING, + YOLO("yolo", Collections.emptyList(), List.of(IndexBlock.TitleKeywords, IndexBlock.Title, IndexBlock.Top, IndexBlock.Middle, IndexBlock.Low, IndexBlock.Link, IndexBlock.Words, IndexBlock.NamesWords), 0, 2, 1, 3, 4, 6), - CORPO_CLEAN("corpo-clean", SearchOrder.ASCENDING, + CORPO_CLEAN("corpo-clean", Collections.emptyList(), List.of(IndexBlock.TitleKeywords, IndexBlock.Title, IndexBlock.Top, IndexBlock.Middle, IndexBlock.Low, IndexBlock.Link, IndexBlock.Words, IndexBlock.NamesWords), 4, 5), - ACADEMIA("academia", SearchOrder.ASCENDING, + ACADEMIA("academia", Collections.emptyList(), List.of(IndexBlock.TitleKeywords, IndexBlock.Title, IndexBlock.Top, IndexBlock.Middle, IndexBlock.Low, IndexBlock.Link, IndexBlock.Words, IndexBlock.NamesWords), 3), @@ -37,17 +36,15 @@ public enum EdgeSearchProfile { public final String name; - public final SearchOrder order; public final List additionalSearchTerm; public final List buckets; public final List indexBlocks; - EdgeSearchProfile(String name, SearchOrder order, + EdgeSearchProfile(String name, List additionalSearchTerm, List indexBlocks, int... buckets) { this.name = name; - this.order = order; this.additionalSearchTerm = additionalSearchTerm; this.indexBlocks = indexBlocks; this.buckets = Arrays.stream(buckets).boxed().collect(Collectors.toList()); diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/command/commands/SiteSearchCommand.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/command/commands/SiteSearchCommand.java index 60520aa9..6e341721 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/command/commands/SiteSearchCommand.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/command/commands/SiteSearchCommand.java @@ -32,7 +32,7 @@ import java.util.regex.Pattern; public class SiteSearchCommand implements SearchCommandInterface { private final EdgeDataStoreDao dataStoreDao; private final EdgeSearchOperator searchOperator; - private DomainInformationService domainInformationService; + private final DomainInformationService domainInformationService; private final Logger logger = LoggerFactory.getLogger(getClass()); private final MustacheRenderer siteInfoRenderer; @@ -91,7 +91,7 @@ public class SiteSearchCommand implements SearchCommandInterface { logger.info("Fetching Site Info: {}", word); var results = domainInformationService.domainInfo(word) - .orElseGet(() -> new DomainInformation(null, false, 0, 0, 0, 0, 0, 0, 0, EdgeDomainIndexingState.UNKNOWN, Collections.emptyList())); + .orElseGet(() -> new DomainInformation(null, false, 0, 0, 0, 0, 0, 0, EdgeDomainIndexingState.UNKNOWN, Collections.emptyList())); logger.debug("Results = {}", results); diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/model/DomainInformation.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/model/DomainInformation.java index c5c19187..d94ae487 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/model/DomainInformation.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/model/DomainInformation.java @@ -18,7 +18,6 @@ public class DomainInformation { int pagesIndexed; int incomingLinks; int outboundLinks; - double nominalQuality; double ranking; EdgeDomainIndexingState state; diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/query/QueryFactory.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/query/QueryFactory.java index d3775dd9..1d77a9d0 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/query/QueryFactory.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/query/QueryFactory.java @@ -138,7 +138,6 @@ public class QueryFactory { .subqueries(subqueries) .limitByBucket(50) .limitTotal(100) - .searchOrder(profile.order) .humanQuery(query) .buckets(profile.buckets); diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/siteinfo/DomainInformationService.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/siteinfo/DomainInformationService.java index 496fe57b..2f79a9ea 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/siteinfo/DomainInformationService.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/siteinfo/DomainInformationService.java @@ -57,10 +57,9 @@ public class DomainInformationService { int outboundLinks = getOutboundLinks(domainId); double rank = Math.round(10000.0*(1.0-getRank(domainId)))/100; EdgeDomainIndexingState state = getDomainState(domainId); - double nominalQuality = Math.round(100*100*Math.exp(getDomainQuality(domainId)))/100.; List linkingDomains = getLinkingDomains(domainId); - return Optional.of(new DomainInformation(domain, blacklisted, pagesKnown, pagesVisited, pagesIndexed, incomingLinks, outboundLinks, nominalQuality, rank, state, linkingDomains)); + return Optional.of(new DomainInformation(domain, blacklisted, pagesKnown, pagesVisited, pagesIndexed, incomingLinks, outboundLinks, rank, state, linkingDomains)); } private EdgeId getDomainFromPartial(String site) { diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/tools/IndexMergerMain.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/tools/IndexMergerMain.java index 1251f626..05c67481 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/tools/IndexMergerMain.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/tools/IndexMergerMain.java @@ -8,8 +8,8 @@ import nu.marginalia.wmsa.configuration.module.DatabaseModule; import nu.marginalia.wmsa.edge.data.dao.task.EdgeDomainBlacklist; import nu.marginalia.wmsa.edge.data.dao.task.EdgeDomainBlacklistImpl; import nu.marginalia.wmsa.edge.index.model.RankingSettings; -import nu.marginalia.wmsa.edge.index.service.SearchIndexDao; -import nu.marginalia.wmsa.edge.index.service.query.SearchIndexPartitioner; +import nu.marginalia.wmsa.edge.index.conversion.SearchIndexDao; +import nu.marginalia.wmsa.edge.index.conversion.SearchIndexPartitioner; import org.mariadb.jdbc.Driver; import org.roaringbitmap.longlong.Roaring64Bitmap; import org.slf4j.Logger; diff --git a/marginalia_nu/src/main/resources/templates/edge/site-info-gmi.hdb b/marginalia_nu/src/main/resources/templates/edge/site-info-gmi.hdb index 5696b251..cd8abf67 100644 --- a/marginalia_nu/src/main/resources/templates/edge/site-info-gmi.hdb +++ b/marginalia_nu/src/main/resources/templates/edge/site-info-gmi.hdb @@ -10,5 +10,4 @@ Pages Known: {{pagesKnown}} Pages Indexed: {{pagesKnown}} Inbound Links: {{inboundLinks}} Outbound Links: {{outboundLinks}} -Nominal Quality: {{nominalQuality}}% Crawl Ranking: {{ranking}}% \ No newline at end of file diff --git a/marginalia_nu/src/main/resources/templates/edge/site-info.hdb b/marginalia_nu/src/main/resources/templates/edge/site-info.hdb index 19b585b8..837f320d 100644 --- a/marginalia_nu/src/main/resources/templates/edge/site-info.hdb +++ b/marginalia_nu/src/main/resources/templates/edge/site-info.hdb @@ -37,7 +37,6 @@

Links

- Nominal Quality: {{nominalQuality}}%
Crawl Ranking: {{ranking}}%
Incoming Links: {{incomingLinks}}
Outbound Links: {{outboundLinks}}
diff --git a/marginalia_nu/src/test/java/nu/marginalia/util/btree/BTreeWriterTest.java b/marginalia_nu/src/test/java/nu/marginalia/util/btree/BTreeWriterTest.java index 1915d989..875cda37 100644 --- a/marginalia_nu/src/test/java/nu/marginalia/util/btree/BTreeWriterTest.java +++ b/marginalia_nu/src/test/java/nu/marginalia/util/btree/BTreeWriterTest.java @@ -90,10 +90,10 @@ class BTreeWriterTest { { var writer = new BTreeWriter(mmf, ctx); - writer.write(0, toPut.size(), (offset) -> { + writer.write(0, toPut.size(), (slice) -> { for (int i = 0; i < data.length; i++) { - mmf.put(offset + 2L*i, data[i]); - mmf.put(offset + 2L*i + 1, i); + slice.put(2L*i, data[i]); + slice.put( 2L*i + 1, i); } }); mmf.force(); @@ -133,10 +133,10 @@ class BTreeWriterTest { { var writer = new BTreeWriter(mmf, ctx); - writer.write( 0, toPut.size(), (offset) -> { + writer.write( 0, toPut.size(), (slice) -> { for (int i = 0; i < data.length; i++) { - mmf.put(offset + 2L*i, data[i]); - mmf.put(offset + 2L*i + 1, i); + slice.put(2L*i, data[i]); + slice.put(2L*i + 1, i); } }); mmf.force(); @@ -182,9 +182,9 @@ class BTreeWriterTest { try (MultimapFileLong mmf = MultimapFileLong.forOutput(tempFile, 1000)) { { var writer = new BTreeWriter(mmf, ctx); - writer.write(0, toPut.size(), (offset) -> { + writer.write(0, toPut.size(), (slice) -> { for (int i = 0; i < data.length; i++) { - mmf.put(offset + i, data[i]); + slice.put(i, data[i]); } }); mmf.force(); @@ -235,9 +235,9 @@ class BTreeWriterTest { try (MultimapFileLong mmf = MultimapFileLong.forOutput(tempFile, 1000)) { { var writer = new BTreeWriter(mmf, ctx); - writer.write(0, toPut.size(), (offset) -> { + writer.write(0, toPut.size(), (slice) -> { for (int i = 0; i < data.length; i++) { - mmf.put(offset + i, data[i]); + slice.put(i, data[i]); } }); mmf.force(); @@ -288,10 +288,10 @@ class BTreeWriterTest { try (MultimapFileLong mmf = MultimapFileLong.forOutput(tempFile, 1000)) { { var writer = new BTreeWriter(mmf, ctx); - writer.write(0, toPut.size(), (offset) -> { + writer.write(0, toPut.size(), (slice) -> { for (int i = 0; i < data.length; i++) { - mmf.put(offset + i*2L, data[i]); - mmf.put(offset + i*2L+1, i); + slice.put(i*2L, data[i]); + slice.put(i*2L+1, i); } }); mmf.force(); diff --git a/marginalia_nu/src/test/java/nu/marginalia/util/hash/LongPairHashMapTest.java b/marginalia_nu/src/test/java/nu/marginalia/util/hash/LongPairHashMapTest.java index 326c9b15..9331a998 100644 --- a/marginalia_nu/src/test/java/nu/marginalia/util/hash/LongPairHashMapTest.java +++ b/marginalia_nu/src/test/java/nu/marginalia/util/hash/LongPairHashMapTest.java @@ -27,7 +27,7 @@ class LongPairHashMapTest { try { RandomAccessFile raf = new RandomAccessFile(tempFile.toFile(), "rw"); MultimapFileLong mmf = new MultimapFileLong(raf, FileChannel.MapMode.READ_WRITE, 10000, 1000, true); - var lphm = new LongPairHashMap(mmf, 1024); + var lphm = LongPairHashMap.createNew(mmf, 1024); toPut.forEach(i -> { lphm.put(new LongPairHashMap.CellData(i, i)); }); @@ -36,7 +36,7 @@ class LongPairHashMapTest { RandomAccessFile raf2 = new RandomAccessFile(tempFile.toFile(), "rw"); MultimapFileLong mmf2 = new MultimapFileLong(raf2, FileChannel.MapMode.READ_WRITE, 10000, 1000, true); - var lphm2 = new LongPairHashMap(mmf2); + var lphm2 = LongPairHashMap.loadExisting(mmf2); toPut.forEach(i -> { Assertions.assertTrue(lphm2.get(i).isSet()); Assertions.assertEquals(i, (int) lphm2.get(i).getKey()); diff --git a/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/index/service/DictionaryWriterTest.java b/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/index/service/DictionaryWriterTest.java index b6e61aa2..961d8304 100644 --- a/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/index/service/DictionaryWriterTest.java +++ b/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/index/service/DictionaryWriterTest.java @@ -1,11 +1,11 @@ package nu.marginalia.wmsa.edge.index.service; import lombok.SneakyThrows; +import nu.marginalia.wmsa.edge.index.conversion.SearchIndexPartitioner; import nu.marginalia.wmsa.edge.index.model.IndexBlock; import nu.marginalia.wmsa.edge.index.dictionary.DictionaryReader; import nu.marginalia.wmsa.edge.index.dictionary.DictionaryWriter; -import nu.marginalia.wmsa.edge.index.service.index.SearchIndexConverter; -import nu.marginalia.wmsa.edge.index.service.query.SearchIndexPartitioner; +import nu.marginalia.wmsa.edge.index.conversion.SearchIndexConverter; import org.junit.jupiter.api.Disabled; import org.junit.jupiter.api.Test; diff --git a/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/index/service/EdgeIndexClientTest.java b/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/index/service/EdgeIndexClientTest.java index 6b029da9..2b2da0fd 100644 --- a/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/index/service/EdgeIndexClientTest.java +++ b/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/index/service/EdgeIndexClientTest.java @@ -3,14 +3,14 @@ package nu.marginalia.wmsa.edge.index.service; import com.zaxxer.hikari.HikariDataSource; import lombok.SneakyThrows; import nu.marginalia.util.TestUtil; -import nu.marginalia.wmsa.client.exception.RemoteException; import nu.marginalia.wmsa.configuration.server.Context; import nu.marginalia.wmsa.configuration.server.Initialization; import nu.marginalia.wmsa.edge.index.EdgeIndexService; import nu.marginalia.wmsa.edge.index.IndexServicesFactory; import nu.marginalia.wmsa.edge.index.client.EdgeIndexClient; +import nu.marginalia.wmsa.edge.index.conversion.SearchIndexPartitioner; import nu.marginalia.wmsa.edge.index.model.IndexBlock; -import nu.marginalia.wmsa.edge.index.service.query.SearchIndexPartitioner; +import nu.marginalia.wmsa.edge.index.reader.SearchIndexes; import nu.marginalia.wmsa.edge.model.crawl.EdgePageWordSet; import nu.marginalia.wmsa.edge.model.search.EdgeSearchSpecification; import nu.marginalia.wmsa.edge.model.EdgeId; @@ -23,7 +23,6 @@ import org.junit.jupiter.api.parallel.ResourceAccessMode; import org.junit.jupiter.api.parallel.ResourceLock; import spark.Spark; -import java.io.File; import java.nio.file.Files; import java.nio.file.Path; import java.util.Arrays; @@ -31,7 +30,6 @@ import java.util.List; import java.util.stream.Collectors; import static nu.marginalia.util.TestUtil.getConnection; -import static nu.marginalia.wmsa.edge.index.EdgeIndexService.DYNAMIC_BUCKET_LENGTH; import static org.junit.jupiter.api.Assertions.assertEquals; import static org.junit.jupiter.api.Assertions.assertTrue; diff --git a/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/index/service/SearchIndexWriterTest.java b/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/index/service/SearchIndexWriterTest.java index 6b219bad..edcfa71f 100644 --- a/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/index/service/SearchIndexWriterTest.java +++ b/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/index/service/SearchIndexWriterTest.java @@ -1,14 +1,14 @@ package nu.marginalia.wmsa.edge.index.service; import lombok.SneakyThrows; +import nu.marginalia.wmsa.edge.index.conversion.SearchIndexPartitioner; import nu.marginalia.wmsa.edge.index.model.IndexBlock; import nu.marginalia.wmsa.edge.index.dictionary.DictionaryWriter; -import nu.marginalia.wmsa.edge.index.service.index.SearchIndex; -import nu.marginalia.wmsa.edge.index.service.index.SearchIndexConverter; -import nu.marginalia.wmsa.edge.index.service.index.SearchIndexReader; -import nu.marginalia.wmsa.edge.index.service.index.SearchIndexWriterImpl; -import nu.marginalia.wmsa.edge.index.service.query.IndexSearchBudget; -import nu.marginalia.wmsa.edge.index.service.query.SearchIndexPartitioner; +import nu.marginalia.wmsa.edge.index.reader.SearchIndex; +import nu.marginalia.wmsa.edge.index.conversion.SearchIndexConverter; +import nu.marginalia.wmsa.edge.index.reader.SearchIndexReader; +import nu.marginalia.wmsa.edge.index.journal.SearchIndexWriterImpl; +import nu.marginalia.wmsa.edge.index.reader.query.IndexSearchBudget; import nu.marginalia.wmsa.edge.model.EdgeId; import org.junit.jupiter.api.AfterEach; import org.junit.jupiter.api.BeforeEach; diff --git a/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/search/query/QueryVariantsTest.java b/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/search/query/QueryVariantsTest.java index 4aa9bceb..65b1ad57 100644 --- a/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/search/query/QueryVariantsTest.java +++ b/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/search/query/QueryVariantsTest.java @@ -13,6 +13,7 @@ class QueryVariantsTest { QueryVariants variants; QueryParser parser; SentenceExtractor se; + @BeforeEach public void setUp() { LanguageModels lm = TestLanguageModels.getLanguageModels(); @@ -24,7 +25,7 @@ class QueryVariantsTest { parser = new QueryParser(new EnglishDictionary(dict), variants); } - @Test + @Test @SuppressWarnings("unchecked") void getQueryVariants() { System.out.println(se.extractSentence("we are alone")); testCase("DOS", List.of("DOS")); @@ -50,7 +51,5 @@ class QueryVariantsTest { private void testCase(String input, List... expected) { var tokens = variants.getQueryVariants(parser.extractBasicTokens(input)); System.out.println(tokens); -// var result = tokens.stream().map(lst -> lst.terms).collect(Collectors.toSet()); -// assertEquals(Set.of(expected), result, "Case failed: " + input); } } \ No newline at end of file