Refactoring conversion
parent 8ba80931a9
commit 88908c203d
@@ -28,6 +28,7 @@ import java.util.ArrayList;
 import java.util.List;
 
 import static nu.marginalia.wmsa.configuration.ServiceDescriptor.*;
+import static org.junit.jupiter.api.Assertions.assertEquals;
 
 @Tag("e2e")
 @Testcontainers
@@ -156,6 +157,16 @@ public class EdgeSearchE2ETest extends E2ETestBase {
         return wikipediaFiles.toString();
     }
 
+    private List<String> getTitlesFromSearchResults(String html) {
+        List<String> ret = new ArrayList<>();
+
+        for (var title : Jsoup.parse(html).select(".card.search-result > h2")) {
+            ret.add(title.text());
+        }
+
+        return ret;
+    }
+
     @Test
     public void testFrontPage() throws IOException {
         var driver = chrome.getWebDriver();
@@ -173,8 +184,9 @@ public class EdgeSearchE2ETest extends E2ETestBase {
 
         driver.get("http://proxyNginx/search?query=bird&profile=corpo");
         System.out.println(driver.getTitle());
-        System.out.println(driver.findElement(new By.ByXPath("//*")).getAttribute("outerHTML"));
 
+        var html = driver.findElement(new By.ByXPath("//*")).getAttribute("outerHTML");
+        assertEquals(List.of("Bird"), getTitlesFromSearchResults(html));
 
         Files.move(driver.getScreenshotAs(OutputType.FILE).toPath(), screenshotFilename("query"));
     }
@@ -187,20 +199,23 @@ public class EdgeSearchE2ETest extends E2ETestBase {
         System.out.println(driver.getTitle());
         System.out.println(driver.findElement(new By.ByXPath("//*")).getAttribute("outerHTML"));
 
         Files.move(driver.getScreenshotAs(OutputType.FILE).toPath(), screenshotFilename("site-info"));
     }
 
     @Test
     public void testSiteSearch() throws IOException {
         var driver = chrome.getWebDriver();
 
         driver.get("http://proxyNginx/search?query=site:wikipedia.local%20frog");
         System.out.println(driver.getTitle());
-        System.out.println(driver.findElement(new By.ByXPath("//*")).getAttribute("outerHTML"));
+
+        var html = driver.findElement(new By.ByXPath("//*")).getAttribute("outerHTML");
+
+        assertEquals(List.of("Frog", "Binomial nomenclature", "Amphibian", "Mantis"), getTitlesFromSearchResults(html));
 
         Files.move(driver.getScreenshotAs(OutputType.FILE).toPath(), screenshotFilename("site-search"));
     }
 
     @Test
     public void testBrowse() throws IOException {
         var driver = chrome.getWebDriver();
@@ -209,7 +224,6 @@ public class EdgeSearchE2ETest extends E2ETestBase {
         System.out.println(driver.getTitle());
         System.out.println(driver.findElement(new By.ByXPath("//*")).getAttribute("outerHTML"));
 
-
         Files.move(driver.getScreenshotAs(OutputType.FILE).toPath(), screenshotFilename("browse"));
     }
     @Test
@@ -220,7 +234,6 @@ public class EdgeSearchE2ETest extends E2ETestBase {
         System.out.println(driver.getTitle());
         System.out.println(driver.findElement(new By.ByXPath("//*")).getAttribute("outerHTML"));
 
-
         Files.move(driver.getScreenshotAs(OutputType.FILE).toPath(), screenshotFilename("define"));
     }
     @Test
@@ -69,4 +69,4 @@ memex memex
 dating dating
 EOF
 
-WMSA_HOME=${HOME} java -Dsmall-ram=TRUE -Dservice-host=0.0.0.0 -jar /WMSA.jar start $1
+WMSA_HOME=${HOME} java -server -Xmx2G -Dsmall-ram=TRUE -Dservice-host=0.0.0.0 -jar /WMSA.jar start $1
@@ -3,6 +3,7 @@ package nu.marginalia.util.btree;
 import nu.marginalia.util.btree.model.BTreeContext;
 import nu.marginalia.util.btree.model.BTreeHeader;
 import nu.marginalia.util.multimap.MultimapFileLong;
+import nu.marginalia.util.multimap.MultimapFileLongSlice;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
@@ -12,9 +13,9 @@ import java.io.IOException;
 public class BTreeWriter {
     private final Logger logger = LoggerFactory.getLogger(BTreeWriter.class);
     private final BTreeContext ctx;
-    private final MultimapFileLong map;
+    private final MultimapFileLongSlice map;
 
-    public BTreeWriter(MultimapFileLong map, BTreeContext ctx) {
+    public BTreeWriter(MultimapFileLongSlice map, BTreeContext ctx) {
         this.map = map;
         this.ctx = ctx;
     }
@@ -31,13 +32,18 @@ public class BTreeWriter {
         return size;
     }
 
-    public long write(long offset, int numEntries, WriteCallback writeIndex)
+    /** Construct a BTree with numEntries entries at offset in the associated map
+     *
+     * @return The size of the written data
+     */
+    public long write(long offset, int numEntries, WriteCallback writeIndexCallback)
             throws IOException
     {
-        var header = makeHeader(offset, numEntries);
+        BTreeHeader header = makeHeader(offset, numEntries);
 
         header.write(map, offset);
-        writeIndex.write(header.dataOffsetLongs());
+
+        writeIndexCallback.write(map.atOffset(header.dataOffsetLongs()));
 
         if (header.layers() < 1) {
             return ctx.calculateSize(numEntries);
@@ -1,7 +1,9 @@
 package nu.marginalia.util.btree;
 
+import nu.marginalia.util.multimap.MultimapFileLongSlice;
+
 import java.io.IOException;
 
 public interface WriteCallback {
-    void write(long offset) throws IOException;
+    void write(MultimapFileLongSlice slice) throws IOException;
 }
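
Note on the WriteCallback change above: the callback no longer receives a raw long offset into the backing file; BTreeWriter now hands it a MultimapFileLongSlice already positioned at the BTree's data region, so callers write relative to index 0. A minimal usage sketch under the new interface (map, ctx and the key variables are hypothetical stand-ins, not part of this commit):

    // Sketch: write three entries through the slice-based callback.
    // The slice is map.atOffset(header.dataOffsetLongs()), so index 0
    // below is the first long of the BTree's data section.
    BTreeWriter writer = new BTreeWriter(map, ctx);
    long sizeInLongs = writer.write(0, 3, slice -> {
        slice.put(0, firstKey);
        slice.put(1, secondKey);
        slice.put(2, thirdKey);
    });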
@@ -1,6 +1,7 @@
 package nu.marginalia.util.btree.model;
 
 import nu.marginalia.util.multimap.MultimapFileLong;
+import nu.marginalia.util.multimap.MultimapFileLongSlice;
 
 public record BTreeHeader(int layers, int numEntries, long indexOffsetLongs, long dataOffsetLongs) {
     public BTreeHeader {
@@ -28,7 +29,7 @@ public record BTreeHeader(int layers, int numEntries, long indexOffsetLongs, long dataOffsetLongs) {
         return padding;
     }
 
-    public void write(MultimapFileLong dest, long offset) {
+    public void write(MultimapFileLongSlice dest, long offset) {
         dest.put(offset, ((long) layers << 32L) | ((long)numEntries & 0xFFFF_FFFFL));
         dest.put(offset+1, indexOffsetLongs);
         dest.put(offset+2, dataOffsetLongs);
@@ -1,9 +1,7 @@
 package nu.marginalia.util.hash;
 
-import io.prometheus.client.Gauge;
 import lombok.EqualsAndHashCode;
 import lombok.Getter;
-import nu.marginalia.wmsa.edge.index.service.index.wordstable.IndexWordsTable;
 import nu.marginalia.util.multimap.MultimapFileLong;
 import nu.marginalia.util.PrimeUtil;
 import org.slf4j.Logger;
@@ -17,9 +15,7 @@ import static java.lang.Math.round;
  */
 public class LongPairHashMap {
     private static final Logger logger = LoggerFactory.getLogger(LongPairHashMap.class);
-    private static final Gauge probe_count_metrics
-            = Gauge.build("wmsa_wordfile_hash_map_probe_count", "Probing Count")
-            .register();
+    private static final long MAGIC_WORD = 0xE00E00E00E0E0E0EL; // it's the data police
 
     private final long hashTableSize;
     private final MultimapFileLong data;
@@ -27,26 +23,37 @@ public class LongPairHashMap {
     private int sz = 0;
     private static final int HEADER_SIZE = 2;
 
-    public LongPairHashMap(MultimapFileLong data, long size) {
+    private LongPairHashMap(MultimapFileLong data, long hashTableSize, long maxProbeLength) {
         this.data = data;
-        // Actually use a prime size for Donald Knuth reasons
-        hashTableSize = PrimeUtil.nextPrime(size, 1);
-        maxProbeLength = hashTableSize / 2;
-
-        logger.debug("Table size = " + hashTableSize);
-
-        data.put(0, IndexWordsTable.Strategy.HASH.ordinal());
-        data.put(1, hashTableSize);
-        for (int i = 2; i < hashTableSize; i++) {
+        this.hashTableSize = hashTableSize;
+        this.maxProbeLength = maxProbeLength;
+    }
+
+    public static LongPairHashMap createNew(MultimapFileLong data, long size) {
+        var tableSize = PrimeUtil.nextPrime(size, 1);
+        var ret = new LongPairHashMap(data, tableSize, tableSize/2);
+
+        data.put(0, MAGIC_WORD);
+        data.put(1, tableSize);
+
+        for (int i = 2; i < tableSize; i++) {
             data.put(HEADER_SIZE + 2L*i, 0);
         }
-    }
-    public LongPairHashMap(MultimapFileLong data) {
-        this.data = data;
-        hashTableSize = data.get(1);
-        maxProbeLength = hashTableSize / 10;
 
-        logger.debug("Table size = " + hashTableSize);
+        return ret;
+    }
+
+    public static LongPairHashMap loadExisting(MultimapFileLong data) {
+        long key = data.get(0);
+
+        if (key != MAGIC_WORD) {
+            logger.warn("LongPairHashMap lacks magic word, could this be garbage data?");
+        }
+
+        var hashTableSize = data.get(1);
+        var maxProbeLength = hashTableSize / 10;
+
+        return new LongPairHashMap(data, hashTableSize, maxProbeLength);
     }
 
     public int size() {
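
The constructor pair is replaced with named factory methods, which makes create-vs-open explicit at the call site and lets loadExisting() check the magic word before trusting the header. A rough usage sketch (the backing MultimapFileLong setup is assumed, not shown in this commit):

    // Create a fresh map; writes MAGIC_WORD and the prime-rounded table size.
    var created = LongPairHashMap.createNew(mmf, 1_000_000);

    // Re-open an existing file; logs a warning if MAGIC_WORD is absent.
    var loaded = LongPairHashMap.loadExisting(mmf);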
@@ -91,8 +98,6 @@ public class LongPairHashMap {
             final var val = getCell(idx);
 
             if (!val.isSet()) {
-                probe_count_metrics.set(j);
-
                 return setValue(data, idx);
             }
             else if (val.getKey() == data.getKey()) {
@@ -21,7 +21,7 @@ import static java.nio.channels.FileChannel.MapMode.READ_WRITE;
 import static nu.marginalia.util.FileSizeUtil.readableSize;
 
 
-public class MultimapFileLong implements AutoCloseable {
+public class MultimapFileLong implements AutoCloseable, MultimapFileLongSlice {
 
     private final ArrayList<LongBuffer> buffers = new ArrayList<>();
     private final ArrayList<MappedByteBuffer> mappedByteBuffers = new ArrayList<>();
@@ -196,10 +196,12 @@ public class MultimapFileLong implements AutoCloseable, MultimapFileLongSlice {
         }
     }
 
+    @Override
     public long size() {
         return fileLength;
     }
 
+    @Override
     public void put(long idx, long val) {
         if (idx >= mappedSize)
             grow(idx);
@@ -214,6 +216,7 @@ public class MultimapFileLong implements AutoCloseable, MultimapFileLongSlice {
         }
     }
 
+    @Override
     public long get(long idx) {
         if (idx >= mappedSize)
             grow(idx);
@@ -229,10 +232,12 @@ public class MultimapFileLong implements AutoCloseable, MultimapFileLongSlice {
     }
 
 
+    @Override
     public void read(long[] vals, long idx) {
         read(vals, vals.length, idx);
     }
 
+    @Override
     public void read(long[] vals, int n, long idx) {
         if (idx+n >= mappedSize) {
             grow(idx+n);
@@ -257,10 +262,12 @@ public class MultimapFileLong implements AutoCloseable, MultimapFileLongSlice {
 
     }
 
+    @Override
     public void write(long[] vals, long idx) {
         write(vals, vals.length, idx);
     }
 
+    @Override
     public void write(long[] vals, int n, long idx) {
         if (idx+n >= mappedSize) {
             grow(idx+n);
@@ -285,6 +292,7 @@ public class MultimapFileLong implements AutoCloseable, MultimapFileLongSlice {
 
     }
 
+    @Override
     public void write(LongBuffer vals, long idx) {
         int n = vals.limit() - vals.position();
         if (idx+n >= mappedSize) {
@@ -310,6 +318,7 @@ public class MultimapFileLong implements AutoCloseable, MultimapFileLongSlice {
     }
 
 
+    @Override
     public void transferFromFileChannel(FileChannel sourceChannel, long destOffset, long sourceStart, long sourceEnd) throws IOException {
 
         int length = (int)(sourceEnd - sourceStart);
@@ -0,0 +1,70 @@
+package nu.marginalia.util.multimap;
+
+import java.io.IOException;
+import java.nio.LongBuffer;
+import java.nio.channels.FileChannel;
+
+public class MultimapFileLongOffsetSlice implements MultimapFileLongSlice {
+    private final long off;
+    private final MultimapFileLongSlice map;
+
+    public MultimapFileLongOffsetSlice(MultimapFileLongSlice map, long off) {
+        this.off = off;
+        this.map = map;
+    }
+
+    @Override
+    public long size() {
+        return map.size() - off;
+    }
+
+    @Override
+    public void put(long idx, long val) {
+        map.put(off+idx, val);
+    }
+
+    @Override
+    public long get(long idx) {
+        return map.get(off+idx);
+    }
+
+    @Override
+    public void read(long[] vals, long idx) {
+        map.read(vals, idx+off);
+    }
+
+    @Override
+    public void read(long[] vals, int n, long idx) {
+        map.read(vals, n, idx+off);
+    }
+
+    @Override
+    public void write(long[] vals, long idx) {
+        map.write(vals, idx+off);
+    }
+
+    @Override
+    public void write(long[] vals, int n, long idx) {
+        map.write(vals, n, idx+off);
+    }
+
+    @Override
+    public void write(LongBuffer vals, long idx) {
+        map.write(vals, idx+off);
+    }
+
+    @Override
+    public void transferFromFileChannel(FileChannel sourceChannel, long destOffset, long sourceStart, long sourceEnd)
+            throws IOException {
+        map.transferFromFileChannel(sourceChannel, destOffset + off, sourceStart, sourceEnd);
+    }
+
+    @Override
+    public MultimapFileLongSlice atOffset(long off) {
+        // If we don't override this, the default implementation would build a pyramid of
+        // MultimapFileLongSlice(MultimapFileLongSlice(MultimapFileLongSlice(...)))
+        // if this is called iteratively (e.g. to walk over a file)
+
+        return new MultimapFileLongOffsetSlice(map, this.off + off);
+    }
+}
@@ -0,0 +1,29 @@
+package nu.marginalia.util.multimap;
+
+import java.io.IOException;
+import java.nio.LongBuffer;
+import java.nio.channels.FileChannel;
+
+public interface MultimapFileLongSlice {
+    long size();
+
+    void put(long idx, long val);
+
+    long get(long idx);
+
+    void read(long[] vals, long idx);
+
+    void read(long[] vals, int n, long idx);
+
+    void write(long[] vals, long idx);
+
+    void write(long[] vals, int n, long idx);
+
+    void write(LongBuffer vals, long idx);
+
+    void transferFromFileChannel(FileChannel sourceChannel, long destOffset, long sourceStart, long sourceEnd) throws IOException;
+
+    default MultimapFileLongSlice atOffset(long off) {
+        return new MultimapFileLongOffsetSlice(this, off);
+    }
+}
@@ -4,9 +4,9 @@ import lombok.experimental.Delegate;
 
 public class MultimapSearcher {
     @Delegate
-    private final MultimapFileLong mmf;
+    private final MultimapFileLongSlice mmf;
 
-    public MultimapSearcher(MultimapFileLong mmf) {
+    public MultimapSearcher(MultimapFileLongSlice mmf) {
         this.mmf = mmf;
     }
 
@@ -13,10 +13,10 @@ import static nu.marginalia.util.multimap.MultimapFileLong.WORD_SIZE;
 public class MultimapSorter {
     private final Path tmpFileDir;
     private final int internalSortLimit;
-    private final MultimapFileLong multimapFileLong;
+    private final MultimapFileLongSlice multimapFileLong;
     private final long[] buffer;
 
-    public MultimapSorter(MultimapFileLong multimapFileLong, Path tmpFileDir, int internalSortLimit) {
+    public MultimapSorter(MultimapFileLongSlice multimapFileLong, Path tmpFileDir, int internalSortLimit) {
         this.multimapFileLong = multimapFileLong;
         this.tmpFileDir = tmpFileDir;
         this.internalSortLimit = internalSortLimit;
@@ -48,7 +48,7 @@ public class SqlLoadProcessedDocument {
                         IN STATE VARCHAR(32))
                     BEGIN
                         UPDATE EC_URL SET VISITED=1, STATE=STATE WHERE ID=URL_ID;
-                        DELETE FROM PAGE_DATA WHERE ID=URL_ID;
+                        DELETE FROM EC_PAGE_DATA WHERE ID=URL_ID;
                     END
                     """);
 
@@ -135,7 +135,7 @@ public class EdgeDataStoreDaoImpl implements EdgeDataStoreDao {
         final Set<BrowseResult> domains = new HashSet<>(count*3);
 
         final String q = """
-                SELECT EC_DOMAIN.ID AS NEIGHBOR_ID, URL_PART, COUNT(*) AS CNT
+                SELECT EC_DOMAIN.ID AS NEIGHBOR_ID, DOMAIN_NAME, COUNT(*) AS CNT
                 FROM EC_DOMAIN_NEIGHBORS
                 INNER JOIN EC_DOMAIN ON NEIGHBOR_ID=EC_DOMAIN.ID
                 INNER JOIN DOMAIN_METADATA ON EC_DOMAIN.ID=DOMAIN_METADATA.ID
@@ -169,7 +169,7 @@ public class EdgeDataStoreDaoImpl implements EdgeDataStoreDao {
 
         if (domains.size() < count/2) {
             final String q2 = """
-                    SELECT EC_DOMAIN.ID, URL_PART
+                    SELECT EC_DOMAIN.ID, DOMAIN_NAME
                     FROM EC_DOMAIN
                     INNER JOIN DOMAIN_METADATA ON EC_DOMAIN.ID=DOMAIN_METADATA.ID
                     INNER JOIN EC_DOMAIN_LINK B ON DEST_DOMAIN_ID=EC_DOMAIN.ID
@@ -199,11 +199,11 @@ public class EdgeDataStoreDaoImpl implements EdgeDataStoreDao {
 
         if (domains.size() < count/2) {
             final String q3 = """
-                    SELECT EC_DOMAIN.ID, URL_PART
+                    SELECT EC_DOMAIN.ID, DOMAIN_NAME
                     FROM EC_DOMAIN
                     INNER JOIN DOMAIN_METADATA ON EC_DOMAIN.ID=DOMAIN_METADATA.ID
                    INNER JOIN EC_DOMAIN_LINK B ON B.SOURCE_DOMAIN_ID=EC_DOMAIN.ID
                    INNER JOIN EC_DOMAIN_LINK O ON O.DEST_DOMAIN_ID=EC_DOMAIN.ID
                    WHERE B.DEST_DOMAIN_ID=?
                    AND STATE<2
                    AND KNOWN_URLS<1000
@@ -1,13 +1,13 @@
-package nu.marginalia.wmsa.edge.index.radix;
+package nu.marginalia.wmsa.edge.index;
 
 import nu.marginalia.wmsa.edge.index.EdgeIndexControl;
 import nu.marginalia.wmsa.edge.index.IndexServicesFactory;
 import nu.marginalia.wmsa.edge.index.model.EdgeIndexSearchTerms;
 import nu.marginalia.wmsa.edge.index.model.IndexBlock;
-import nu.marginalia.wmsa.edge.index.service.index.SearchIndexReader;
-import nu.marginalia.wmsa.edge.index.service.index.SearchIndexWriter;
-import nu.marginalia.wmsa.edge.index.service.query.IndexSearchBudget;
-import nu.marginalia.wmsa.edge.index.service.query.Query;
+import nu.marginalia.wmsa.edge.index.reader.SearchIndexReader;
+import nu.marginalia.wmsa.edge.index.journal.SearchIndexWriter;
+import nu.marginalia.wmsa.edge.index.reader.query.IndexSearchBudget;
+import nu.marginalia.wmsa.edge.index.reader.query.Query;
 import org.jetbrains.annotations.NotNull;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
@ -3,7 +3,9 @@ package nu.marginalia.wmsa.edge.index;
|
|||||||
|
|
||||||
import com.google.inject.Inject;
|
import com.google.inject.Inject;
|
||||||
import nu.marginalia.wmsa.edge.index.model.IndexBlock;
|
import nu.marginalia.wmsa.edge.index.model.IndexBlock;
|
||||||
import nu.marginalia.wmsa.edge.index.service.index.ConversionUnnecessaryException;
|
import nu.marginalia.wmsa.edge.index.conversion.ConversionUnnecessaryException;
|
||||||
|
|
||||||
|
import java.io.IOException;
|
||||||
|
|
||||||
|
|
||||||
public class EdgeIndexControl {
|
public class EdgeIndexControl {
|
||||||
@@ -27,7 +29,10 @@ public class EdgeIndexControl {
                 System.gc();
             }
             catch (ConversionUnnecessaryException unnecessary) {
+                // swallow quietly
+            }
+            catch (IOException e) {
+                e.printStackTrace();
             }
         }
 
@@ -15,9 +15,9 @@ import nu.marginalia.wmsa.configuration.server.Initialization;
 import nu.marginalia.wmsa.configuration.server.MetricsServer;
 import nu.marginalia.wmsa.configuration.server.Service;
 import nu.marginalia.wmsa.edge.index.model.*;
-import nu.marginalia.wmsa.edge.index.service.SearchIndexes;
-import nu.marginalia.wmsa.edge.index.service.index.SearchIndexWriterImpl;
-import nu.marginalia.wmsa.edge.index.service.query.IndexSearchBudget;
+import nu.marginalia.wmsa.edge.index.reader.SearchIndexes;
+import nu.marginalia.wmsa.edge.index.journal.SearchIndexWriterImpl;
+import nu.marginalia.wmsa.edge.index.reader.query.IndexSearchBudget;
 import nu.marginalia.util.dict.DictionaryHashMap;
 import nu.marginalia.wmsa.edge.model.*;
 import nu.marginalia.wmsa.edge.model.crawl.EdgePageWordSet;
@@ -5,12 +5,16 @@ import com.google.inject.Singleton;
 import com.google.inject.name.Named;
 import lombok.SneakyThrows;
 import nu.marginalia.wmsa.edge.data.dao.task.EdgeDomainBlacklist;
+import nu.marginalia.wmsa.edge.index.conversion.ConversionUnnecessaryException;
+import nu.marginalia.wmsa.edge.index.conversion.SearchIndexConverter;
+import nu.marginalia.wmsa.edge.index.conversion.SearchIndexPreconverter;
+import nu.marginalia.wmsa.edge.index.journal.SearchIndexWriterImpl;
 import nu.marginalia.wmsa.edge.index.model.IndexBlock;
-import nu.marginalia.wmsa.edge.index.radix.EdgeIndexBucket;
 import nu.marginalia.wmsa.edge.index.dictionary.DictionaryReader;
 import nu.marginalia.wmsa.edge.index.dictionary.DictionaryWriter;
-import nu.marginalia.wmsa.edge.index.service.index.*;
-import nu.marginalia.wmsa.edge.index.service.query.SearchIndexPartitioner;
+import nu.marginalia.wmsa.edge.index.reader.SearchIndex;
+import nu.marginalia.wmsa.edge.index.reader.SearchIndexReader;
+import nu.marginalia.wmsa.edge.index.conversion.SearchIndexPartitioner;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
@@ -89,7 +93,7 @@ public class IndexServicesFactory {
 
     }
 
-    public SearchIndexConverter getIndexConverter(int id, IndexBlock block) throws ConversionUnnecessaryException {
+    public SearchIndexConverter getIndexConverter(int id, IndexBlock block) throws ConversionUnnecessaryException, IOException {
         return new SearchIndexConverter(block, id, tmpFileDir,
                 preconverterOutputFile.get(id),
                 indexWriteWordsFile.get(id, block.id),
@@ -1,4 +1,4 @@
-package nu.marginalia.wmsa.edge.index.service.index;
+package nu.marginalia.wmsa.edge.index.conversion;
 
 public class ConversionUnnecessaryException extends Exception {
     public ConversionUnnecessaryException() {
@@ -1,4 +1,4 @@
-package nu.marginalia.wmsa.edge.index.service;
+package nu.marginalia.wmsa.edge.index.conversion;
 
 import gnu.trove.list.TIntList;
 import gnu.trove.map.hash.TIntIntHashMap;
@@ -1,4 +1,4 @@
-package nu.marginalia.wmsa.edge.index.service.index;
+package nu.marginalia.wmsa.edge.index.conversion;
 
 import com.google.inject.Inject;
 import com.google.inject.name.Named;
@@ -6,9 +6,10 @@ import gnu.trove.set.hash.TIntHashSet;
 import lombok.RequiredArgsConstructor;
 import lombok.SneakyThrows;
 import nu.marginalia.wmsa.edge.data.dao.task.EdgeDomainBlacklist;
+import nu.marginalia.wmsa.edge.index.conversion.words.WordIndexOffsetsTable;
+import nu.marginalia.wmsa.edge.index.journal.SearchIndexWriterImpl;
 import nu.marginalia.wmsa.edge.index.model.IndexBlock;
-import nu.marginalia.wmsa.edge.index.service.index.wordstable.WordsTableWriter;
-import nu.marginalia.wmsa.edge.index.service.query.SearchIndexPartitioner;
+import nu.marginalia.wmsa.edge.index.conversion.words.WordsTableWriter;
 import nu.marginalia.util.btree.BTreeWriter;
 import nu.marginalia.util.btree.model.BTreeContext;
 import nu.marginalia.util.multimap.MultimapFileLong;
@@ -32,18 +33,24 @@ public class SearchIndexConverter {
 
     private final long fileLength;
     private final long urlsFileSize;
+    private final Path tmpFileDir;
+
     private final FileChannel urlsTmpFileChannel;
     private final int wordCount;
     private final MultimapFileLong urlsTmpFileMap;
     private final Logger logger = LoggerFactory.getLogger(getClass());
     private final IndexBlock block;
     private final int bucketId;
-    @org.jetbrains.annotations.NotNull
+
     private final File urlsFile;
     private final SearchIndexPartitioner partitioner;
     private final TIntHashSet spamDomains;
     private final MultimapSorter urlTmpFileSorter;
 
+    private final static int internalSortLimit =
+            Boolean.getBoolean("small-ram") ? 1024*1024 : 1024*1024*256;
+
     @SneakyThrows
     public static long wordCount(File inputFile) {
         try (RandomAccessFile raf = new RandomAccessFile(inputFile, "r")) {
@@ -52,7 +59,6 @@ public class SearchIndexConverter {
         }
     }
 
-    @SneakyThrows
     @Inject
     public SearchIndexConverter(IndexBlock block,
                                 int bucketId, @Named("tmp-file-dir") Path tmpFileDir,
@@ -61,13 +67,15 @@ public class SearchIndexConverter {
                                 @Named("edge-index-write-urls-file") File outputFileUrls,
                                 SearchIndexPartitioner partitioner,
                                 EdgeDomainBlacklist blacklist)
-            throws ConversionUnnecessaryException
+            throws ConversionUnnecessaryException, IOException
     {
         this.block = block;
         this.bucketId = bucketId;
-        urlsFile = outputFileUrls;
+        this.tmpFileDir = tmpFileDir;
+        this.urlsFile = outputFileUrls;
         this.partitioner = partitioner;
         this.spamDomains = blacklist.getSpamDomains();
 
         logger.info("Converting {} ({}) {}", block.id, block, inputFile);
 
         Files.deleteIfExists(outputFileWords.toPath());
@@ -89,18 +97,16 @@ public class SearchIndexConverter {
         urlsFileSize = getUrlsSize(buffer, inputChannel);
 
         var tmpUrlsFile = Files.createTempFile(tmpFileDir, "urls-sorted", ".dat");
-
-
         var urlsTmpFileRaf = new RandomAccessFile(tmpUrlsFile.toFile(), "rw");
         urlsTmpFileChannel = urlsTmpFileRaf.getChannel();
         urlsTmpFileMap = new MultimapFileLong(urlsTmpFileRaf, FileChannel.MapMode.READ_WRITE, urlsFileSize, 8*1024*1024, false);
-        urlTmpFileSorter = urlsTmpFileMap.createSorter(tmpFileDir, 1024*1024*256);
+        urlTmpFileSorter = urlsTmpFileMap.createSorter(tmpFileDir, internalSortLimit);
 
         logger.info("Creating word index table {} for block {} ({})", outputFileWords, block.id, block);
-        long[] wordIndexTable = createWordIndexTable(outputFileWords, inputChannel);
+        WordIndexOffsetsTable wordIndexTable = createWordIndexTable(outputFileWords, inputChannel);
 
         logger.info("Creating word urls table {} for block {} ({})", outputFileUrls, block.id, block);
-        createUrlTable(tmpFileDir, buffer, raf, wordIndexTable);
+        createUrlTable(buffer, raf, wordIndexTable);
 
         Files.delete(tmpUrlsFile);
         raf.close();
@@ -140,99 +146,69 @@ public class SearchIndexConverter {
         return reader.size;
     }
 
-    private void createUrlTable(Path tmpFileDir, ByteBuffer buffer, RandomAccessFile raf, long[] wordIndexTable) throws IOException {
-        logger.debug("Table size = {}", wordIndexTable.length);
-        int[] wordIndex = new int[wordIndexTable.length];
+    private void createUrlTable(ByteBuffer buffer, RandomAccessFile raf, WordIndexOffsetsTable wordOffsetsTable) throws IOException {
+        logger.info("Table size = {}", wordOffsetsTable.length());
+
         raf.seek(FILE_HEADER_SIZE);
 
         var channel = raf.getChannel();
 
         try (RandomWriteFunnel rwf = new RandomWriteFunnel(tmpFileDir, urlsFileSize, 10_000_000)) {
-            var reader = new IndexReader(buffer, channel) {
+            int[] wordWriteOffset = new int[wordOffsetsTable.length()];
+
+            new IndexReader(buffer, channel) {
                 @Override
                 public void eachWord(long urlId, int wordId) throws IOException {
-                    if (wordId >= wordIndex.length)
+                    if (wordId >= wordWriteOffset.length)
                         return;
 
-                    if (wordId != 0) {
-                        if (!(wordIndexTable[wordId - 1] + wordIndex[wordId] <= wordIndexTable[wordId])) {
-                            logger.error("Crazy state: wordId={}, index={}, lower={}, upper={}",
-                                    wordId,
-                                    wordIndex[wordId],
-                                    wordIndexTable[wordId - 1],
-                                    wordIndexTable[wordId]);
-                            throw new IllegalStateException();
-                        }
-                    }
                     if (wordId > 0) {
-                        rwf.put(wordIndexTable[wordId - 1] + wordIndex[wordId]++, translateUrl(urlId));
+                        rwf.put(wordOffsetsTable.get(wordId - 1) + wordWriteOffset[wordId]++, translateUrl(urlId));
                     } else {
-                        rwf.put(wordIndex[wordId]++, translateUrl(urlId));
+                        rwf.put(wordWriteOffset[wordId]++, translateUrl(urlId));
                     }
                 }
-            };
-
-            reader.read();
+            }.read();
 
             rwf.write(urlsTmpFileChannel);
         }
 
         urlsTmpFileChannel.force(false);
+        logger.info("URL TMP Table: {} Mb", channel.position()/(1024*1024));
 
-        logger.debug("URL TMP Table: {} Mb", channel.position()/(1024*1024));
-
-        if (wordIndexTable.length > 0) {
-            logger.debug("Sorting urls table");
-            sortUrls(wordIndexTable);
+        if (wordOffsetsTable.length() > 0) {
+            logger.info("Sorting urls table");
+
+            wordOffsetsTable.forEach(urlTmpFileSorter::sort);
+
             urlsTmpFileMap.force();
         }
        else {
            logger.warn("urls table empty -- nothing to sort");
        }
 
-        long idx = 0;
-
+        logger.info("Writing BTree");
        try (var urlsFileMap = MultimapFileLong.forOutput(urlsFile.toPath(), 1024)) {
            var writer = new BTreeWriter(urlsFileMap, urlsBTreeContext);
 
-            if (wordIndexTable[0] != 0) {
-                int start = 0;
-                int end = (int) wordIndexTable[0];
-
-                idx += writer.write(idx, (int) wordIndexTable[0],
-                        offset -> urlsFileMap.transferFromFileChannel(urlsTmpFileChannel, offset, start, end));
-            }
-
-            for (int i = 1; i < wordIndexTable.length; i++) {
-                if (wordIndexTable[i] != wordIndexTable[i - 1]) {
-                    long start = wordIndexTable[i-1];
-                    long end = wordIndexTable[i];
-
-                    idx += writer.write(idx, (int) (end-start),
-                            offset -> urlsFileMap.transferFromFileChannel(urlsTmpFileChannel, offset, start, end));
-                }
-            }
+            wordOffsetsTable.fold((accumulatorIdx, start, length) -> {
+                // Note: The return value is accumulated into accumulatorIdx!
+                return writer.write(accumulatorIdx, length,
+                        slice -> slice.transferFromFileChannel(urlsTmpFileChannel, 0, start, start + length));
+            });
        } catch (Exception e) {
-            e.printStackTrace();
+            logger.error("Error while writing BTree", e);
        }
    }
 
-    @SneakyThrows
-    private void sortUrls(long[] wordIndices) {
-        urlTmpFileSorter.sort( 0, (int) wordIndices[0]);
-
-        for (int i = 1; i < wordIndices.length; i++) {
-            urlTmpFileSorter.sort(wordIndices[i-1], (int) (wordIndices[i] - wordIndices[i-1]));
-        }
-    }
-
-    private long[] createWordIndexTable(File outputFileWords, FileChannel inputChannel) throws Exception {
+    private WordIndexOffsetsTable createWordIndexTable(File outputFileWords, FileChannel inputChannel) throws IOException {
        inputChannel.position(FILE_HEADER_SIZE);
 
        logger.debug("Table size = {}", wordCount);
        WordsTableWriter wordsTableWriter = new WordsTableWriter(wordCount);
-        ByteBuffer buffer = ByteBuffer.allocateDirect(8*SearchIndexWriterImpl.MAX_BLOCK_SIZE);
+        ByteBuffer buffer = ByteBuffer.allocateDirect(8* SearchIndexWriterImpl.MAX_BLOCK_SIZE);
 
        logger.debug("Reading words");
 
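
The fold() call above replaces the manual `idx +=` bookkeeping: each lambda invocation returns the size of the BTree it just wrote, and fold threads the running total back in as accumulatorIdx, which becomes the next write offset. Schematically (names as in the hunk above):

    long totalLongsWritten = wordOffsetsTable.fold((accumulatorIdx, start, length) ->
            writer.write(accumulatorIdx, length,
                    slice -> slice.transferFromFileChannel(urlsTmpFileChannel, 0, start, start + length)));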
@@ -1,4 +1,4 @@
-package nu.marginalia.wmsa.edge.index.service;
+package nu.marginalia.wmsa.edge.index.conversion;
 
 import com.google.inject.Inject;
 import com.google.inject.Singleton;
@@ -1,11 +1,9 @@
-package nu.marginalia.wmsa.edge.index.service.query;
+package nu.marginalia.wmsa.edge.index.conversion;
 
 import com.google.inject.Inject;
 import com.google.inject.Singleton;
 import gnu.trove.set.hash.TIntHashSet;
 import lombok.SneakyThrows;
-import nu.marginalia.wmsa.edge.index.service.SearchEngineRanking;
-import nu.marginalia.wmsa.edge.index.service.SearchIndexDao;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
@@ -1,10 +1,9 @@
-package nu.marginalia.wmsa.edge.index.service.index;
+package nu.marginalia.wmsa.edge.index.conversion;
 
 import com.google.inject.Inject;
 import gnu.trove.set.hash.TIntHashSet;
 import lombok.SneakyThrows;
 import nu.marginalia.wmsa.edge.data.dao.task.EdgeDomainBlacklist;
-import nu.marginalia.wmsa.edge.index.service.query.SearchIndexPartitioner;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
@@ -0,0 +1,10 @@
+package nu.marginalia.wmsa.edge.index.conversion.words;
+
+public class WordIndexLengthsTable {
+    final long[] table;
+
+    public WordIndexLengthsTable(int size) {
+        this.table = new long[size];
+    }
+    public void increment(int idx) { table[idx]++; }
+}
@@ -0,0 +1,67 @@
+package nu.marginalia.wmsa.edge.index.conversion.words;
+
+import java.io.IOException;
+
+public class WordIndexOffsetsTable {
+    final long[] table;
+    public final int numberOfUsedWords;
+
+    public WordIndexOffsetsTable(long[] table, int numberOfUsedWords) {
+
+        this.table = table;
+        this.numberOfUsedWords = numberOfUsedWords;
+    }
+
+    public int length() {
+        return table.length;
+    }
+
+    public void forEach(OffsetTableEntryConsumer o) throws IOException {
+        if (table[0] > 0) {
+            o.accept(0, (int) table[0]);
+        }
+
+        for (int i = 1; i < table.length; i++) {
+            long start = table[i-1];
+            int length = (int) (table[i] - start);
+
+            if (length != 0) {
+                o.accept(start, length);
+            }
+        }
+    }
+
+    /**
+     * Fold over each span in the file, left to right
+     */
+    public long fold(OffsetTableEntryFoldConsumer o) throws IOException {
+        long total = 0;
+
+        if (table[0] > 0) {
+            total = o.accept(total,0, (int) table[0]);
+        }
+
+        for (int i = 1; i < table.length; i++) {
+            long start = table[i-1];
+            int length = (int) (table[i] - start);
+
+            if (length != 0) {
+                total += o.accept(total, start, length);
+            }
+        }
+
+        return total;
+    }
+
+    public long get(int i) {
+        return table[i];
+    }
+
+    public interface OffsetTableEntryConsumer {
+        void accept(long start, int length) throws IOException;
+    }
+
+    public interface OffsetTableEntryFoldConsumer {
+        long accept(long accumulator, long start, int length) throws IOException;
+    }
+}
@@ -0,0 +1,56 @@
+package nu.marginalia.wmsa.edge.index.conversion.words;
+
+/** Contains a stateful table of word index offsets, initially in lengths mode
+ * where the table contains how many postings exist for each word; then in offsets
+ * mode, where the lengths are converted into the necessary offsets for each block
+ * of document data.
+ *
+ * Caveat! This uses the same underlying array to conserve space.
+ */
+public class WordIndexTables {
+    private WordIndexLengthsTable lengthsTable;
+    private WordIndexOffsetsTable offsetsTable;
+
+    private boolean converted = false;
+
+    public WordIndexTables(int size) {
+        lengthsTable = new WordIndexLengthsTable(size);
+    }
+
+    public WordIndexLengthsTable lengths() {
+        if (converted) throw new IllegalStateException("Table has been converted");
+
+        return lengthsTable;
+    }
+
+    public WordIndexOffsetsTable offsets() {
+        if (!converted) throw new IllegalStateException("Table has not been converted");
+
+        return offsetsTable;
+    }
+
+    public void convert() {
+        if (converted) throw new IllegalStateException("Table has been converted");
+
+        // Go from lengths to offsets, i.e.
+        // BEFORE: 1, 2, 1, 3, 0, 2
+        // AFTER:  1, 3, 4, 7, 7, 9
+
+        long[] table = lengthsTable.table;
+        int numberOfUsedWords = 0;
+
+        if (table[0] != 0) numberOfUsedWords = 1;
+
+        for (int i = 1; i < table.length; i++) {
+            if (table[i] != 0) {
+                numberOfUsedWords++;
+            }
+            table[i] += table[i-1];
+        }
+
+        lengthsTable = null;
+        offsetsTable = new WordIndexOffsetsTable(table, numberOfUsedWords);
+        converted = true;
+    }
+}
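
convert() is an in-place prefix sum over the shared array, as the comment's example shows: lengths {1, 2, 1, 3, 0, 2} become offsets {1, 3, 4, 7, 7, 9}, so word i's postings occupy [table[i-1], table[i]) with word 0 starting at zero. A round-trip sketch (the counts are made up for illustration):

    var tables = new WordIndexTables(3);
    tables.lengths().increment(0);       // one posting for word 0
    tables.lengths().increment(1);       // two postings for word 1
    tables.lengths().increment(1);
    tables.convert();                    // {1, 2, 0} -> {1, 3, 3}, in place
    long end = tables.offsets().get(1);  // == 3
    // tables.lengths() would now throw IllegalStateException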
@@ -0,0 +1,75 @@
+package nu.marginalia.wmsa.edge.index.conversion.words;
+
+import nu.marginalia.util.btree.BTreeWriter;
+import nu.marginalia.util.btree.model.BTreeContext;
+import nu.marginalia.util.multimap.MultimapFileLong;
+import nu.marginalia.util.multimap.MultimapFileLongSlice;
+import nu.marginalia.wmsa.edge.index.reader.IndexWordsTable;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.io.*;
+
+import static nu.marginalia.wmsa.edge.index.conversion.SearchIndexConverter.urlsBTreeContext;
+
+public class WordsTableWriter {
+    private final WordIndexTables table;
+    private final Logger logger = LoggerFactory.getLogger(getClass());
+
+    public static final BTreeContext wordsBTreeContext = new BTreeContext(7, 2, 0x0000_0000_FFFF_FFFFL, 8);
+
+    public WordsTableWriter(int length) {
+        table = new WordIndexTables(length);
+    }
+
+    public void acceptWord(int wordId) {
+        table.lengths().increment(wordId);
+    }
+
+    public WordIndexOffsetsTable getTable() {
+        return table.offsets();
+    }
+
+    public void write(File file) throws IOException {
+        table.convert();
+
+        logger.info("Writing table - {} max", table.offsets().numberOfUsedWords);
+
+        final int tableSize = table.offsets().numberOfUsedWords;
+
+        try (var mmf = MultimapFileLong.forOutput(file.toPath(), tableSize/8L)) {
+            mmf.put(0, IndexWordsTable.Strategy.BTREE.ordinal());
+            long offset = 1;
+
+            var writer = new BTreeWriter(mmf, wordsBTreeContext);
+
+            writer.write(offset, tableSize, this::writeBTreeBlock);
+        }
+    }
+
+    private void writeBTreeBlock(MultimapFileLongSlice mapSlice) {
+        long urlFileOffset = 0;
+        int idx = 0;
+
+        var offsetTable = table.offsets().table;
+
+        if (offsetTable[0] != 0) {
+            int length = (int) offsetTable[0];
+            mapSlice.put(idx++, (long)length<<32);
+            mapSlice.put(idx++, 0);
+
+            urlFileOffset += (urlsBTreeContext.calculateSize(length));
+        }
+
+        for (int i = 1; i < offsetTable.length; i++) {
+            final int length = (int)(offsetTable[i] - offsetTable[i-1]);
+
+            if (length > 0) {
+                mapSlice.put(idx++, (long)length << 32 | i);
+                mapSlice.put(idx++, urlFileOffset);
+
+                urlFileOffset += (urlsBTreeContext.calculateSize(length));
+            }
+        }
+    }
+}
@@ -1,4 +1,4 @@
-package nu.marginalia.wmsa.edge.index.service.index;
+package nu.marginalia.wmsa.edge.index.journal;
 
 import nu.marginalia.wmsa.edge.index.model.IndexBlock;
 import nu.marginalia.wmsa.edge.model.EdgeDomain;
@@ -1,4 +1,4 @@
-package nu.marginalia.wmsa.edge.index.service.index;
+package nu.marginalia.wmsa.edge.index.journal;
 
 import io.reactivex.rxjava3.disposables.Disposable;
 import io.reactivex.rxjava3.schedulers.Schedulers;
@@ -1,36 +1,80 @@
-package nu.marginalia.wmsa.edge.index.service.index.wordstable;
+package nu.marginalia.wmsa.edge.index.reader;
 
 import com.upserve.uppend.blobs.NativeIO;
 import nu.marginalia.util.btree.BTreeReader;
 import nu.marginalia.util.btree.model.BTreeHeader;
 import nu.marginalia.util.multimap.MultimapFileLong;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.io.IOException;
+import java.io.RandomAccessFile;
+import java.nio.channels.FileChannel;
 import java.util.function.LongConsumer;
 
-import static nu.marginalia.wmsa.edge.index.service.index.wordstable.WordsTableWriter.wordsBTreeContext;
+import static nu.marginalia.wmsa.edge.index.conversion.words.WordsTableWriter.wordsBTreeContext;
 
-public class BtreeWordsTable extends IndexWordsTable{
-    private final MultimapFileLong words;
-    private final BTreeReader reader;
-    private final BTreeHeader header;
-    private final int HEADER_OFFSET = 1;
+public class IndexWordsTable implements AutoCloseable {
+    protected final MultimapFileLong words;
+    protected final BTreeReader reader;
+    protected final BTreeHeader header;
+    protected final int HEADER_OFFSET = 1;
+    final Logger logger = LoggerFactory.getLogger(getClass());
 
-    public BtreeWordsTable(MultimapFileLong words) {
+    private static final int BUFFER_SIZE = 1024*1024*64;
+
+    public IndexWordsTable(MultimapFileLong words) {
         this.words = words;
 
         reader = new BTreeReader(words, wordsBTreeContext);
         header = reader.getHeader(HEADER_OFFSET);
 
         madvise();
     }
 
-    private void madvise() {
+    public static IndexWordsTable ofFile(RandomAccessFile file) throws IOException {
+        var wordsFile = openWordsFile(file);
+        long signature = wordsFile.get(0);
+
+        if (signature == Strategy.BTREE.ordinal()) {
+            return new IndexWordsTable(wordsFile);
+        }
+
+        throw new IllegalArgumentException("Unknown signature " + signature);
+    }
+
+    private static MultimapFileLong openWordsFile(RandomAccessFile wordsFile) throws IOException {
+        return new MultimapFileLong(wordsFile,
+                FileChannel.MapMode.READ_ONLY, wordsFile.length(), BUFFER_SIZE, false);
+    }
+
+    public long positionForWord(int wordId) {
+
+        long offset = reader.offsetForEntry(header, wordId);
+        if (offset < 0) {
+            return -1L;
+        }
+
+        return words.get(offset+1);
+    }
+
+    public int wordLength(int wordId) {
+
+        long offset = reader.offsetForEntry(header, wordId);
+        if (offset < 0) {
+            return -1;
+        }
+
+        return (int)(words.get(offset) >> 32);
+    }
+
+    protected void madvise() {
         words.advice(NativeIO.Advice.Random);
         words.advice0(NativeIO.Advice.WillNeed);
 
         var h = reader.getHeader(HEADER_OFFSET);
         int length = (int)(h.dataOffsetLongs() - h.indexOffsetLongs());
 
         words.adviceRange(NativeIO.Advice.WillNeed, h.indexOffsetLongs(), length);
         words.pokeRange(h.indexOffsetLongs(), length);
     }
@ -58,31 +102,13 @@ public class BtreeWordsTable extends IndexWordsTable{
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
|
||||||
public long positionForWord(int wordId) {
|
|
||||||
|
|
||||||
long offset = reader.offsetForEntry(header, wordId);
|
|
||||||
if (offset < 0) {
|
|
||||||
return -1L;
|
|
||||||
}
|
|
||||||
|
|
||||||
return words.get(offset+1);
|
|
||||||
}
|
|
||||||
|
|
||||||
@Override
|
|
||||||
public int wordLength(int wordId) {
|
|
||||||
|
|
||||||
long offset = reader.offsetForEntry(header, wordId);
|
|
||||||
if (offset < 0) {
|
|
||||||
return -1;
|
|
||||||
}
|
|
||||||
|
|
||||||
return (int)(words.get(offset) >> 32);
|
|
||||||
}
|
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public void close() throws Exception {
|
public void close() throws Exception {
|
||||||
words.close();
|
words.close();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
public enum Strategy {
|
||||||
|
BTREE
|
||||||
|
}
|
||||||
|
|
||||||
}
|
}
|
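Reviewer note: this hunk folds the old abstract IndexWordsTable and its BtreeWordsTable subclass into a single concrete class, so callers now go through the static ofFile factory instead of picking a subclass. A minimal usage sketch of the consolidated API as it appears above (the file path and word id are hypothetical; the words file itself is produced by the index conversion step):

    import nu.marginalia.wmsa.edge.index.reader.IndexWordsTable;

    import java.io.RandomAccessFile;

    class WordsTableSketch {
        public static void main(String[] args) throws Exception {
            // "words.dat" is a placeholder path for a converter-produced words file
            try (var table = IndexWordsTable.ofFile(new RandomAccessFile("words.dat", "r"))) {
                int wordId = 42; // hypothetical dictionary id
                long pos = table.positionForWord(wordId); // -1L when the word is absent
                int len  = table.wordLength(wordId);      // entry count, -1 when absent
                System.out.println("offset=" + pos + " length=" + len);
            }
        }
    }

Since only the BTREE strategy survives (the enum above shrinks to a single constant), ofFile now rejects any other signature word at offset 0.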
@@ -1,20 +1,18 @@
-package nu.marginalia.wmsa.edge.index.service.index;
+package nu.marginalia.wmsa.edge.index.reader;
 
 import com.google.inject.Inject;
 import com.google.inject.name.Named;
 import com.upserve.uppend.blobs.NativeIO;
 import io.reactivex.rxjava3.schedulers.Schedulers;
-import nu.marginalia.wmsa.edge.index.service.index.wordstable.IndexWordsTable;
+import nu.marginalia.wmsa.edge.index.conversion.SearchIndexConverter;
 import nu.marginalia.util.btree.BTreeReader;
 import nu.marginalia.util.multimap.MultimapFileLong;
-import org.eclipse.jetty.util.thread.ThreadPool;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
 import java.io.File;
 import java.io.IOException;
 import java.io.RandomAccessFile;
-import java.util.concurrent.ForkJoinPool;
 import java.util.stream.LongStream;
 
 public class SearchIndex implements AutoCloseable {
@@ -1,13 +1,13 @@
-package nu.marginalia.wmsa.edge.index.service.index;
+package nu.marginalia.wmsa.edge.index.reader;
 
 import com.google.common.cache.Cache;
 import com.google.common.cache.CacheBuilder;
 import com.google.inject.Inject;
 import lombok.SneakyThrows;
 import nu.marginalia.wmsa.edge.index.model.IndexBlock;
-import nu.marginalia.wmsa.edge.index.service.query.IndexQueryBuilder;
-import nu.marginalia.wmsa.edge.index.service.query.IndexSearchBudget;
-import nu.marginalia.wmsa.edge.index.service.query.Query;
+import nu.marginalia.wmsa.edge.index.reader.query.IndexQueryBuilder;
+import nu.marginalia.wmsa.edge.index.reader.query.IndexSearchBudget;
+import nu.marginalia.wmsa.edge.index.reader.query.Query;
 import org.apache.commons.lang3.tuple.Pair;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;

@@ -105,10 +105,8 @@ public class SearchIndexReader implements AutoCloseable {
                         .mapToLong(idx -> idx.numUrls(word))
                         .sum()
         );
-
     }
-
 
     public IndexBlock getBlockForResult(int searchTerm, long urlId) {
         for (var block : indicesBySearchOrder) {
             var index = indices.get(block);
@@ -1,13 +1,13 @@
-package nu.marginalia.wmsa.edge.index.service;
+package nu.marginalia.wmsa.edge.index.reader;
 
 import com.google.inject.Inject;
 import com.google.inject.Singleton;
 import nu.marginalia.wmsa.configuration.server.Initialization;
 import nu.marginalia.wmsa.edge.index.IndexServicesFactory;
-import nu.marginalia.wmsa.edge.index.radix.EdgeIndexBucket;
+import nu.marginalia.wmsa.edge.index.conversion.SearchIndexPartitioner;
+import nu.marginalia.wmsa.edge.index.EdgeIndexBucket;
 import nu.marginalia.wmsa.edge.index.dictionary.DictionaryReader;
-import nu.marginalia.wmsa.edge.index.service.index.SearchIndexWriterImpl;
-import nu.marginalia.wmsa.edge.index.service.query.SearchIndexPartitioner;
+import nu.marginalia.wmsa.edge.index.journal.SearchIndexWriterImpl;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
@@ -1,7 +1,7 @@
-package nu.marginalia.wmsa.edge.index.service.query;
+package nu.marginalia.wmsa.edge.index.reader.query;
 
 import com.google.common.collect.Streams;
-import nu.marginalia.wmsa.edge.index.service.index.SearchIndex;
+import nu.marginalia.wmsa.edge.index.reader.SearchIndex;
 
 import java.util.Collection;
 import java.util.List;
@@ -1,4 +1,4 @@
-package nu.marginalia.wmsa.edge.index.service.query;
+package nu.marginalia.wmsa.edge.index.reader.query;
 
 
 public class IndexSearchBudget {
@@ -1,4 +1,4 @@
-package nu.marginalia.wmsa.edge.index.service.query;
+package nu.marginalia.wmsa.edge.index.reader.query;
 
 import java.util.stream.LongStream;
 
@@ -1,6 +0,0 @@
-package nu.marginalia.wmsa.edge.index.service;
-
-public enum SearchOrder {
-    ASCENDING,
-    REVERSED
-}
@@ -1,48 +0,0 @@
-package nu.marginalia.wmsa.edge.index.service.index.wordstable;
-
-import nu.marginalia.util.multimap.MultimapFileLong;
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
-
-import java.io.IOException;
-import java.io.RandomAccessFile;
-import java.nio.channels.FileChannel;
-import java.util.function.LongConsumer;
-
-public abstract class IndexWordsTable implements AutoCloseable {
-    final Logger logger = LoggerFactory.getLogger(getClass());
-
-    private static final int BUFFER_SIZE = 1024*1024*64;
-
-    public static IndexWordsTable ofFile(RandomAccessFile file) throws IOException {
-        var wordsFile = openWordsFile(file);
-        long signature = wordsFile.get(0);
-
-        if (signature == Strategy.BTREE.ordinal()) {
-            return new BtreeWordsTable(wordsFile);
-        }
-        throw new IllegalArgumentException("Unknown signature " + signature);
-    }
-
-    private static MultimapFileLong openWordsFile(RandomAccessFile wordsFile) throws IOException {
-        return new MultimapFileLong(wordsFile,
-                FileChannel.MapMode.READ_ONLY, wordsFile.length(), BUFFER_SIZE, false);
-    }
-
-    public abstract long positionForWord(int wordId);
-
-    public abstract int wordLength(int wordId);
-    public abstract void forEachWordsOffset(LongConsumer offsetConsumer);
-
-    @Override
-    public void close() throws Exception {
-
-    }
-
-    public record TableWordRange(long start, long end) {}
-
-    public enum Strategy {
-        FLAT, HASH, BTREE_OLD, BTREE
-    }
-
-}
@@ -1,85 +0,0 @@
-package nu.marginalia.wmsa.edge.index.service.index.wordstable;
-
-import nu.marginalia.util.btree.BTreeWriter;
-import nu.marginalia.util.btree.model.BTreeContext;
-import nu.marginalia.util.multimap.MultimapFileLong;
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
-
-import java.io.*;
-
-import static nu.marginalia.wmsa.edge.index.service.index.SearchIndexConverter.urlsBTreeContext;
-
-public class WordsTableWriter {
-    private final long[] table;
-    private final Logger logger = LoggerFactory.getLogger(getClass());
-
-    public static final BTreeContext wordsBTreeContext = new BTreeContext(7, 2, 0x0000_0000_FFFF_FFFFL, 8);
-
-    public WordsTableWriter(int length) {
-        table = new long[length];
-    }
-
-    public void acceptWord(int wordId) {
-        if (wordId >= table.length) {
-            logger.warn("Invalid word-id {}", wordId);
-        }
-        else {
-            table[wordId]++;
-        }
-    }
-
-    public long[] getTable() {
-        return table;
-    }
-    public void write(File file) throws Exception {
-
-        int tableSize = 0;
-
-        if (table[0] != 0) tableSize = 1;
-
-        for (int i = 1; i < table.length; i++) {
-            if (table[i] != 0) {
-                tableSize++;
-            }
-            table[i] += table[i-1];
-        }
-
-        logger.info("Writing table {} words {} max", tableSize, table.length);
-
-        writeBtreeWordsFile(file, table, tableSize);
-
-    }
-
-    private void writeBtreeWordsFile(File outputFileWords, long[] table, int tableSize) throws Exception {
-        try (var mmf = MultimapFileLong.forOutput(outputFileWords.toPath(), tableSize/8L)) {
-            mmf.put(0, IndexWordsTable.Strategy.BTREE.ordinal());
-            long offset = 1;
-
-            var writer = new BTreeWriter(mmf, wordsBTreeContext);
-
-            writer.write(offset, tableSize, (idx) -> {
-                long urlFileOffset = 0;
-
-                if (table[0] != 0) {
-                    int length = (int) table[0];
-                    mmf.put(idx++, (long)length<<32);
-                    mmf.put(idx++, 0);
-
-                    urlFileOffset += (urlsBTreeContext.calculateSize(length));
-                }
-
-                for (int i = 1; i < table.length; i++) {
-                    if (table[i] != table[i - 1]) {
-                        int length = (int)(table[i] - table[i-1]);
-                        mmf.put(idx++, (long)length << 32 | i);
-                        mmf.put(idx++, urlFileOffset);
-
-                        urlFileOffset += (urlsBTreeContext.calculateSize(length));
-                    }
-                }
-            });
-        }
-    }
-
-}
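Reviewer note: the writer deleted here is not gone; per the updated static import in the reader above, it now lives under nu.marginalia.wmsa.edge.index.conversion.words. Its core trick is the in-place prefix sum in write(): per-word occurrence counts become cumulative end offsets, so word i occurred at all iff table[i] != table[i-1], and the difference is its entry count. A standalone sketch of just that step, with made-up counts:

    import java.util.Arrays;

    class PrefixSumSketch {
        public static void main(String[] args) {
            long[] table = {3, 0, 2, 5}; // hypothetical per-word counts

            // Same loop as WordsTableWriter.write(): turn counts into running totals
            for (int i = 1; i < table.length; i++) {
                table[i] += table[i - 1];
            }

            // Now {3, 3, 5, 10}: word 1 never occurred (no change from word 0),
            // word 2 owns 2 entries ending at 5, word 3 owns 5 entries ending at 10
            System.out.println(Arrays.toString(table));
        }
    }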
@@ -2,7 +2,6 @@ package nu.marginalia.wmsa.edge.model.search;
 
 import lombok.*;
 import nu.marginalia.wmsa.edge.index.model.IndexBlock;
-import nu.marginalia.wmsa.edge.index.service.SearchOrder;
 
 import java.util.Arrays;
 import java.util.Collections;

@@ -21,14 +20,13 @@ public class EdgeSearchSpecification {
     public final int limitTotal;
 
     public final String humanQuery;
-    public final SearchOrder searchOrder;
     public boolean stagger;
     public boolean experimental;
 
     public static EdgeSearchSpecification justIncludes(String... words) {
         return new EdgeSearchSpecification(
                 IntStream.range(0, DYNAMIC_BUCKET_LENGTH+1).boxed().toList(),
-                Collections.singletonList(new EdgeSearchSubquery(Arrays.asList(words), Collections.emptyList(), IndexBlock.Title)), 10, 10, 10, "", SearchOrder.ASCENDING, false, false);
+                Collections.singletonList(new EdgeSearchSubquery(Arrays.asList(words), Collections.emptyList(), IndexBlock.Title)), 10, 10, 10, "", false, false);
     }
 
 }
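Reviewer note: with searchOrder removed from the specification, every direct constructor call loses one positional argument, as the justIncludes factory above shows. A minimal sketch of the updated call shape (the printed field is only there to make the snippet self-contained):

    import nu.marginalia.wmsa.edge.model.search.EdgeSearchSpecification;

    class SpecSketch {
        public static void main(String[] args) {
            // Constructor arity drops by one: the SearchOrder.ASCENDING argument is gone,
            // leaving ..., 10, 10, 10, "", false, false as in justIncludes above.
            EdgeSearchSpecification spec = EdgeSearchSpecification.justIncludes("bird");
            System.out.println(spec.humanQuery); // "" in this factory
        }
    }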
@@ -11,7 +11,6 @@ import nu.marginalia.wmsa.edge.assistant.dict.WikiArticles;
 import nu.marginalia.wmsa.edge.data.dao.EdgeDataStoreDao;
 import nu.marginalia.wmsa.edge.index.client.EdgeIndexClient;
 import nu.marginalia.wmsa.edge.index.model.IndexBlock;
-import nu.marginalia.wmsa.edge.index.service.SearchOrder;
 import nu.marginalia.wmsa.edge.model.*;
 import nu.marginalia.wmsa.edge.model.search.*;
 import nu.marginalia.wmsa.edge.search.model.DecoratedSearchResultSet;

@@ -136,7 +135,7 @@ public class EdgeSearchOperator {
 
         sqs.add(new EdgeSearchSubquery(Arrays.asList(termsInclude), Collections.emptyList(), block));
 
-        EdgeSearchSpecification specs = new EdgeSearchSpecification(profile.buckets, sqs, 100, limitPerDomain, limitTotal, "", SearchOrder.ASCENDING, EdgeSearchProfile.YOLO.equals(profile), false);
+        EdgeSearchSpecification specs = new EdgeSearchSpecification(profile.buckets, sqs, 100, limitPerDomain, limitTotal, "", EdgeSearchProfile.YOLO.equals(profile), false);
 
         return performQuery(ctx, new EdgeSearchQuery(specs), true);
     }
@@ -1,7 +1,6 @@
 package nu.marginalia.wmsa.edge.search;
 
 import nu.marginalia.wmsa.edge.index.model.IndexBlock;
-import nu.marginalia.wmsa.edge.index.service.SearchOrder;
 
 import java.util.Arrays;
 import java.util.Collections;

@@ -9,27 +8,27 @@ import java.util.List;
 import java.util.stream.Collectors;
 
 public enum EdgeSearchProfile {
-    DEFAULT("default", SearchOrder.ASCENDING,
+    DEFAULT("default",
             Collections.emptyList(),
             List.of(IndexBlock.TitleKeywords, IndexBlock.Title, IndexBlock.Top, IndexBlock.Middle, IndexBlock.Low, IndexBlock.Link, IndexBlock.Words, IndexBlock.NamesWords),
             0, 1),
-    MODERN("modern", SearchOrder.ASCENDING,
+    MODERN("modern",
             Collections.emptyList(),
             List.of(IndexBlock.TitleKeywords, IndexBlock.Title, IndexBlock.Top, IndexBlock.Middle, IndexBlock.Low, IndexBlock.Link, IndexBlock.Words, IndexBlock.NamesWords),
             2),
-    CORPO("corpo", SearchOrder.ASCENDING,
+    CORPO("corpo",
            Collections.emptyList(),
            List.of(IndexBlock.TitleKeywords, IndexBlock.Title, IndexBlock.Top, IndexBlock.Middle, IndexBlock.Low, IndexBlock.Link, IndexBlock.Words, IndexBlock.NamesWords),
            4, 5, 6, 7),
-    YOLO("yolo", SearchOrder.ASCENDING,
+    YOLO("yolo",
            Collections.emptyList(),
            List.of(IndexBlock.TitleKeywords, IndexBlock.Title, IndexBlock.Top, IndexBlock.Middle, IndexBlock.Low, IndexBlock.Link, IndexBlock.Words, IndexBlock.NamesWords),
            0, 2, 1, 3, 4, 6),
-    CORPO_CLEAN("corpo-clean", SearchOrder.ASCENDING,
+    CORPO_CLEAN("corpo-clean",
            Collections.emptyList(),
            List.of(IndexBlock.TitleKeywords, IndexBlock.Title, IndexBlock.Top, IndexBlock.Middle, IndexBlock.Low, IndexBlock.Link, IndexBlock.Words, IndexBlock.NamesWords),
            4, 5),
-    ACADEMIA("academia", SearchOrder.ASCENDING,
+    ACADEMIA("academia",
            Collections.emptyList(),
            List.of(IndexBlock.TitleKeywords, IndexBlock.Title, IndexBlock.Top, IndexBlock.Middle, IndexBlock.Low, IndexBlock.Link, IndexBlock.Words, IndexBlock.NamesWords),
            3),

@@ -37,17 +36,15 @@ public enum EdgeSearchProfile {
 
 
     public final String name;
-    public final SearchOrder order;
     public final List<String> additionalSearchTerm;
    public final List<Integer> buckets;
     public final List<IndexBlock> indexBlocks;
 
-    EdgeSearchProfile(String name, SearchOrder order,
+    EdgeSearchProfile(String name,
                       List<String> additionalSearchTerm,
                       List<IndexBlock> indexBlocks,
                       int... buckets) {
         this.name = name;
-        this.order = order;
         this.additionalSearchTerm = additionalSearchTerm;
         this.indexBlocks = indexBlocks;
         this.buckets = Arrays.stream(buckets).boxed().collect(Collectors.toList());
@@ -32,7 +32,7 @@ import java.util.regex.Pattern;
 public class SiteSearchCommand implements SearchCommandInterface {
     private final EdgeDataStoreDao dataStoreDao;
     private final EdgeSearchOperator searchOperator;
-    private DomainInformationService domainInformationService;
+    private final DomainInformationService domainInformationService;
     private final Logger logger = LoggerFactory.getLogger(getClass());
 
     private final MustacheRenderer<DomainInformation> siteInfoRenderer;

@@ -91,7 +91,7 @@ public class SiteSearchCommand implements SearchCommandInterface {
 
         logger.info("Fetching Site Info: {}", word);
         var results = domainInformationService.domainInfo(word)
-                .orElseGet(() -> new DomainInformation(null, false, 0, 0, 0, 0, 0, 0, 0, EdgeDomainIndexingState.UNKNOWN, Collections.emptyList()));
+                .orElseGet(() -> new DomainInformation(null, false, 0, 0, 0, 0, 0, 0, EdgeDomainIndexingState.UNKNOWN, Collections.emptyList()));
 
         logger.debug("Results = {}", results);
 
@@ -18,7 +18,6 @@ public class DomainInformation {
     int pagesIndexed;
     int incomingLinks;
     int outboundLinks;
-    double nominalQuality;
     double ranking;
 
     EdgeDomainIndexingState state;
@@ -138,7 +138,6 @@ public class QueryFactory {
                 .subqueries(subqueries)
                 .limitByBucket(50)
                 .limitTotal(100)
-                .searchOrder(profile.order)
                 .humanQuery(query)
                 .buckets(profile.buckets);
 
@@ -57,10 +57,9 @@ public class DomainInformationService {
         int outboundLinks = getOutboundLinks(domainId);
         double rank = Math.round(10000.0*(1.0-getRank(domainId)))/100;
         EdgeDomainIndexingState state = getDomainState(domainId);
-        double nominalQuality = Math.round(100*100*Math.exp(getDomainQuality(domainId)))/100.;
         List<EdgeDomain> linkingDomains = getLinkingDomains(domainId);
 
-        return Optional.of(new DomainInformation(domain, blacklisted, pagesKnown, pagesVisited, pagesIndexed, incomingLinks, outboundLinks, nominalQuality, rank, state, linkingDomains));
+        return Optional.of(new DomainInformation(domain, blacklisted, pagesKnown, pagesVisited, pagesIndexed, incomingLinks, outboundLinks, rank, state, linkingDomains));
     }
 
     private EdgeId<EdgeDomain> getDomainFromPartial(String site) {
@@ -8,8 +8,8 @@ import nu.marginalia.wmsa.configuration.module.DatabaseModule;
 import nu.marginalia.wmsa.edge.data.dao.task.EdgeDomainBlacklist;
 import nu.marginalia.wmsa.edge.data.dao.task.EdgeDomainBlacklistImpl;
 import nu.marginalia.wmsa.edge.index.model.RankingSettings;
-import nu.marginalia.wmsa.edge.index.service.SearchIndexDao;
-import nu.marginalia.wmsa.edge.index.service.query.SearchIndexPartitioner;
+import nu.marginalia.wmsa.edge.index.conversion.SearchIndexDao;
+import nu.marginalia.wmsa.edge.index.conversion.SearchIndexPartitioner;
 import org.mariadb.jdbc.Driver;
 import org.roaringbitmap.longlong.Roaring64Bitmap;
 import org.slf4j.Logger;
@@ -10,5 +10,4 @@ Pages Known: {{pagesKnown}}
 Pages Indexed: {{pagesKnown}}
 Inbound Links: {{inboundLinks}}
 Outbound Links: {{outboundLinks}}
-Nominal Quality: {{nominalQuality}}%
 Crawl Ranking: {{ranking}}%
@@ -37,7 +37,6 @@
 <div class="card info">
     <h2>Links</h2>
     <p class="description">
-        Nominal Quality: {{nominalQuality}}%<br/>
         Crawl Ranking: {{ranking}}%<br/>
         Incoming Links: {{incomingLinks}} <br/>
         Outbound Links: {{outboundLinks}} <br/>
@@ -90,10 +90,10 @@ class BTreeWriterTest {
 
         {
             var writer = new BTreeWriter(mmf, ctx);
-            writer.write(0, toPut.size(), (offset) -> {
+            writer.write(0, toPut.size(), (slice) -> {
                 for (int i = 0; i < data.length; i++) {
-                    mmf.put(offset + 2L*i, data[i]);
-                    mmf.put(offset + 2L*i + 1, i);
+                    slice.put(2L*i, data[i]);
+                    slice.put( 2L*i + 1, i);
                 }
             });
             mmf.force();

@@ -133,10 +133,10 @@ class BTreeWriterTest {
 
         {
             var writer = new BTreeWriter(mmf, ctx);
-            writer.write( 0, toPut.size(), (offset) -> {
+            writer.write( 0, toPut.size(), (slice) -> {
                 for (int i = 0; i < data.length; i++) {
-                    mmf.put(offset + 2L*i, data[i]);
-                    mmf.put(offset + 2L*i + 1, i);
+                    slice.put(2L*i, data[i]);
+                    slice.put(2L*i + 1, i);
                 }
             });
             mmf.force();

@@ -182,9 +182,9 @@ class BTreeWriterTest {
         try (MultimapFileLong mmf = MultimapFileLong.forOutput(tempFile, 1000)) {
             {
                 var writer = new BTreeWriter(mmf, ctx);
-                writer.write(0, toPut.size(), (offset) -> {
+                writer.write(0, toPut.size(), (slice) -> {
                     for (int i = 0; i < data.length; i++) {
-                        mmf.put(offset + i, data[i]);
+                        slice.put(i, data[i]);
                     }
                 });
                 mmf.force();

@@ -235,9 +235,9 @@ class BTreeWriterTest {
         try (MultimapFileLong mmf = MultimapFileLong.forOutput(tempFile, 1000)) {
             {
                 var writer = new BTreeWriter(mmf, ctx);
-                writer.write(0, toPut.size(), (offset) -> {
+                writer.write(0, toPut.size(), (slice) -> {
                     for (int i = 0; i < data.length; i++) {
-                        mmf.put(offset + i, data[i]);
+                        slice.put(i, data[i]);
                     }
                 });
                 mmf.force();

@@ -288,10 +288,10 @@ class BTreeWriterTest {
         try (MultimapFileLong mmf = MultimapFileLong.forOutput(tempFile, 1000)) {
             {
                 var writer = new BTreeWriter(mmf, ctx);
-                writer.write(0, toPut.size(), (offset) -> {
+                writer.write(0, toPut.size(), (slice) -> {
                     for (int i = 0; i < data.length; i++) {
-                        mmf.put(offset + i*2L, data[i]);
-                        mmf.put(offset + i*2L+1, i);
+                        slice.put(i*2L, data[i]);
+                        slice.put(i*2L+1, i);
                     }
                 });
                 mmf.force();
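Reviewer note: all five test hunks make the same mechanical change, mirroring an API shift in BTreeWriter: the write callback no longer receives an absolute offset into the backing file but a zero-based slice over the entry's data region. Keeping the base-offset arithmetic inside the writer means call sites can no longer write outside their allotted range by botching the addition. A condensed before/after sketch of one call site, assuming mmf, ctx, toPut and data are set up as in the tests:

    // Old callback: absolute offset, caller does the arithmetic
    writer.write(0, toPut.size(), (offset) -> {
        for (int i = 0; i < data.length; i++) {
            mmf.put(offset + 2L*i,     data[i]);
            mmf.put(offset + 2L*i + 1, i);
        }
    });

    // New callback: zero-based slice, writer owns the base offset
    writer.write(0, toPut.size(), (slice) -> {
        for (int i = 0; i < data.length; i++) {
            slice.put(2L*i,     data[i]);
            slice.put(2L*i + 1, i);
        }
    });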
@@ -27,7 +27,7 @@ class LongPairHashMapTest {
         try {
             RandomAccessFile raf = new RandomAccessFile(tempFile.toFile(), "rw");
             MultimapFileLong mmf = new MultimapFileLong(raf, FileChannel.MapMode.READ_WRITE, 10000, 1000, true);
-            var lphm = new LongPairHashMap(mmf, 1024);
+            var lphm = LongPairHashMap.createNew(mmf, 1024);
             toPut.forEach(i -> {
                 lphm.put(new LongPairHashMap.CellData(i, i));
             });

@@ -36,7 +36,7 @@ class LongPairHashMapTest {
 
             RandomAccessFile raf2 = new RandomAccessFile(tempFile.toFile(), "rw");
             MultimapFileLong mmf2 = new MultimapFileLong(raf2, FileChannel.MapMode.READ_WRITE, 10000, 1000, true);
-            var lphm2 = new LongPairHashMap(mmf2);
+            var lphm2 = LongPairHashMap.loadExisting(mmf2);
             toPut.forEach(i -> {
                 Assertions.assertTrue(lphm2.get(i).isSet());
                 Assertions.assertEquals(i, (int) lphm2.get(i).getKey());
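Reviewer note: replacing the LongPairHashMap constructors with named factories makes the create-versus-reopen intent explicit at the call site, which the two hunks above exercise in sequence. A condensed sketch of the round trip, assuming the same MultimapFileLong setup as the test:

    // First pass: create a fresh map over the memory-mapped file
    var lphm = LongPairHashMap.createNew(mmf, 1024);
    lphm.put(new LongPairHashMap.CellData(5, 5));

    // Second pass, after re-mapping the file: attach to the existing table
    var lphm2 = LongPairHashMap.loadExisting(mmf2);
    assert lphm2.get(5).isSet();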
@@ -1,11 +1,11 @@
 package nu.marginalia.wmsa.edge.index.service;
 
 import lombok.SneakyThrows;
+import nu.marginalia.wmsa.edge.index.conversion.SearchIndexPartitioner;
 import nu.marginalia.wmsa.edge.index.model.IndexBlock;
 import nu.marginalia.wmsa.edge.index.dictionary.DictionaryReader;
 import nu.marginalia.wmsa.edge.index.dictionary.DictionaryWriter;
-import nu.marginalia.wmsa.edge.index.service.index.SearchIndexConverter;
-import nu.marginalia.wmsa.edge.index.service.query.SearchIndexPartitioner;
+import nu.marginalia.wmsa.edge.index.conversion.SearchIndexConverter;
 import org.junit.jupiter.api.Disabled;
 import org.junit.jupiter.api.Test;
 
@@ -3,14 +3,14 @@ package nu.marginalia.wmsa.edge.index.service;
 import com.zaxxer.hikari.HikariDataSource;
 import lombok.SneakyThrows;
 import nu.marginalia.util.TestUtil;
-import nu.marginalia.wmsa.client.exception.RemoteException;
 import nu.marginalia.wmsa.configuration.server.Context;
 import nu.marginalia.wmsa.configuration.server.Initialization;
 import nu.marginalia.wmsa.edge.index.EdgeIndexService;
 import nu.marginalia.wmsa.edge.index.IndexServicesFactory;
 import nu.marginalia.wmsa.edge.index.client.EdgeIndexClient;
+import nu.marginalia.wmsa.edge.index.conversion.SearchIndexPartitioner;
 import nu.marginalia.wmsa.edge.index.model.IndexBlock;
-import nu.marginalia.wmsa.edge.index.service.query.SearchIndexPartitioner;
+import nu.marginalia.wmsa.edge.index.reader.SearchIndexes;
 import nu.marginalia.wmsa.edge.model.crawl.EdgePageWordSet;
 import nu.marginalia.wmsa.edge.model.search.EdgeSearchSpecification;
 import nu.marginalia.wmsa.edge.model.EdgeId;

@@ -23,7 +23,6 @@ import org.junit.jupiter.api.parallel.ResourceAccessMode;
 import org.junit.jupiter.api.parallel.ResourceLock;
 import spark.Spark;
 
-import java.io.File;
 import java.nio.file.Files;
 import java.nio.file.Path;
 import java.util.Arrays;

@@ -31,7 +30,6 @@ import java.util.List;
 import java.util.stream.Collectors;
 
 import static nu.marginalia.util.TestUtil.getConnection;
-import static nu.marginalia.wmsa.edge.index.EdgeIndexService.DYNAMIC_BUCKET_LENGTH;
 import static org.junit.jupiter.api.Assertions.assertEquals;
 import static org.junit.jupiter.api.Assertions.assertTrue;
 
@@ -1,14 +1,14 @@
 package nu.marginalia.wmsa.edge.index.service;
 
 import lombok.SneakyThrows;
+import nu.marginalia.wmsa.edge.index.conversion.SearchIndexPartitioner;
 import nu.marginalia.wmsa.edge.index.model.IndexBlock;
 import nu.marginalia.wmsa.edge.index.dictionary.DictionaryWriter;
-import nu.marginalia.wmsa.edge.index.service.index.SearchIndex;
-import nu.marginalia.wmsa.edge.index.service.index.SearchIndexConverter;
-import nu.marginalia.wmsa.edge.index.service.index.SearchIndexReader;
-import nu.marginalia.wmsa.edge.index.service.index.SearchIndexWriterImpl;
-import nu.marginalia.wmsa.edge.index.service.query.IndexSearchBudget;
-import nu.marginalia.wmsa.edge.index.service.query.SearchIndexPartitioner;
+import nu.marginalia.wmsa.edge.index.reader.SearchIndex;
+import nu.marginalia.wmsa.edge.index.conversion.SearchIndexConverter;
+import nu.marginalia.wmsa.edge.index.reader.SearchIndexReader;
+import nu.marginalia.wmsa.edge.index.journal.SearchIndexWriterImpl;
+import nu.marginalia.wmsa.edge.index.reader.query.IndexSearchBudget;
 import nu.marginalia.wmsa.edge.model.EdgeId;
 import org.junit.jupiter.api.AfterEach;
 import org.junit.jupiter.api.BeforeEach;
@@ -13,6 +13,7 @@ class QueryVariantsTest {
     QueryVariants variants;
     QueryParser parser;
     SentenceExtractor se;
+
     @BeforeEach
     public void setUp() {
         LanguageModels lm = TestLanguageModels.getLanguageModels();

@@ -24,7 +25,7 @@ class QueryVariantsTest {
         parser = new QueryParser(new EnglishDictionary(dict), variants);
     }
 
-    @Test
+    @Test @SuppressWarnings("unchecked")
     void getQueryVariants() {
         System.out.println(se.extractSentence("we are alone"));
         testCase("DOS", List.of("DOS"));

@@ -50,7 +51,5 @@ class QueryVariantsTest {
     private void testCase(String input, List<String>... expected) {
         var tokens = variants.getQueryVariants(parser.extractBasicTokens(input));
         System.out.println(tokens);
-//        var result = tokens.stream().map(lst -> lst.terms).collect(Collectors.toSet());
-//        assertEquals(Set.of(expected), result, "Case failed: " + input);
     }
 }