HyperLogLog-tool for figuring out how big the index is.

This commit is contained in:
vlofgren 2022-09-13 18:27:32 +02:00
parent 17226bc4fd
commit 6df02f7528
5 changed files with 61 additions and 2 deletions

View File

@ -158,6 +158,8 @@ dependencies {
jmh 'org.openjdk.jmh:jmh-core:1.35'
jmh 'org.openjdk.jmh:jmh-generator-annprocess:1.35'
implementation 'net.agkn:hll:1.6.0'
}
configurations {

View File

@ -82,7 +82,8 @@ public enum ServiceDescriptor {
new ConvertCommand(),
new LoadCommand(),
new ReindexCommand(),
new VersionCommand()
new VersionCommand(),
new IndexDataDumpCommand()
).collect(Collectors.toMap(c -> c.name, c -> c));
if(args.length > 0) {

View File

@ -0,0 +1,24 @@
package nu.marginalia.wmsa.configuration.command;
import lombok.SneakyThrows;
import nu.marginalia.wmsa.edge.tools.IndexJournalDumpTool;
import java.util.Arrays;
/**
 * Command-line entry registered under the name {@code index-dump}.
 *
 * <p>Drops the first element of the argument array and hands the remainder to
 * {@link IndexJournalDumpTool#main(String...)}.
 */
public class IndexDataDumpCommand extends Command {

    public IndexDataDumpCommand() {
        super("index-dump");
    }

    /**
     * Runs the index journal dump tool with every argument after the first.
     *
     * <p>When called with an empty argument array, prints a usage line to
     * standard error and terminates the JVM with exit status 255.
     *
     * @param args the raw argument array; element 0 is skipped
     *             (presumably it is the command name itself; confirm against the caller)
     */
    @Override
    @SneakyThrows
    public void execute(String... args) {
        final boolean nothingSupplied = args.length == 0;
        if (nothingSupplied) {
            System.err.println("Usage: index-dump [sub-command] index.dat");
            System.exit(255);
        }

        // Everything after the first element is forwarded untouched.
        final String[] toolArguments = Arrays.copyOfRange(args, 1, args.length);
        IndexJournalDumpTool.main(toolArguments);
    }
}

View File

@ -1,5 +1,7 @@
package nu.marginalia.wmsa.edge.tools;
import com.google.common.hash.Hashing;
import net.agkn.hll.HLL;
import nu.marginalia.util.multimap.MultimapFileLong;
import nu.marginalia.wmsa.edge.index.journal.SearchIndexJournalReader;
@ -8,7 +10,36 @@ import java.nio.file.Path;
public class IndexJournalDumpTool {
/**
 * Entry point of the index journal dump tool.
 *
 * <p>Expects two arguments: an operation ({@code dump} or {@code cardinality})
 * followed by the path of the index file to read. If either argument is missing,
 * or the operation is not recognised, a usage line is printed to standard error
 * and the method returns without touching any file.
 *
 * @param args operation name followed by the index file path
 * @throws IOException if the selected operation fails while reading the file
 */
public static void main(String... args) throws IOException {
    final String usage = "Usage: dump|cardinality index-file.dat";

    // Both operations need an operation name AND a file path; checking up front
    // avoids an ArrayIndexOutOfBoundsException on args[0] / args[1].
    if (args.length < 2) {
        System.err.println(usage);
        return;
    }

    // The file is opened by the selected operation itself; nothing is opened here,
    // so an unknown operation never touches the file system.
    switch (args[0]) {
        case "dump":
            dump(Path.of(args[1]));
            break;
        case "cardinality":
            cardinality(Path.of(args[1]));
            break;
        default:
            System.err.println(usage);
            break;
    }
}
/**
 * Prints a HyperLogLog cardinality estimate of the document ids found in the
 * given index journal file.
 *
 * <p>Each entry's {@code docId()} is hashed with 128-bit Murmur3, the hash is
 * reduced to a {@code long}, and that value is fed to the sketch as a raw value.
 * The resulting estimate is written to standard output.
 *
 * @param file path of the index journal file to read
 * @throws IOException declared by the method; presumably raised when the file cannot be opened for reading
 */
private static void cardinality(Path file) throws IOException {
    final var journal = new SearchIndexJournalReader(MultimapFileLong.forReading(file));

    // java-hll constructor arguments are (log2m, regwidth).
    final HLL sketch = new HLL(30, 1);
    final var murmur = Hashing.murmur3_128();

    for (var journalEntry : journal) {
        final long hashedDocId = murmur.hashLong(journalEntry.docId()).padToLong();
        sketch.addRaw(hashedDocId);
    }

    final long estimate = sketch.cardinality();
    System.out.println(estimate);
}
private static void dump(Path file) throws IOException {
var reader = new SearchIndexJournalReader(MultimapFileLong.forReading(file));
for (var entry : reader) {
System.out.printf("%s\t%010d\t%06d:%08d\n", entry.block(), entry.docId(), entry.domainId(), entry.urlId());
}

View File

@ -36,6 +36,7 @@ class QueryVariantsTest {
@Test
void getQueryVariants() {
System.out.println(se.extractSentence("we are alone"));
testCase("inside job reviews");
testCase("DOS");
testCase("dos");
testCase("we are alone");