HyperLogLog-tool for figuring out how big the index is.
This commit is contained in:
parent
17226bc4fd
commit
6df02f7528
@ -158,6 +158,8 @@ dependencies {
|
||||
jmh 'org.openjdk.jmh:jmh-core:1.35'
|
||||
jmh 'org.openjdk.jmh:jmh-generator-annprocess:1.35'
|
||||
|
||||
implementation 'net.agkn:hll:1.6.0'
|
||||
|
||||
}
|
||||
|
||||
configurations {
|
||||
|
@ -82,7 +82,8 @@ public enum ServiceDescriptor {
|
||||
new ConvertCommand(),
|
||||
new LoadCommand(),
|
||||
new ReindexCommand(),
|
||||
new VersionCommand()
|
||||
new VersionCommand(),
|
||||
new IndexDataDumpCommand()
|
||||
).collect(Collectors.toMap(c -> c.name, c -> c));
|
||||
|
||||
if(args.length > 0) {
|
||||
|
@ -0,0 +1,24 @@
|
||||
package nu.marginalia.wmsa.configuration.command;
|
||||
|
||||
import lombok.SneakyThrows;
|
||||
import nu.marginalia.wmsa.edge.tools.IndexJournalDumpTool;
|
||||
|
||||
import java.util.Arrays;
|
||||
|
||||
public class IndexDataDumpCommand extends Command {
|
||||
public IndexDataDumpCommand() {
|
||||
super("index-dump");
|
||||
}
|
||||
|
||||
@SneakyThrows
|
||||
@Override
|
||||
public void execute(String... args) {
|
||||
if (args.length < 1) {
|
||||
System.err.println("Usage: index-dump [sub-command] index.dat");
|
||||
System.exit(255);
|
||||
}
|
||||
|
||||
String[] args2 = Arrays.copyOfRange(args, 1, args.length);
|
||||
IndexJournalDumpTool.main(args2);
|
||||
}
|
||||
}
|
@ -1,5 +1,7 @@
|
||||
package nu.marginalia.wmsa.edge.tools;
|
||||
|
||||
import com.google.common.hash.Hashing;
|
||||
import net.agkn.hll.HLL;
|
||||
import nu.marginalia.util.multimap.MultimapFileLong;
|
||||
import nu.marginalia.wmsa.edge.index.journal.SearchIndexJournalReader;
|
||||
|
||||
@ -8,7 +10,36 @@ import java.nio.file.Path;
|
||||
|
||||
public class IndexJournalDumpTool {
|
||||
public static void main(String... args) throws IOException {
|
||||
var reader = new SearchIndexJournalReader(MultimapFileLong.forReading(Path.of(args[0])));
|
||||
final String operation = args.length > 0 ? args[0] : "help";
|
||||
|
||||
switch (operation) {
|
||||
case "dump":
|
||||
dump(Path.of(args[1]));
|
||||
break;
|
||||
case "cardinality":
|
||||
cardinality(Path.of(args[1]));
|
||||
break;
|
||||
default:
|
||||
System.err.println("Usage: dump|cardinality index-file.dat");
|
||||
break;
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
private static void cardinality(Path file) throws IOException {
|
||||
var reader = new SearchIndexJournalReader(MultimapFileLong.forReading(file));
|
||||
HLL hyperloglog = new HLL(30, 1);
|
||||
var hashFunction = Hashing.murmur3_128();
|
||||
|
||||
for (var entry : reader) {
|
||||
hyperloglog.addRaw(hashFunction.hashLong(entry.docId()).padToLong());
|
||||
}
|
||||
|
||||
System.out.println(hyperloglog.cardinality());
|
||||
}
|
||||
|
||||
private static void dump(Path file) throws IOException {
|
||||
var reader = new SearchIndexJournalReader(MultimapFileLong.forReading(file));
|
||||
for (var entry : reader) {
|
||||
System.out.printf("%s\t%010d\t%06d:%08d\n", entry.block(), entry.docId(), entry.domainId(), entry.urlId());
|
||||
}
|
||||
|
@ -36,6 +36,7 @@ class QueryVariantsTest {
|
||||
@Test
|
||||
void getQueryVariants() {
|
||||
System.out.println(se.extractSentence("we are alone"));
|
||||
testCase("inside job reviews");
|
||||
testCase("DOS");
|
||||
testCase("dos");
|
||||
testCase("we are alone");
|
||||
|
Loading…
Reference in New Issue
Block a user