(keyword lexicon) Use three hash tables to raise the maximum possible number of keywords from 0.75 × 2^30 to 2^31.

This commit is contained in:
Viktor Lofgren 2023-08-23 11:25:20 +02:00
parent bf92c270dc
commit 1a05cba60a
2 changed files with 39 additions and 7 deletions

View File

@ -3,27 +3,59 @@ package nu.marginalia.dict;
import it.unimi.dsi.fastutil.longs.Long2IntOpenHashMap; import it.unimi.dsi.fastutil.longs.Long2IntOpenHashMap;
public class OnHeapDictionaryMap implements DictionaryMap { public class OnHeapDictionaryMap implements DictionaryMap {
private static final int DEFAULT_SIZE = Integer.getInteger("lexiconSizeHint", 100_000); /* Use three different hash tables to get around the limitations of Java's array sizes.
private final Long2IntOpenHashMap entries = new Long2IntOpenHashMap(DEFAULT_SIZE, 0.75f); *
* Each map fits 0.75 * 2^30 entries (~800 million); the three maps together fit a bit over 2^31 entries.
* We're happy with 2^31, since entry ids are ints anyway.
*
* We'll assign each term to one of the three maps based on its key modulo 3. We'll pray each
* night that the Long2IntOpenHashMap hash function is good enough to cope with this. The keys we are
* inserting are already 64-bit hashes, so odds are the remaining bits have very good entropy.
*/
private static final int DEFAULT_SIZE = Integer.getInteger("lexiconSizeHint", 100_000)/3;
private final Long2IntOpenHashMap[] entries = new Long2IntOpenHashMap[3];
public OnHeapDictionaryMap() {
for (int i = 0; i < entries.length; i++) {
entries[i] = new Long2IntOpenHashMap(DEFAULT_SIZE, 0.75f);
}
}
@Override @Override
public void clear() { public void clear() {
entries.clear(); for (var map : entries) {
map.clear();
}
} }
@Override @Override
public int size() { public int size() {
return entries.size(); int totalSize = 0;
for (var map : entries) {
totalSize += map.size();
}
return totalSize;
} }
@Override @Override
public int put(long key) { public int put(long key) {
entries.putIfAbsent(key, entries.size()); int shardIdx = (int) Long.remainderUnsigned(key, 3);
var shard = entries[shardIdx];
int size = size();
if (size == Integer.MAX_VALUE)
throw new IllegalStateException("DictionaryMap is full");
shard.putIfAbsent(key, size);
return get(key); return get(key);
} }
@Override @Override
public int get(long key) { public int get(long key) {
return entries.getOrDefault(key, NO_VALUE); int shardIdx = (int) Long.remainderUnsigned(key, 3);
var shard = entries[shardIdx];
return shard.getOrDefault(key, NO_VALUE);
} }
} }

View File

@ -145,7 +145,7 @@ public class KeywordLexicon implements AutoCloseable {
} }
} }
public int size() { public long size() {
Lock lock = memoryLock.readLock(); Lock lock = memoryLock.readLock();
try { try {
lock.lock(); lock.lock();