From 1a05cba60acf2df3449e4c0fae7cada57a13b572 Mon Sep 17 00:00:00 2001 From: Viktor Lofgren Date: Wed, 23 Aug 2023 11:25:20 +0200 Subject: [PATCH] (keyword lexicon) Use three hash tables to increase the possible number of keywords to 2^31 from 0.75 x 2^30. --- .../marginalia/dict/OnHeapDictionaryMap.java | 44 ++++++++++++++++--- .../nu/marginalia/lexicon/KeywordLexicon.java | 2 +- 2 files changed, 39 insertions(+), 7 deletions(-) diff --git a/code/features-index/lexicon/src/main/java/nu/marginalia/dict/OnHeapDictionaryMap.java b/code/features-index/lexicon/src/main/java/nu/marginalia/dict/OnHeapDictionaryMap.java index 96dd5d13..4662cd5c 100644 --- a/code/features-index/lexicon/src/main/java/nu/marginalia/dict/OnHeapDictionaryMap.java +++ b/code/features-index/lexicon/src/main/java/nu/marginalia/dict/OnHeapDictionaryMap.java @@ -3,27 +3,59 @@ package nu.marginalia.dict; import it.unimi.dsi.fastutil.longs.Long2IntOpenHashMap; public class OnHeapDictionaryMap implements DictionaryMap { - private static final int DEFAULT_SIZE = Integer.getInteger("lexiconSizeHint", 100_000); - private final Long2IntOpenHashMap entries = new Long2IntOpenHashMap(DEFAULT_SIZE, 0.75f); + /* Use three different hash tables to get around the limitations of Java's array sizes. + * + * Each map fits 0.75 * 2^30 entries (~800 million); the three maps together fit a bit over 2^31 entries. + * We're happy with 2^31. + * + * We'll assign each term to one of the three maps based on its key modulo 3. We'll pray each + * night that Long2IntOpenHashMap's hash function is good enough to cope with this. The keys we are + * inserting are 64-bit hashes already, so odds are the rest of the bits have very good entropy. 
+ */ + private static final int DEFAULT_SIZE = Integer.getInteger("lexiconSizeHint", 100_000)/3; + private final Long2IntOpenHashMap[] entries = new Long2IntOpenHashMap[3]; + + public OnHeapDictionaryMap() { + for (int i = 0; i < entries.length; i++) { + entries[i] = new Long2IntOpenHashMap(DEFAULT_SIZE, 0.75f); + } + } @Override public void clear() { - entries.clear(); + for (var map : entries) { + map.clear(); + } } @Override public int size() { - return entries.size(); + int totalSize = 0; + for (var map : entries) { + totalSize += map.size(); + } + return totalSize; } @Override public int put(long key) { - entries.putIfAbsent(key, entries.size()); + int shardIdx = (int) Long.remainderUnsigned(key, 3); + var shard = entries[shardIdx]; + int size = size(); + + if (size == Integer.MAX_VALUE) + throw new IllegalStateException("DictionaryMap is full"); + + shard.putIfAbsent(key, size); + return get(key); } @Override public int get(long key) { - return entries.getOrDefault(key, NO_VALUE); + int shardIdx = (int) Long.remainderUnsigned(key, 3); + var shard = entries[shardIdx]; + + return shard.getOrDefault(key, NO_VALUE); } } diff --git a/code/features-index/lexicon/src/main/java/nu/marginalia/lexicon/KeywordLexicon.java b/code/features-index/lexicon/src/main/java/nu/marginalia/lexicon/KeywordLexicon.java index 84507511..9132f151 100644 --- a/code/features-index/lexicon/src/main/java/nu/marginalia/lexicon/KeywordLexicon.java +++ b/code/features-index/lexicon/src/main/java/nu/marginalia/lexicon/KeywordLexicon.java @@ -145,7 +145,7 @@ public class KeywordLexicon implements AutoCloseable { } } - public int size() { + public long size() { Lock lock = memoryLock.readLock(); try { lock.lock();