(hash) Modified version of common codec's Murmur3 hash

2023-08-01 14:57:40 +02:00 · 2023-08-01 14:57:40 +02:00 · 86a5cc5c5f
commit 86a5cc5c5f
parent 8f0cbf267b
6 changed files with 439 additions and 0 deletions
--- a/settings.gradle
+++ b/settings.gradle
@ -80,6 +80,7 @@ include 'third-party:openzim'
 include 'third-party:count-min-sketch'
 include 'third-party:monkey-patch-opennlp'
 include 'third-party:monkey-patch-gson'
+include 'third-party:commons-codec'


 dependencyResolutionManagement {
@ -142,6 +143,7 @@ dependencyResolutionManagement {
            library('commons.lang3', 'org.apache.commons','commons-lang3').version('3.12.0')
            library('commons.compress','org.apache.commons','commons-compress').version('1.21')
            library('commons.io','commons-io','commons-io').version('2.11.0')
+            library('commons.codec', 'commons-codec', 'commons-codec').version('1.16.0')

            library('ffi','com.github.jnr','jnr-ffi').version('2.2.12')
            library('databind','com.fasterxml.jackson.core','jackson-databind').version('2.13.2.1')
--- a/third-party/README.md
+++ b/third-party/README.md
@ -10,6 +10,7 @@ or lack an artifact, or to override some default that is inappropriate for the t
 * [PorterStemmer](porterstemmer/) - LGPL3
 * [Uppend](uppend/) - MIT
 * [OpenZIM](openzim/) - GPL-2.0
+* [Commons Codec](commons-codec/) - Apache 2.0

 ### Repackaged
 * [SymSpell](symspell/) - LGPL-3.0
--- a/third-party/commons-codec/build.gradle
+++ b/third-party/commons-codec/build.gradle
@ -0,0 +1,20 @@
+plugins {
+    id 'java'
+    id "me.champeau.jmh" version "0.6.6" // JMH Gradle plugin; the jmhImplementation configuration below belongs to it
+}
+
+java {
+    toolchain {
+        languageVersion.set(JavaLanguageVersion.of(17)) // compile and run this module with a Java 17 toolchain
+    }
+}
+
+dependencies { // benchmark-only dependencies; nothing is declared for the main source set
+    jmhImplementation project(':code:libraries:language-processing') // NOTE(review): presumably supplies the dictionary/en-1000 resource read by MurmurHashBench - confirm
+    jmhImplementation libs.guava // comparison baseline used by MurmurHashBench
+    jmhImplementation libs.commons.codec // comparison baseline; version comes from the catalog entry in settings.gradle
+}
+
+test {
+    useJUnitPlatform() // run tests on the JUnit Platform
+}
--- a/third-party/commons-codec/readme.md
+++ b/third-party/commons-codec/readme.md
@ -0,0 +1,34 @@
+# Commons Codec
+
+License: [Apache License 2.0](http://www.apache.org/licenses/LICENSE-2.0)
+
+This package contains a heavily modified version of the Murmur3 hash from [commons-codec](https://commons.apache.org/proper/commons-codec/)
+that cuts some corners but outperforms both Commons Codec and Guava fairly significantly for the particular use cases
+we care about being fast: hashing ASCII/Latin1 strings into a well-behaved 64-bit hash.
+
+The method `hashLowerBytes(String data)` performs a zero-allocation, zero-conversion hash of
+the *lower bytes* of the characters in the provided string.  For strings whose characters all
+fit in 8 bits (ASCII, Latin1) this is identical to hashing the entire string. For other use cases,
+especially away from the Latin scripts, this function is possibly a foot-gun.
+
+The method `hashNearlyASCII(String data)` is the same as above, except it's
+seeded with Java String's hashCode().  This is a very non-standard modification that
+makes it a bit better at dealing with other encodings without measurable performance
+impact.
+
+The method `long hash(byte[] data)` hashes the entire byte array.
+
+A non-standard behavior is that the hash function folds the 128-bit
+hash into a 64-bit hash by XORing its two 64-bit halves.
+
+## Performance Benchmarks
+
+| Algorithm          | Ops/s             | Remark                                                          | 
+|--------------------|-------------------|-----------------------------------------------------------------|
+| Guava              | 12,114 ±  439     | allocates byte buffers internally                               |
+| Commons Codec      | 29,224 ± 1,080    | String.getBytes() penalty, long\[2\] allocation, possibly elided |
+| MH hash            | 30,885 ±  847     | String.getBytes() penalty, zero allocations                     |
+| MH hashNearlyASCII | 50,018 ± 399      | Zero allocations, worse characteristics outside Latin1/ASCII    |
+| MH hashLowerBytes  | 50,533 ±  478     | Zero allocations, only works for Latin1/ASCII                   |
+| String.hashCode()  | 567,381 ± 136,185 | Zero allocations, much weaker algo                              |
+
--- a/third-party/commons-codec/src/jmh/java/nu/marginalia/hash/MurmurHashBench.java
+++ b/third-party/commons-codec/src/jmh/java/nu/marginalia/hash/MurmurHashBench.java
@ -0,0 +1,105 @@
+package nu.marginalia.hash;
+
+import com.google.common.hash.HashFunction;
+import com.google.common.hash.Hashing;
+import org.apache.commons.codec.digest.MurmurHash3;
+import org.openjdk.jmh.annotations.*;
+
+import java.io.BufferedReader;
+import java.io.InputStreamReader;
+import java.nio.charset.StandardCharsets;
+import java.util.ArrayList;
+import java.util.List;
+import java.util.Objects;
+
+public class MurmurHashBench { // JMH throughput benchmarks: Guava, Commons Codec, String.hashCode() and MurmurHash3_128
+    // Hashers under test; created once and shared by every benchmark invocation.
+    private static HashFunction guavaHashFunction = Hashing.murmur3_128();
+    private static MurmurHash3_128 marginaliahash = new MurmurHash3_128();
+
+    @State(Scope.Benchmark)
+    public static class BenchState {
+        // Lower-cased lines of the dictionary/en-1000 classpath resource; each benchmark hashes all of them per invocation.
+        List<String> strings;
+
+        @Setup(Level.Trial)
+        public void doSetup() {
+            strings = new ArrayList<>();
+            try (var resource = Objects.requireNonNull(ClassLoader.getSystemResourceAsStream("dictionary/en-1000"),
+                    "Could not load word frequency table");
+                 var br = new BufferedReader(new InputStreamReader(resource, StandardCharsets.UTF_8)) // explicit charset: do not depend on the platform default
+            ) {
+                for (;;) {
+                    String s = br.readLine();
+                    if (s == null) {
+                        break;
+                    }
+                    strings.add(s.toLowerCase(java.util.Locale.ROOT)); // locale-independent casing keeps the input identical on every machine
+                }
+            }
+            catch (Exception ex) {
+                throw new RuntimeException(ex); // a missing or unreadable word list aborts the trial; cause is preserved
+            }
+        }
+    }
+
+    @Benchmark
+    @BenchmarkMode(Mode.Throughput)
+    public long benchGuava(BenchState state) {
+        long total = 0;
+        for (var string : state.strings) {
+            total += guavaHashFunction.hashUnencodedChars(string).padToLong(); // hashes the chars without encoding them; result reduced to one long
+        }
+        return total; // returned so the computed hashes are consumed by JMH rather than discarded
+    }
+
+    @Benchmark
+    @BenchmarkMode(Mode.Throughput)
+    public long benchCommonCodec(BenchState state) {
+        long total = 0;
+        for (var string : state.strings) {
+            total += MurmurHash3.hash128x64(string.getBytes(StandardCharsets.UTF_8))[0]; // getBytes() copy is part of the measurement; only the first long is used
+        }
+        return total;
+    }
+
+    @Benchmark
+    @BenchmarkMode(Mode.Throughput)
+    public long benchMarginalia_hashNonStandardASCIIOnlyDirect(BenchState state) {
+        long total = 0;
+        for (var string : state.strings) {
+            total += marginaliahash.hashLowerBytes(string); // String passed directly, no intermediate byte[]
+        }
+        return total;
+    }
+
+    @Benchmark
+    @BenchmarkMode(Mode.Throughput)
+    public long benchMarginalia_hashStandard(BenchState state) {
+        long total = 0;
+        for (var string : state.strings) {
+            total += marginaliahash.hash(string.getBytes(StandardCharsets.UTF_8)); // byte[] variant, so the getBytes() copy is part of the measurement
+        }
+        return total;
+    }
+
+    @Benchmark
+    @BenchmarkMode(Mode.Throughput)
+    public long benchJavaStringHash(BenchState state) {
+        long total = 0;
+        for (var string : state.strings) {
+            total += string.hashCode(); // baseline; NOTE(review): String caches its hash, so repeat passes likely measure a cached read - confirm
+        }
+        return total;
+    }
+
+    @Benchmark
+    @BenchmarkMode(Mode.Throughput)
+    public long benchWeakNonAscii(BenchState state) {
+        long total = 0;
+        for (var string : state.strings) {
+            total += marginaliahash.hashNearlyASCII(string); // same as hashLowerBytes but seeded with String.hashCode()
+        }
+        return total;
+    }
+}
--- a/third-party/commons-codec/src/main/java/nu/marginalia/hash/MurmurHash3_128.java
+++ b/third-party/commons-codec/src/main/java/nu/marginalia/hash/MurmurHash3_128.java
@ -0,0 +1,277 @@
+package nu.marginalia.hash;
+
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/** A modified version of Commons Codec's 128 bit murmur hash that minimizes allocations.
+ *  Every public method folds the two 64 bit halves of the hash into a single
+ *  {@code long} by xor:ing them, which is a non-standard transformation. */
+public class MurmurHash3_128 {
+
+    /**
+     * A default seed to use for the murmur hash algorithm.
+     * Has the value {@code 104729}.
+     */
+    public static final int DEFAULT_SEED = 104729;
+
+    // Constants for 128-bit variant
+    private static final long C1 = 0x87c37b91114253d5L; // multiplier in the k1/k2 mixing steps
+    private static final long C2 = 0x4cf5ad432745937fL; // multiplier in the k1/k2 mixing steps
+    private static final int R1 = 31; // left-rotation applied to k1 and to h2
+    private static final int R2 = 27; // left-rotation applied to h1
+    private static final int R3 = 33; // left-rotation applied to k2
+    private static final int M = 5; // per-block update: h = h * M + N1 (h1) or N2 (h2)
+    private static final int N1 = 0x52dce729; // addend for h1
+    private static final int N2 = 0x38495ab5; // addend for h2
+
+    /** Hashes only the low 8 bits of each char of {@code data}, without copying or encoding it.
+     * Assumes data is ASCII, or at the very least that you only care about the lower
+     * bytes of your string (which may be fine for hashing mostly latin script).
+     * <p>
+     * Seeded with {@link #DEFAULT_SEED}; folds the 128 bit hash into 64 bits by xor:ing msw and lsw. */
+    public long hashLowerBytes(String data) {
+        return hash64(data, 0, data.length(), DEFAULT_SEED);
+    }
+
+    /** Like {@link #hashLowerBytes(String)} except seeded with the Java String.hashCode()
+     * (widened from int to long) to provide better behavior for non-ASCII strings.  It's much worse
+     * than doing it properly, but better than not doing this.
+     */
+    public long hashNearlyASCII(String data) {
+        return hash64(data, 0, data.length(), data.hashCode());
+    }
+
+    /** Hashes the entire byte array with {@link #DEFAULT_SEED}; folds the 128 bit hash into 64 bits by xor:ing msw and lsw */
+    public long hash(byte[] data) {
+        return hash64(data, 0, data.length, DEFAULT_SEED);
+    }
+
+    private static long hash64(final CharSequence data, final int offset, final int length, final long seed) { // char variant: only the low 8 bits of each char are hashed
+        long h1 = seed;
+        long h2 = seed;
+        final int nblocks = length >> 4;
+
+        // body: one iteration per full 16-char block, read as two little-endian longs
+        for (int i = 0; i < nblocks; i++) {
+            final int index = offset + (i << 4);
+            long k1 = getLittleEndianLong(data, index);
+            long k2 = getLittleEndianLong(data, index + 8);
+
+            // mix functions for k1
+            k1 *= C1;
+            k1 = Long.rotateLeft(k1, R1);
+            k1 *= C2;
+            h1 ^= k1;
+            h1 = Long.rotateLeft(h1, R2);
+            h1 += h2;
+            h1 = h1 * M + N1;
+
+            // mix functions for k2
+            k2 *= C2;
+            k2 = Long.rotateLeft(k2, R3);
+            k2 *= C1;
+            h2 ^= k2;
+            h2 = Long.rotateLeft(h2, R1);
+            h2 += h1;
+            h2 = h2 * M + N2;
+        }
+
+        // tail: the chars left over after the last full block; every case deliberately falls through
+        long k1 = 0;
+        long k2 = 0;
+        final int index = offset + (nblocks << 4);
+        switch (offset + length - index) { // number of leftover chars; no case matches 0
+            case 15:
+                k2 ^= ((long) data.charAt(index + 14) & 0xff) << 48;
+            case 14:
+                k2 ^= ((long) data.charAt(index + 13) & 0xff) << 40;
+            case 13:
+                k2 ^= ((long) data.charAt(index + 12) & 0xff) << 32;
+            case 12:
+                k2 ^= ((long) data.charAt(index + 11) & 0xff) << 24;
+            case 11:
+                k2 ^= ((long) data.charAt(index + 10) & 0xff) << 16;
+            case 10:
+                k2 ^= ((long) data.charAt(index + 9) & 0xff) << 8;
+            case 9:
+                k2 ^= data.charAt(index + 8) & 0xff;
+                k2 *= C2;
+                k2 = Long.rotateLeft(k2, R3);
+                k2 *= C1;
+                h2 ^= k2;
+
+            case 8:
+                k1 ^= ((long) data.charAt(index + 7) & 0xff) << 56;
+            case 7:
+                k1 ^= ((long) data.charAt(index + 6) & 0xff) << 48;
+            case 6:
+                k1 ^= ((long) data.charAt(index + 5) & 0xff) << 40;
+            case 5:
+                k1 ^= ((long) data.charAt(index + 4) & 0xff) << 32;
+            case 4:
+                k1 ^= ((long) data.charAt(index + 3) & 0xff) << 24;
+            case 3:
+                k1 ^= ((long) data.charAt(index + 2) & 0xff) << 16;
+            case 2:
+                k1 ^= ((long) data.charAt(index + 1) & 0xff) << 8;
+            case 1:
+                k1 ^= data.charAt(index) & 0xff;
+                k1 *= C1;
+                k1 = Long.rotateLeft(k1, R1);
+                k1 *= C2;
+                h1 ^= k1;
+        }
+
+        // finalization: mix in the length, cross-add the halves, run fmix64 on each, cross-add again
+        h1 ^= length;
+        h2 ^= length;
+
+        h1 += h2;
+        h2 += h1;
+
+        h1 = fmix64(h1);
+        h2 = fmix64(h2);
+
+        h1 += h2;
+        h2 += h1;
+
+        return h1^h2; // non-standard 128->64 bit transformation
+    }
+
+    private static long hash64(final byte[] data, final int offset, final int length, final long seed) { // byte[] counterpart of the CharSequence overload, same steps
+        long h1 = seed;
+        long h2 = seed;
+        final int nblocks = length >> 4;
+
+        // body: one iteration per full 16-byte block, read as two little-endian longs
+        for (int i = 0; i < nblocks; i++) {
+            final int index = offset + (i << 4);
+            long k1 = getLittleEndianLong(data, index);
+            long k2 = getLittleEndianLong(data, index + 8);
+
+            // mix functions for k1
+            k1 *= C1;
+            k1 = Long.rotateLeft(k1, R1);
+            k1 *= C2;
+            h1 ^= k1;
+            h1 = Long.rotateLeft(h1, R2);
+            h1 += h2;
+            h1 = h1 * M + N1;
+
+            // mix functions for k2
+            k2 *= C2;
+            k2 = Long.rotateLeft(k2, R3);
+            k2 *= C1;
+            h2 ^= k2;
+            h2 = Long.rotateLeft(h2, R1);
+            h2 += h1;
+            h2 = h2 * M + N2;
+        }
+
+        // tail: the bytes left over after the last full block; every case deliberately falls through
+        long k1 = 0;
+        long k2 = 0;
+        final int index = offset + (nblocks << 4);
+        switch (offset + length - index) { // number of leftover bytes; no case matches 0
+            case 15:
+                k2 ^= ((long) data[index + 14] & 0xff) << 48;
+            case 14:
+                k2 ^= ((long) data[index + 13] & 0xff) << 40;
+            case 13:
+                k2 ^= ((long) data[index + 12] & 0xff) << 32;
+            case 12:
+                k2 ^= ((long) data[index + 11] & 0xff) << 24;
+            case 11:
+                k2 ^= ((long) data[index + 10] & 0xff) << 16;
+            case 10:
+                k2 ^= ((long) data[index + 9] & 0xff) << 8;
+            case 9:
+                k2 ^= data[index + 8] & 0xff;
+                k2 *= C2;
+                k2 = Long.rotateLeft(k2, R3);
+                k2 *= C1;
+                h2 ^= k2;
+
+            case 8:
+                k1 ^= ((long) data[index + 7] & 0xff) << 56;
+            case 7:
+                k1 ^= ((long) data[index + 6] & 0xff) << 48;
+            case 6:
+                k1 ^= ((long) data[index + 5] & 0xff) << 40;
+            case 5:
+                k1 ^= ((long) data[index + 4] & 0xff) << 32;
+            case 4:
+                k1 ^= ((long) data[index + 3] & 0xff) << 24;
+            case 3:
+                k1 ^= ((long) data[index + 2] & 0xff) << 16;
+            case 2:
+                k1 ^= ((long) data[index + 1] & 0xff) << 8;
+            case 1:
+                k1 ^= data[index] & 0xff;
+                k1 *= C1;
+                k1 = Long.rotateLeft(k1, R1);
+                k1 *= C2;
+                h1 ^= k1;
+        }
+
+        // finalization: mix in the length, cross-add the halves, run fmix64 on each, cross-add again
+        h1 ^= length;
+        h2 ^= length;
+
+        h1 += h2;
+        h2 += h1;
+
+        h1 = fmix64(h1);
+        h2 = fmix64(h2);
+
+        h1 += h2;
+        h2 += h1;
+
+        return h1^h2; // non-standard 128->64 bit transformation
+    }
+
+    private static long getLittleEndianLong(final CharSequence data, final int index) { // low 8 bits of 8 consecutive chars; charAt(index) is the least significant byte
+        return (((long) data.charAt(index    ) & 0xff)      ) |
+                (((long) data.charAt(index + 1) & 0xff) <<  8) |
+                (((long) data.charAt(index + 2) & 0xff) << 16) |
+                (((long) data.charAt(index + 3) & 0xff) << 24) |
+                (((long) data.charAt(index + 4) & 0xff) << 32) |
+                (((long) data.charAt(index + 5) & 0xff) << 40) |
+                (((long) data.charAt(index + 6) & 0xff) << 48) |
+                (((long) data.charAt(index + 7) & 0xff) << 56);
+    }
+
+    private static long getLittleEndianLong(final byte[] data, final int index) { // 8 consecutive bytes; data[index] is the least significant byte
+        return (((long) data[index    ] & 0xff)      ) |
+                (((long) data[index + 1] & 0xff) <<  8) |
+                (((long) data[index + 2] & 0xff) << 16) |
+                (((long) data[index + 3] & 0xff) << 24) |
+                (((long) data[index + 4] & 0xff) << 32) |
+                (((long) data[index + 5] & 0xff) << 40) |
+                (((long) data[index + 6] & 0xff) << 48) |
+                (((long) data[index + 7] & 0xff) << 56);
+    }
+    private static long fmix64(long hash) { // finalization mix: xor-shift by 33, multiply, xor-shift, multiply, xor-shift
+        hash ^= (hash >>> 33);
+        hash *= 0xff51afd7ed558ccdL;
+        hash ^= (hash >>> 33);
+        hash *= 0xc4ceb9fe1a85ec53L;
+        hash ^= (hash >>> 33);
+        return hash;
+    }
+
+}