(hash) Modified version of Commons Codec's Murmur3 hash

Viktor Lofgren 2023-08-01 14:57:40 +02:00
parent 8f0cbf267b
commit 86a5cc5c5f
6 changed files with 439 additions and 0 deletions

View File

@@ -80,6 +80,7 @@ include 'third-party:openzim'
include 'third-party:count-min-sketch'
include 'third-party:monkey-patch-opennlp'
include 'third-party:monkey-patch-gson'
include 'third-party:commons-codec'
dependencyResolutionManagement {
@@ -142,6 +143,7 @@ dependencyResolutionManagement {
library('commons.lang3', 'org.apache.commons','commons-lang3').version('3.12.0')
library('commons.compress','org.apache.commons','commons-compress').version('1.21')
library('commons.io','commons-io','commons-io').version('2.11.0')
library('commons.codec', 'commons-codec', 'commons-codec').version('1.16.0')
library('ffi','com.github.jnr','jnr-ffi').version('2.2.12')
library('databind','com.fasterxml.jackson.core','jackson-databind').version('2.13.2.1')

View File

@@ -10,6 +10,7 @@ or lack an artifact, or to override some default that is inappropriate for the t
* [PorterStemmer](porterstemmer/) - LGPL3
* [Uppend](uppend/) - MIT
* [OpenZIM](openzim/) - GPL-2.0
* [Commons Codec](commons-codec/) - Apache 2.0
### Repackaged
* [SymSpell](symspell/) - LGPL-3.0

third-party/commons-codec/build.gradle vendored Normal file
View File

@@ -0,0 +1,20 @@
plugins {
id 'java'
id "me.champeau.jmh" version "0.6.6"
}
java {
toolchain {
languageVersion.set(JavaLanguageVersion.of(17))
}
}
dependencies {
jmhImplementation project(':code:libraries:language-processing')
jmhImplementation libs.guava
jmhImplementation libs.commons.codec
}
test {
useJUnitPlatform()
}

third-party/commons-codec/readme.md vendored Normal file
View File

@@ -0,0 +1,34 @@
# Commons Codec
License: [Apache 2.0](http://www.apache.org/licenses/LICENSE-2.0)
This package contains a heavily modified version of the Murmur3 hash from [commons-codec](https://commons.apache.org/proper/commons-codec/).
It cuts some corners, but outperforms both Commons Codec and Guava fairly significantly for the particular use case
we care about: hashing ASCII/Latin1 strings into a well-behaved 64-bit hash.
The method `hashLowerBytes(String data)` performs a zero-allocation, zero-conversion hash of
the *lower bytes* of the characters in the provided string. For ASCII, Latin1, or other 8-bit encodings
this is identical to hashing the entire string. For other inputs, especially text outside the
Latin scripts, this function is a potential foot-gun (see the sketch further down).
The method `hashNearlyASCII(String data)` is the same as above, except it is
seeded with the Java String's `hashCode()`. This is a very non-standard modification that
makes the hash behave somewhat better for other encodings, with no measurable performance
impact.
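The foot-gun, and how the `hashCode()` seeding softens it, in a minimal sketch; the wrapper class and the sample characters are just for illustration:

```java
import nu.marginalia.hash.MurmurHash3_128;

class LowerBytesExample {
    public static void main(String[] args) {
        var hash = new MurmurHash3_128();

        // 'Ł' is U+0141; its low byte is 0x41, the same as 'A', so
        // hashLowerBytes cannot tell the two strings apart.
        System.out.println(hash.hashLowerBytes("Ł") == hash.hashLowerBytes("A")); // true

        // hashNearlyASCII seeds with String.hashCode(), which differs between
        // the two strings, so in practice the collision goes away.
        System.out.println(hash.hashNearlyASCII("Ł") == hash.hashNearlyASCII("A")); // false
    }
}
```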
The method `long hash(byte[] data)` hashes the entire byte array.
A non-standard behavior is that the hash function folds the 128-bit
hash into a 64-bit hash by XOR-ing its two 64-bit halves.
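For text that genuinely isn't Latin1, a sketch of the safer route is to hash an explicit encoding via the byte-array overload, paying the `getBytes()` conversion cost; the wrapper class and the sample string are illustrative:

```java
import nu.marginalia.hash.MurmurHash3_128;
import java.nio.charset.StandardCharsets;

class ByteArrayHashExample {
    public static void main(String[] args) {
        var hash = new MurmurHash3_128();

        // Encode to UTF-8 first, then hash the bytes; no information is lost,
        // at the cost of allocating a byte[] for the conversion.
        long h = hash.hash("日本語のテキスト".getBytes(StandardCharsets.UTF_8));

        System.out.printf("%016x%n", h);
    }
}
```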
## Performance Benchmarks
| Algorithm | Ops/s | Remark |
|--------------------|-------------------|-----------------------------------------------------------------|
| Guava | 12,114 ± 439 | allocates byte buffers internally |
| Commons Codec      | 29,224 ± 1,080    | String.getBytes() penalty, long\[2\] allocation, possibly elided |
| MH hash            | 30,885 ± 847      | String.getBytes() penalty, zero allocations                      |
| MH hashNearlyASCII | 50,018 ± 399 | Zero allocations, worse characteristics outside Latin1/ASCII |
| MH hashLowerBytes | 50,533 ± 478 | Zero allocations, only works for Latin1/ASCII |
| String.hashCode() | 567,381 ± 136,185 | Zero allocations, much weaker algo |

View File

@@ -0,0 +1,105 @@
package nu.marginalia.hash;
import com.google.common.hash.HashFunction;
import com.google.common.hash.Hashing;
import org.apache.commons.codec.digest.MurmurHash3;
import org.openjdk.jmh.annotations.*;
import java.io.BufferedReader;
import java.io.InputStreamReader;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.List;
import java.util.Objects;
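/** JMH benchmarks comparing the modified Murmur3 hash against Guava's and Commons Codec's
 * implementations and String.hashCode(), over a word list loaded from the classpath. */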
public class MurmurHashBench {
private static HashFunction guavaHashFunction = Hashing.murmur3_128();
private static MurmurHash3_128 marginaliahash = new MurmurHash3_128();
@State(Scope.Benchmark)
public static class BenchState {
List<String> strings;
@Setup(Level.Trial)
public void doSetup() {
strings = new ArrayList<>();
try (var resource = Objects.requireNonNull(ClassLoader.getSystemResourceAsStream("dictionary/en-1000"),
"Could not load word frequency table");
var br = new BufferedReader(new InputStreamReader(resource))
) {
for (;;) {
String s = br.readLine();
if (s == null) {
break;
}
strings.add(s.toLowerCase());
}
}
catch (Exception ex) {
throw new RuntimeException(ex);
}
}
}
@Benchmark
@BenchmarkMode(Mode.Throughput)
public long benchGuava(BenchState state) {
long total = 0;
for (var string : state.strings) {
total += guavaHashFunction.hashUnencodedChars(string).padToLong();
}
return total;
}
@Benchmark
@BenchmarkMode(Mode.Throughput)
public long benchCommonCodec(BenchState state) {
long total = 0;
for (var string : state.strings) {
total += MurmurHash3.hash128x64(string.getBytes(StandardCharsets.UTF_8))[0];
}
return total;
}
@Benchmark
@BenchmarkMode(Mode.Throughput)
public long benchMarginalia_hashNonStandardASCIIOnlyDirect(BenchState state) {
long total = 0;
for (var string : state.strings) {
total += marginaliahash.hashLowerBytes(string);
}
return total;
}
@Benchmark
@BenchmarkMode(Mode.Throughput)
public long benchMarginalia_hashStandard(BenchState state) {
long total = 0;
for (var string : state.strings) {
total += marginaliahash.hash(string.getBytes(StandardCharsets.UTF_8));
}
return total;
}
@Benchmark
@BenchmarkMode(Mode.Throughput)
public long benchJavaStringHash(BenchState state) {
long total = 0;
for (var string : state.strings) {
total += string.hashCode();
}
return total;
}
@Benchmark
@BenchmarkMode(Mode.Throughput)
public long benchWeakNonAscii(BenchState state) {
long total = 0;
for (var string : state.strings) {
total += marginaliahash.hashNearlyASCII(string);
}
return total;
}
}

View File

@@ -0,0 +1,277 @@
package nu.marginalia.hash;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/** A modified version of Commons Codec's Murmur3 hash
 * that minimizes allocations.
 */
public class MurmurHash3_128 {
/**
* A default seed to use for the murmur hash algorithm.
* Has the value {@code 104729}.
*/
public static final int DEFAULT_SEED = 104729;
// Constants for 128-bit variant
private static final long C1 = 0x87c37b91114253d5L;
private static final long C2 = 0x4cf5ad432745937fL;
private static final int R1 = 31;
private static final int R2 = 27;
private static final int R3 = 33;
private static final int M = 5;
private static final int N1 = 0x52dce729;
private static final int N2 = 0x38495ab5;
/** Assumes data is ASCII, or at the very least that you only care about the lower
 * bytes of your string (which may be fine for hashing mostly Latin-script text).
 * <p>
 * Folds the 128 bit hash into 64 bits by XOR-ing the most and least significant words.
 */
public long hashLowerBytes(String data) {
return hash64(data, 0, data.length(), DEFAULT_SEED);
}
/** Like hashLowerBytes, except seeded with the Java String's hashCode()
 * to provide better behavior for non-ASCII strings. It's much worse
 * than doing it properly, but better than not doing it at all.
 */
public long hashNearlyASCII(String data) {
return hash64(data, 0, data.length(), data.hashCode());
}
/** Hash the bytes; folds the 128 bit hash into 64 bits by XOR-ing the most and least significant words */
public long hash(byte[] data) {
return hash64(data, 0, data.length, DEFAULT_SEED);
}
private static long hash64(final CharSequence data, final int offset, final int length, final long seed) {
long h1 = seed;
long h2 = seed;
final int nblocks = length >> 4;
// body
for (int i = 0; i < nblocks; i++) {
final int index = offset + (i << 4);
long k1 = getLittleEndianLong(data, index);
long k2 = getLittleEndianLong(data, index + 8);
// mix functions for k1
k1 *= C1;
k1 = Long.rotateLeft(k1, R1);
k1 *= C2;
h1 ^= k1;
h1 = Long.rotateLeft(h1, R2);
h1 += h2;
h1 = h1 * M + N1;
// mix functions for k2
k2 *= C2;
k2 = Long.rotateLeft(k2, R3);
k2 *= C1;
h2 ^= k2;
h2 = Long.rotateLeft(h2, R1);
h2 += h1;
h2 = h2 * M + N2;
}
// tail
long k1 = 0;
long k2 = 0;
final int index = offset + (nblocks << 4);
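// process any remaining chars (< 16); the cases below intentionally fall through (no break)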
switch (offset + length - index) {
case 15:
k2 ^= ((long) data.charAt(index + 14) & 0xff) << 48;
case 14:
k2 ^= ((long) data.charAt(index + 13) & 0xff) << 40;
case 13:
k2 ^= ((long) data.charAt(index + 12) & 0xff) << 32;
case 12:
k2 ^= ((long) data.charAt(index + 11) & 0xff) << 24;
case 11:
k2 ^= ((long) data.charAt(index + 10) & 0xff) << 16;
case 10:
k2 ^= ((long) data.charAt(index + 9) & 0xff) << 8;
case 9:
k2 ^= data.charAt(index + 8) & 0xff;
k2 *= C2;
k2 = Long.rotateLeft(k2, R3);
k2 *= C1;
h2 ^= k2;
case 8:
k1 ^= ((long) data.charAt(index + 7) & 0xff) << 56;
case 7:
k1 ^= ((long) data.charAt(index + 6) & 0xff) << 48;
case 6:
k1 ^= ((long) data.charAt(index + 5) & 0xff) << 40;
case 5:
k1 ^= ((long) data.charAt(index + 4) & 0xff) << 32;
case 4:
k1 ^= ((long) data.charAt(index + 3) & 0xff) << 24;
case 3:
k1 ^= ((long) data.charAt(index + 2) & 0xff) << 16;
case 2:
k1 ^= ((long) data.charAt(index + 1) & 0xff) << 8;
case 1:
k1 ^= data.charAt(index) & 0xff;
k1 *= C1;
k1 = Long.rotateLeft(k1, R1);
k1 *= C2;
h1 ^= k1;
}
// finalization
h1 ^= length;
h2 ^= length;
h1 += h2;
h2 += h1;
h1 = fmix64(h1);
h2 = fmix64(h2);
h1 += h2;
h2 += h1;
return h1^h2; // non-standard 128->64 bit transformation
}
private static long hash64(final byte[] data, final int offset, final int length, final long seed) {
long h1 = seed;
long h2 = seed;
final int nblocks = length >> 4;
// body
for (int i = 0; i < nblocks; i++) {
final int index = offset + (i << 4);
long k1 = getLittleEndianLong(data, index);
long k2 = getLittleEndianLong(data, index + 8);
// mix functions for k1
k1 *= C1;
k1 = Long.rotateLeft(k1, R1);
k1 *= C2;
h1 ^= k1;
h1 = Long.rotateLeft(h1, R2);
h1 += h2;
h1 = h1 * M + N1;
// mix functions for k2
k2 *= C2;
k2 = Long.rotateLeft(k2, R3);
k2 *= C1;
h2 ^= k2;
h2 = Long.rotateLeft(h2, R1);
h2 += h1;
h2 = h2 * M + N2;
}
// tail
long k1 = 0;
long k2 = 0;
final int index = offset + (nblocks << 4);
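// process any remaining bytes (< 16); the cases below intentionally fall through (no break)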
switch (offset + length - index) {
case 15:
k2 ^= ((long) data[index + 14] & 0xff) << 48;
case 14:
k2 ^= ((long) data[index + 13] & 0xff) << 40;
case 13:
k2 ^= ((long) data[index + 12] & 0xff) << 32;
case 12:
k2 ^= ((long) data[index + 11] & 0xff) << 24;
case 11:
k2 ^= ((long) data[index + 10] & 0xff) << 16;
case 10:
k2 ^= ((long) data[index + 9] & 0xff) << 8;
case 9:
k2 ^= data[index + 8] & 0xff;
k2 *= C2;
k2 = Long.rotateLeft(k2, R3);
k2 *= C1;
h2 ^= k2;
case 8:
k1 ^= ((long) data[index + 7] & 0xff) << 56;
case 7:
k1 ^= ((long) data[index + 6] & 0xff) << 48;
case 6:
k1 ^= ((long) data[index + 5] & 0xff) << 40;
case 5:
k1 ^= ((long) data[index + 4] & 0xff) << 32;
case 4:
k1 ^= ((long) data[index + 3] & 0xff) << 24;
case 3:
k1 ^= ((long) data[index + 2] & 0xff) << 16;
case 2:
k1 ^= ((long) data[index + 1] & 0xff) << 8;
case 1:
k1 ^= data[index] & 0xff;
k1 *= C1;
k1 = Long.rotateLeft(k1, R1);
k1 *= C2;
h1 ^= k1;
}
// finalization
h1 ^= length;
h2 ^= length;
h1 += h2;
h2 += h1;
h1 = fmix64(h1);
h2 = fmix64(h2);
h1 += h2;
h2 += h1;
return h1^h2; // non-standard 128->64 bit transformation
}
private static long getLittleEndianLong(final CharSequence data, final int index) {
return (((long) data.charAt(index ) & 0xff) ) |
(((long) data.charAt(index + 1) & 0xff) << 8) |
(((long) data.charAt(index + 2) & 0xff) << 16) |
(((long) data.charAt(index + 3) & 0xff) << 24) |
(((long) data.charAt(index + 4) & 0xff) << 32) |
(((long) data.charAt(index + 5) & 0xff) << 40) |
(((long) data.charAt(index + 6) & 0xff) << 48) |
(((long) data.charAt(index + 7) & 0xff) << 56);
}
private static long getLittleEndianLong(final byte[] data, final int index) {
return (((long) data[index ] & 0xff) ) |
(((long) data[index + 1] & 0xff) << 8) |
(((long) data[index + 2] & 0xff) << 16) |
(((long) data[index + 3] & 0xff) << 24) |
(((long) data[index + 4] & 0xff) << 32) |
(((long) data[index + 5] & 0xff) << 40) |
(((long) data[index + 6] & 0xff) << 48) |
(((long) data[index + 7] & 0xff) << 56);
}
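/** The standard MurmurHash3 fmix64 finalization step (avalanche mix). */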
private static long fmix64(long hash) {
hash ^= (hash >>> 33);
hash *= 0xff51afd7ed558ccdL;
hash ^= (hash >>> 33);
hash *= 0xc4ceb9fe1a85ec53L;
hash ^= (hash >>> 33);
return hash;
}
}