(hash) Modified version of common codec's Murmur3 hash
This commit is contained in:
parent
8f0cbf267b
commit
86a5cc5c5f
@ -80,6 +80,7 @@ include 'third-party:openzim'
|
||||
include 'third-party:count-min-sketch'
|
||||
include 'third-party:monkey-patch-opennlp'
|
||||
include 'third-party:monkey-patch-gson'
|
||||
include 'third-party:commons-codec'
|
||||
|
||||
|
||||
dependencyResolutionManagement {
|
||||
@ -142,6 +143,7 @@ dependencyResolutionManagement {
|
||||
library('commons.lang3', 'org.apache.commons','commons-lang3').version('3.12.0')
|
||||
library('commons.compress','org.apache.commons','commons-compress').version('1.21')
|
||||
library('commons.io','commons-io','commons-io').version('2.11.0')
|
||||
library('commons.codec', 'commons-codec', 'commons-codec').version('1.16.0')
|
||||
|
||||
library('ffi','com.github.jnr','jnr-ffi').version('2.2.12')
|
||||
library('databind','com.fasterxml.jackson.core','jackson-databind').version('2.13.2.1')
|
||||
|
1
third-party/README.md
vendored
1
third-party/README.md
vendored
@ -10,6 +10,7 @@ or lack an artifact, or to override some default that is inappropriate for the t
|
||||
* [PorterStemmer](porterstemmer/) - LGPL3
|
||||
* [Uppend](uppend/) - MIT
|
||||
* [OpenZIM](openzim/) - GPL-2.0
|
||||
* [Commons Codec](commons-codec/) - Apache 2.0
|
||||
|
||||
### Repackaged
|
||||
* [SymSpell](symspell/) - LGPL-3.0
|
||||
|
20
third-party/commons-codec/build.gradle
vendored
Normal file
20
third-party/commons-codec/build.gradle
vendored
Normal file
@ -0,0 +1,20 @@
|
||||
// Build script for the vendored, modified Murmur3 hash module (third-party/commons-codec).
plugins {
|
||||
id 'java'
|
||||
// JMH Gradle plugin; the jmhImplementation configuration used below presumably comes from it.
id "me.champeau.jmh" version "0.6.6"
|
||||
}
|
||||
|
||||
// Build with a Java 17 toolchain.
java {
|
||||
toolchain {
|
||||
languageVersion.set(JavaLanguageVersion.of(17))
|
||||
}
|
||||
}
|
||||
|
||||
// Only the benchmarks declare dependencies; the main source set has none.
dependencies {
|
||||
// NOTE(review): presumably supplies the "dictionary/en-1000" resource that MurmurHashBench loads - confirm.
jmhImplementation project(':code:libraries:language-processing')
|
||||
// Guava and Commons Codec are the baseline implementations MurmurHashBench compares against.
jmhImplementation libs.guava
|
||||
jmhImplementation libs.commons.codec
|
||||
}
|
||||
|
||||
// Run tests on the JUnit Platform.
test {
|
||||
useJUnitPlatform()
|
||||
}
|
34
third-party/commons-codec/readme.md
vendored
Normal file
34
third-party/commons-codec/readme.md
vendored
Normal file
@ -0,0 +1,34 @@
|
||||
# Commons Codec
|
||||
|
||||
License: [Apache License 2.0](http://www.apache.org/licenses/LICENSE-2.0)
|
||||
|
||||
This package contains a heavily modified version of the Murmur3 hash from [commons-codec](https://commons.apache.org/proper/commons-codec/)
|
||||
that cuts some corners but outperforms both Commons Codec and Guava fairly significantly for the particular use cases
|
||||
where we need speed: hashing ASCII/Latin1 strings into a well-behaved 64-bit hash.
|
||||
|
||||
The method `hashLowerBytes(String data)` performs a zero allocation and zero conversion hash of
|
||||
the *lower bytes* of the characters in the provided string. For ASCII, Latin1, or other 8 bit encodings
|
||||
this is identical to hashing the entire string. For other use cases, especially away from the
|
||||
Latin scripts, this function is possibly a foot-gun.
|
||||
|
||||
The method `hashNearlyASCII(String data)` is the same as above, except it's
|
||||
seeded with Java String's hashCode(). This is a very non-standard modification that
|
||||
makes it a bit better at dealing with other encodings without measurable performance
|
||||
impact.
|
||||
|
||||
The method `long hash(byte[] data)` hashes the entire byte array.
|
||||
|
||||
A non-standard behavior is that the hash function folds the 128 bit
|
||||
hash into a 64 bit hash by xor:ing the 128 bit parts.
|
||||
|
||||
## Performance Benchmarks
|
||||
|
||||
| Algorithm | Ops/s | Remark |
|
||||
|--------------------|-------------------|-----------------------------------------------------------------|
|
||||
| Guava | 12,114 ± 439 | allocates byte buffers internally |
|
||||
| Commons Codec | 29,224 ± 1,080 | String.getBytes() penalty, long\[2\] allocation, possibly elided |
|
||||
| MH hash | 30,885 ± 847 | String.getBytes() penalty, zero allocations |
|
||||
| MH hashNearlyASCII | 50,018 ± 399 | Zero allocations, worse characteristics outside Latin1/ASCII |
|
||||
| MH hashLowerBytes | 50,533 ± 478 | Zero allocations, only works for Latin1/ASCII |
|
||||
| String.hashCode() | 567,381 ± 136,185 | Zero allocations, much weaker algo |
|
||||
|
105
third-party/commons-codec/src/jmh/java/nu/marginalia/hash/MurmurHashBench.java
vendored
Normal file
105
third-party/commons-codec/src/jmh/java/nu/marginalia/hash/MurmurHashBench.java
vendored
Normal file
@ -0,0 +1,105 @@
|
||||
package nu.marginalia.hash;
|
||||
|
||||
import com.google.common.hash.HashFunction;
|
||||
import com.google.common.hash.Hashing;
|
||||
import org.apache.commons.codec.digest.MurmurHash3;
|
||||
import org.openjdk.jmh.annotations.*;
|
||||
|
||||
import java.io.BufferedReader;
|
||||
import java.io.InputStreamReader;
|
||||
import java.nio.charset.StandardCharsets;
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
import java.util.Objects;
|
||||
|
||||
public class MurmurHashBench {
|
||||
|
||||
private static HashFunction guavaHashFunction = Hashing.murmur3_128();
|
||||
private static MurmurHash3_128 marginaliahash = new MurmurHash3_128();
|
||||
|
||||
@State(Scope.Benchmark)
|
||||
public static class BenchState {
|
||||
|
||||
List<String> strings;
|
||||
|
||||
@Setup(Level.Trial)
|
||||
public void doSetup() {
|
||||
strings = new ArrayList<>();
|
||||
try (var resource = Objects.requireNonNull(ClassLoader.getSystemResourceAsStream("dictionary/en-1000"),
|
||||
"Could not load word frequency table");
|
||||
var br = new BufferedReader(new InputStreamReader(resource))
|
||||
) {
|
||||
for (;;) {
|
||||
String s = br.readLine();
|
||||
if (s == null) {
|
||||
break;
|
||||
}
|
||||
strings.add(s.toLowerCase());
|
||||
}
|
||||
}
|
||||
catch (Exception ex) {
|
||||
throw new RuntimeException(ex);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@Benchmark
|
||||
@BenchmarkMode(Mode.Throughput)
|
||||
public long benchGuava(BenchState state) {
|
||||
long total = 0;
|
||||
for (var string : state.strings) {
|
||||
total += guavaHashFunction.hashUnencodedChars(string).padToLong();
|
||||
}
|
||||
return total;
|
||||
}
|
||||
|
||||
@Benchmark
|
||||
@BenchmarkMode(Mode.Throughput)
|
||||
public long benchCommonCodec(BenchState state) {
|
||||
long total = 0;
|
||||
for (var string : state.strings) {
|
||||
total += MurmurHash3.hash128x64(string.getBytes(StandardCharsets.UTF_8))[0];
|
||||
}
|
||||
return total;
|
||||
}
|
||||
|
||||
@Benchmark
|
||||
@BenchmarkMode(Mode.Throughput)
|
||||
public long benchMarginalia_hashNonStandardASCIIOnlyDirect(BenchState state) {
|
||||
long total = 0;
|
||||
for (var string : state.strings) {
|
||||
total += marginaliahash.hashLowerBytes(string);
|
||||
}
|
||||
return total;
|
||||
}
|
||||
|
||||
@Benchmark
|
||||
@BenchmarkMode(Mode.Throughput)
|
||||
public long benchMarginalia_hashStandard(BenchState state) {
|
||||
long total = 0;
|
||||
for (var string : state.strings) {
|
||||
total += marginaliahash.hash(string.getBytes(StandardCharsets.UTF_8));
|
||||
}
|
||||
return total;
|
||||
}
|
||||
|
||||
@Benchmark
|
||||
@BenchmarkMode(Mode.Throughput)
|
||||
public long benchJavaStringHash(BenchState state) {
|
||||
long total = 0;
|
||||
for (var string : state.strings) {
|
||||
total += string.hashCode();
|
||||
}
|
||||
return total;
|
||||
}
|
||||
|
||||
@Benchmark
|
||||
@BenchmarkMode(Mode.Throughput)
|
||||
public long benchWeakNonAscii(BenchState state) {
|
||||
long total = 0;
|
||||
for (var string : state.strings) {
|
||||
total += marginaliahash.hashNearlyASCII(string);
|
||||
}
|
||||
return total;
|
||||
}
|
||||
}
|
277
third-party/commons-codec/src/main/java/nu/marginalia/hash/MurmurHash3_128.java
vendored
Normal file
277
third-party/commons-codec/src/main/java/nu/marginalia/hash/MurmurHash3_128.java
vendored
Normal file
@ -0,0 +1,277 @@
|
||||
package nu.marginalia.hash;
|
||||
|
||||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
/** A modified version of Commons Codec's murmur hash
|
||||
* that minimizes allocations.
|
||||
* */
|
||||
/**
 * Murmur3 (128 bit, x64 flavour) adapted from Commons Codec so that hashing
 * performs no allocations.
 * <p>
 * All public methods return a single {@code long}: the two 64 bit halves of
 * the 128 bit state are xor:ed together, which is not part of standard Murmur3.
 */
public class MurmurHash3_128 {

    /**
     * Seed used by {@link #hashLowerBytes(String)} and {@link #hash(byte[])}.
     * Has the value {@code 104729}.
     */
    public static final int DEFAULT_SEED = 104729;

    // Constants for 128-bit variant
    private static final long C1 = 0x87c37b91114253d5L;
    private static final long C2 = 0x4cf5ad432745937fL;
    private static final int R1 = 31;
    private static final int R2 = 27;
    private static final int R3 = 33;
    private static final int M = 5;
    private static final int N1 = 0x52dce729;
    private static final int N2 = 0x38495ab5;

    /**
     * Hashes only the low 8 bits of every char in {@code data}, seeded with
     * {@link #DEFAULT_SEED}. Chars that differ only in their upper byte hash
     * identically, so this is intended for ASCII/Latin1 content.
     *
     * @param data the string to hash
     * @return the 128 bit hash folded to 64 bits by xor:ing its two halves
     */
    public long hashLowerBytes(String data) {
        return hash64(data, 0, data.length(), DEFAULT_SEED);
    }

    /**
     * Same as {@link #hashLowerBytes(String)}, except that the seed is the
     * string's own {@link String#hashCode()}. The upper bytes of the chars
     * therefore influence the result through the seed, which is weaker than
     * hashing them properly but better than ignoring them.
     *
     * @param data the string to hash
     * @return the 128 bit hash folded to 64 bits by xor:ing its two halves
     */
    public long hashNearlyASCII(String data) {
        return hash64(data, 0, data.length(), data.hashCode());
    }

    /**
     * Hashes the whole byte array, seeded with {@link #DEFAULT_SEED}.
     *
     * @param data the bytes to hash
     * @return the 128 bit hash folded to 64 bits by xor:ing its two halves
     */
    public long hash(byte[] data) {
        return hash64(data, 0, data.length, DEFAULT_SEED);
    }

    /** Murmur3 over the low byte of each char in {@code data[offset, offset + length)}. */
    private static long hash64(final CharSequence data, final int offset, final int length, final long seed) {
        long h1 = seed;
        long h2 = seed;

        // body: whole 16-unit blocks
        final int tailStart = offset + ((length >> 4) << 4);
        for (int pos = offset; pos < tailStart; pos += 16) {
            h1 ^= scrambleLow(getLittleEndianLong(data, pos));
            h1 = (Long.rotateLeft(h1, R2) + h2) * M + N1;

            h2 ^= scrambleHigh(getLittleEndianLong(data, pos + 8));
            h2 = (Long.rotateLeft(h2, R1) + h1) * M + N2;
        }

        // tail: the 0..15 units left over; units 8..14 feed h2, units 0..7 feed h1
        final int tail = length & 15;
        if (tail > 8) {
            long k2 = 0;
            for (int i = 8; i < tail; i++) {
                k2 |= ((long) data.charAt(tailStart + i) & 0xff) << ((i - 8) << 3);
            }
            h2 ^= scrambleHigh(k2);
        }
        if (tail > 0) {
            final int lowCount = Math.min(tail, 8);
            long k1 = 0;
            for (int i = 0; i < lowCount; i++) {
                k1 |= ((long) data.charAt(tailStart + i) & 0xff) << (i << 3);
            }
            h1 ^= scrambleLow(k1);
        }

        return finish(h1, h2, length);
    }

    /** Murmur3 over {@code data[offset, offset + length)}. */
    private static long hash64(final byte[] data, final int offset, final int length, final long seed) {
        long h1 = seed;
        long h2 = seed;

        // body: whole 16-byte blocks
        final int tailStart = offset + ((length >> 4) << 4);
        for (int pos = offset; pos < tailStart; pos += 16) {
            h1 ^= scrambleLow(getLittleEndianLong(data, pos));
            h1 = (Long.rotateLeft(h1, R2) + h2) * M + N1;

            h2 ^= scrambleHigh(getLittleEndianLong(data, pos + 8));
            h2 = (Long.rotateLeft(h2, R1) + h1) * M + N2;
        }

        // tail: the 0..15 bytes left over; bytes 8..14 feed h2, bytes 0..7 feed h1
        final int tail = length & 15;
        if (tail > 8) {
            long k2 = 0;
            for (int i = 8; i < tail; i++) {
                k2 |= ((long) data[tailStart + i] & 0xff) << ((i - 8) << 3);
            }
            h2 ^= scrambleHigh(k2);
        }
        if (tail > 0) {
            final int lowCount = Math.min(tail, 8);
            long k1 = 0;
            for (int i = 0; i < lowCount; i++) {
                k1 |= ((long) data[tailStart + i] & 0xff) << (i << 3);
            }
            h1 ^= scrambleLow(k1);
        }

        return finish(h1, h2, length);
    }

    /** Mix step applied to the first (k1) word of a block before it is xor:ed into h1. */
    private static long scrambleLow(final long k1) {
        return Long.rotateLeft(k1 * C1, R1) * C2;
    }

    /** Mix step applied to the second (k2) word of a block before it is xor:ed into h2. */
    private static long scrambleHigh(final long k2) {
        return Long.rotateLeft(k2 * C2, R3) * C1;
    }

    /** Finalization: blends in the length, avalanches both halves and folds them into one long. */
    private static long finish(final long state1, final long state2, final int length) {
        long h1 = state1 ^ length;
        long h2 = state2 ^ length;

        h1 += h2;
        h2 += h1;

        h1 = fmix64(h1);
        h2 = fmix64(h2);

        h1 += h2;
        h2 += h1;

        return h1 ^ h2; // non-standard 128->64 bit transformation
    }

    /** Packs the low bytes of the 8 chars starting at {@code index} into a little-endian long. */
    private static long getLittleEndianLong(final CharSequence data, final int index) {
        final long b0 = data.charAt(index) & 0xffL;
        final long b1 = data.charAt(index + 1) & 0xffL;
        final long b2 = data.charAt(index + 2) & 0xffL;
        final long b3 = data.charAt(index + 3) & 0xffL;
        final long b4 = data.charAt(index + 4) & 0xffL;
        final long b5 = data.charAt(index + 5) & 0xffL;
        final long b6 = data.charAt(index + 6) & 0xffL;
        final long b7 = data.charAt(index + 7) & 0xffL;
        return b0 | (b1 << 8) | (b2 << 16) | (b3 << 24)
                | (b4 << 32) | (b5 << 40) | (b6 << 48) | (b7 << 56);
    }

    /** Packs the 8 bytes starting at {@code index} into a little-endian long. */
    private static long getLittleEndianLong(final byte[] data, final int index) {
        final long b0 = data[index] & 0xffL;
        final long b1 = data[index + 1] & 0xffL;
        final long b2 = data[index + 2] & 0xffL;
        final long b3 = data[index + 3] & 0xffL;
        final long b4 = data[index + 4] & 0xffL;
        final long b5 = data[index + 5] & 0xffL;
        final long b6 = data[index + 6] & 0xffL;
        final long b7 = data[index + 7] & 0xffL;
        return b0 | (b1 << 8) | (b2 << 16) | (b3 << 24)
                | (b4 << 32) | (b5 << 40) | (b6 << 48) | (b7 << 56);
    }

    /** Murmur3's 64 bit finalization mix. */
    private static long fmix64(long hash) {
        long mixed = hash;
        mixed ^= mixed >>> 33;
        mixed *= 0xff51afd7ed558ccdL;
        mixed ^= mixed >>> 33;
        mixed *= 0xc4ceb9fe1a85ec53L;
        mixed ^= mixed >>> 33;
        return mixed;
    }

}
|
Loading…
Reference in New Issue
Block a user