001/*
002 * Licensed to the Apache Software Foundation (ASF) under one or more
003 * contributor license agreements.  See the NOTICE file distributed with
004 * this work for additional information regarding copyright ownership.
005 * The ASF licenses this file to You under the Apache License, Version 2.0
006 * (the "License"); you may not use this file except in compliance with
007 * the License.  You may obtain a copy of the License at
008 *
009 *      http://www.apache.org/licenses/LICENSE-2.0
010 *
011 * Unless required by applicable law or agreed to in writing, software
012 * distributed under the License is distributed on an "AS IS" BASIS,
013 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
014 * See the License for the specific language governing permissions and
015 * limitations under the License.
016 */
017
018package org.apache.commons.codec.digest;
019
020import org.apache.commons.codec.binary.StringUtils;
021
022/**
023 * Implementation of the MurmurHash3 32-bit and 128-bit hash functions.
024 *
025 * <p>
026 * MurmurHash is a non-cryptographic hash function suitable for general hash-based lookup. The name comes from two basic
027 * operations, multiply (MU) and rotate (R), used in its inner loop. Unlike cryptographic hash functions, it is not
028 * specifically designed to be difficult to reverse by an adversary, making it unsuitable for cryptographic purposes.
029 * </p>
030 *
031 * <p>
032 * This contains a Java port of the 32-bit hash function {@code MurmurHash3_x86_32} and the 128-bit hash function
033 * {@code MurmurHash3_x64_128} from Austin Appleby's original {@code c++} code in SMHasher.
034 * </p>
035 *
036 * <p>
037 * This is public domain code with no copyrights. From home page of
038 * <a href="https://github.com/aappleby/smhasher">SMHasher</a>:
039 * </p>
040 *
041 * <blockquote> "All MurmurHash versions are public domain software, and the author disclaims all copyright to their
042 * code." </blockquote>
043 *
044 * <p>
045 * Original adaption from Apache Hive. That adaption contains a {@code hash64} method that is not part of the original
046 * MurmurHash3 code. It is not recommended to use these methods. They will be removed in a future release. To obtain a
047 * 64-bit hash use half of the bits from the {@code hash128x64} methods using the input data converted to bytes.
048 * </p>
049 *
050 * @see <a href="https://en.wikipedia.org/wiki/MurmurHash">MurmurHash</a>
051 * @see <a href="https://github.com/aappleby/smhasher/blob/master/src/MurmurHash3.cpp"> Original MurmurHash3 c++
052 *      code</a>
053 * @see <a href=
054 *      "https://github.com/apache/hive/blob/master/storage-api/src/java/org/apache/hive/common/util/Murmur3.java">
055 *      Apache Hive Murmer3</a>
056 * @since 1.13
057 */
058public final class MurmurHash3 {
059
060    /**
061     * A random number to use for a hash code.
062     *
063     * @deprecated This is not used internally and will be removed in a future release.
064     */
065    @Deprecated
066    public static final long NULL_HASHCODE = 2862933555777941757L;
067
068    /**
069     * A default seed to use for the murmur hash algorithm.
070     * Has the value {@code 104729}.
071     */
072    public static final int DEFAULT_SEED = 104729;
073
074    // Constants for 32-bit variant
075    private static final int C1_32 = 0xcc9e2d51;
076    private static final int C2_32 = 0x1b873593;
077    private static final int R1_32 = 15;
078    private static final int R2_32 = 13;
079    private static final int M_32 = 5;
080    private static final int N_32 = 0xe6546b64;
081
082    // Constants for 128-bit variant
083    private static final long C1 = 0x87c37b91114253d5L;
084    private static final long C2 = 0x4cf5ad432745937fL;
085    private static final int R1 = 31;
086    private static final int R2 = 27;
087    private static final int R3 = 33;
088    private static final int M = 5;
089    private static final int N1 = 0x52dce729;
090    private static final int N2 = 0x38495ab5;
091
092    /** No instance methods. */
093    private MurmurHash3() {
094    }
095
096    /**
097     * Generates 32-bit hash from two longs with a default seed value.
098     * This is a helper method that will produce the same result as:
099     *
100     * <pre>
101     * int offset = 0;
102     * int seed = 104729;
103     * int hash = MurmurHash3.hash32x86(ByteBuffer.allocate(16)
104     *                                            .putLong(data1)
105     *                                            .putLong(data2)
106     *                                            .array(), offset, 16, seed);
107     * </pre>
108     *
109     * @param data1 The first long to hash
110     * @param data2 The second long to hash
111     * @return The 32-bit hash
112     * @see #hash32x86(byte[], int, int, int)
113     */
114    public static int hash32(final long data1, final long data2) {
115        return hash32(data1, data2, DEFAULT_SEED);
116    }
117
118    /**
119     * Generates 32-bit hash from two longs with the given seed.
120     * This is a helper method that will produce the same result as:
121     *
122     * <pre>
123     * int offset = 0;
124     * int hash = MurmurHash3.hash32x86(ByteBuffer.allocate(16)
125     *                                            .putLong(data1)
126     *                                            .putLong(data2)
127     *                                            .array(), offset, 16, seed);
128     * </pre>
129     *
130     * @param data1 The first long to hash
131     * @param data2 The second long to hash
132     * @param seed The initial seed value
133     * @return The 32-bit hash
134     * @see #hash32x86(byte[], int, int, int)
135     */
136    public static int hash32(final long data1, final long data2, final int seed) {
137        int hash = seed;
138        final long r0 = Long.reverseBytes(data1);
139        final long r1 = Long.reverseBytes(data2);
140
141        hash = mix32((int) r0, hash);
142        hash = mix32((int) (r0 >>> 32), hash);
143        hash = mix32((int) (r1), hash);
144        hash = mix32((int) (r1 >>> 32), hash);
145
146        hash ^= Long.BYTES * 2;
147        return fmix32(hash);
148    }
149
150    /**
151     * Generates 32-bit hash from a long with a default seed value.
152     * This is a helper method that will produce the same result as:
153     *
154     * <pre>
155     * int offset = 0;
156     * int seed = 104729;
157     * int hash = MurmurHash3.hash32x86(ByteBuffer.allocate(8)
158     *                                            .putLong(data)
159     *                                            .array(), offset, 8, seed);
160     * </pre>
161     *
162     * @param data The long to hash
163     * @return The 32-bit hash
164     * @see #hash32x86(byte[], int, int, int)
165     */
166    public static int hash32(final long data) {
167        return hash32(data, DEFAULT_SEED);
168    }
169
170    /**
171     * Generates 32-bit hash from a long with the given seed.
172     * This is a helper method that will produce the same result as:
173     *
174     * <pre>
175     * int offset = 0;
176     * int hash = MurmurHash3.hash32x86(ByteBuffer.allocate(8)
177     *                                            .putLong(data)
178     *                                            .array(), offset, 8, seed);
179     * </pre>
180     *
181     * @param data The long to hash
182     * @param seed The initial seed value
183     * @return The 32-bit hash
184     * @see #hash32x86(byte[], int, int, int)
185     */
186    public static int hash32(final long data, final int seed) {
187        int hash = seed;
188        final long r0 = Long.reverseBytes(data);
189
190        hash = mix32((int) r0, hash);
191        hash = mix32((int) (r0 >>> 32), hash);
192
193        hash ^= Long.BYTES;
194        return fmix32(hash);
195    }
196
197    /**
198     * Generates 32-bit hash from the byte array with a default seed.
199     * This is a helper method that will produce the same result as:
200     *
201     * <pre>
202     * int offset = 0;
203     * int seed = 104729;
204     * int hash = MurmurHash3.hash32(data, offset, data.length, seed);
205     * </pre>
206     *
207     * <p>This implementation contains a sign-extension bug in the finalization step of
208     * any bytes left over from dividing the length by 4. This manifests if any of these
209     * bytes are negative.</p>
210     *
211     * @param data The input byte array
212     * @return The 32-bit hash
213     * @see #hash32(byte[], int, int, int)
214     * @deprecated Use {@link #hash32x86(byte[], int, int, int)}. This corrects the processing of trailing bytes.
215     */
216    @Deprecated
217    public static int hash32(final byte[] data) {
218        return hash32(data, 0, data.length, DEFAULT_SEED);
219    }
220
221    /**
222     * Generates 32-bit hash from a string with a default seed.
223     * <p>
224     * Before 1.14 the string was converted using default encoding.
225     * Since 1.14 the string is converted to bytes using UTF-8 encoding.
226     * </p>
227     * This is a helper method that will produce the same result as:
228     *
229     * <pre>
230     * int offset = 0;
231     * int seed = 104729;
232     * byte[] bytes = data.getBytes(StandardCharsets.UTF_8);
233     * int hash = MurmurHash3.hash32(bytes, offset, bytes.length, seed);
234     * </pre>
235     *
236     * <p>This implementation contains a sign-extension bug in the finalization step of
237     * any bytes left over from dividing the length by 4. This manifests if any of these
238     * bytes are negative.</p>
239     *
240     * @param data The input string
241     * @return The 32-bit hash
242     * @see #hash32(byte[], int, int, int)
243     * @deprecated Use {@link #hash32x86(byte[], int, int, int)} with the bytes returned from
244     * {@link String#getBytes(java.nio.charset.Charset)}. This corrects the processing of trailing bytes.
245     */
246    @Deprecated
247    public static int hash32(final String data) {
248        final byte[] bytes = StringUtils.getBytesUtf8(data);
249        return hash32(bytes, 0, bytes.length, DEFAULT_SEED);
250    }
251
252    /**
253     * Generates 32-bit hash from the byte array with the given length and a default seed.
254     * This is a helper method that will produce the same result as:
255     *
256     * <pre>
257     * int offset = 0;
258     * int seed = 104729;
259     * int hash = MurmurHash3.hash32(data, offset, length, seed);
260     * </pre>
261     *
262     * <p>This implementation contains a sign-extension bug in the finalization step of
263     * any bytes left over from dividing the length by 4. This manifests if any of these
264     * bytes are negative.</p>
265     *
266     * @param data The input byte array
267     * @param length The length of array
268     * @return The 32-bit hash
269     * @see #hash32(byte[], int, int, int)
270     * @deprecated Use {@link #hash32x86(byte[], int, int, int)}. This corrects the processing of trailing bytes.
271     */
272    @Deprecated
273    public static int hash32(final byte[] data, final int length) {
274        return hash32(data, length, DEFAULT_SEED);
275    }
276
277    /**
278     * Generates 32-bit hash from the byte array with the given length and seed. This is a
279     * helper method that will produce the same result as:
280     *
281     * <pre>
282     * int offset = 0;
283     * int hash = MurmurHash3.hash32(data, offset, length, seed);
284     * </pre>
285     *
286     * <p>This implementation contains a sign-extension bug in the finalization step of
287     * any bytes left over from dividing the length by 4. This manifests if any of these
288     * bytes are negative.</p>
289     *
290     * @param data The input byte array
291     * @param length The length of array
292     * @param seed The initial seed value
293     * @return The 32-bit hash
294     * @see #hash32(byte[], int, int, int)
295     * @deprecated Use {@link #hash32x86(byte[], int, int, int)}. This corrects the processing of trailing bytes.
296     */
297    @Deprecated
298    public static int hash32(final byte[] data, final int length, final int seed) {
299        return hash32(data, 0, length, seed);
300    }
301
302    /**
303     * Generates 32-bit hash from the byte array with the given offset, length and seed.
304     *
305     * <p>This is an implementation of the 32-bit hash function {@code MurmurHash3_x86_32}
306     * from Austin Appleby's original MurmurHash3 {@code c++} code in SMHasher.</p>
307     *
308     * <p>This implementation contains a sign-extension bug in the finalization step of
309     * any bytes left over from dividing the length by 4. This manifests if any of these
310     * bytes are negative.</p>
311     *
312     * @param data The input byte array
313     * @param offset The offset of data
314     * @param length The length of array
315     * @param seed The initial seed value
316     * @return The 32-bit hash
317     * @deprecated Use {@link #hash32x86(byte[], int, int, int)}. This corrects the processing of trailing bytes.
318     */
319    @Deprecated
320    public static int hash32(final byte[] data, final int offset, final int length, final int seed) {
321        int hash = seed;
322        final int nblocks = length >> 2;
323
324        // body
325        for (int i = 0; i < nblocks; i++) {
326            final int index = offset + (i << 2);
327            final int k = getLittleEndianInt(data, index);
328            hash = mix32(k, hash);
329        }
330
331        // tail
332        // ************
333        // Note: This fails to apply masking using 0xff to the 3 remaining bytes.
334        // ************
335        final int index = offset + (nblocks << 2);
336        int k1 = 0;
337        switch (offset + length - index) {
338        case 3:
339            k1 ^= data[index + 2] << 16;
340        case 2:
341            k1 ^= data[index + 1] << 8;
342        case 1:
343            k1 ^= data[index];
344
345            // mix functions
346            k1 *= C1_32;
347            k1 = Integer.rotateLeft(k1, R1_32);
348            k1 *= C2_32;
349            hash ^= k1;
350        }
351
352        hash ^= length;
353        return fmix32(hash);
354    }
355
356    /**
357     * Generates 32-bit hash from the byte array with a seed of zero.
358     * This is a helper method that will produce the same result as:
359     *
360     * <pre>
361     * int offset = 0;
362     * int seed = 0;
363     * int hash = MurmurHash3.hash32x86(data, offset, data.length, seed);
364     * </pre>
365     *
366     * @param data The input byte array
367     * @return The 32-bit hash
368     * @see #hash32x86(byte[], int, int, int)
369     * @since 1.14
370     */
371    public static int hash32x86(final byte[] data) {
372        return hash32x86(data, 0, data.length, 0);
373    }
374
375    /**
376     * Generates 32-bit hash from the byte array with the given offset, length and seed.
377     *
378     * <p>This is an implementation of the 32-bit hash function {@code MurmurHash3_x86_32}
379     * from Austin Appleby's original MurmurHash3 {@code c++} code in SMHasher.</p>
380     *
381     * @param data The input byte array
382     * @param offset The offset of data
383     * @param length The length of array
384     * @param seed The initial seed value
385     * @return The 32-bit hash
386     * @since 1.14
387     */
388    public static int hash32x86(final byte[] data, final int offset, final int length, final int seed) {
389        int hash = seed;
390        final int nblocks = length >> 2;
391
392        // body
393        for (int i = 0; i < nblocks; i++) {
394            final int index = offset + (i << 2);
395            final int k = getLittleEndianInt(data, index);
396            hash = mix32(k, hash);
397        }
398
399        // tail
400        final int index = offset + (nblocks << 2);
401        int k1 = 0;
402        switch (offset + length - index) {
403        case 3:
404            k1 ^= (data[index + 2] & 0xff) << 16;
405        case 2:
406            k1 ^= (data[index + 1] & 0xff) << 8;
407        case 1:
408            k1 ^= (data[index] & 0xff);
409
410            // mix functions
411            k1 *= C1_32;
412            k1 = Integer.rotateLeft(k1, R1_32);
413            k1 *= C2_32;
414            hash ^= k1;
415        }
416
417        hash ^= length;
418        return fmix32(hash);
419    }
420
421    /**
422     * Generates 64-bit hash from a long with a default seed.
423     *
424     * <p><strong>This is not part of the original MurmurHash3 {@code c++} implementation.</strong></p>
425     *
426     * <p>This is a Murmur3-like 64-bit variant.
427     * The method does not produce the same result as either half of the hash bytes from
428     * {@linkplain #hash128x64(byte[])} with the same byte data from the {@code long}.
429     * This method will be removed in a future release.</p>
430     *
431     * <p>Note: The sign extension bug in {@link #hash64(byte[], int, int, int)} does not effect
432     * this result as the default seed is positive.</p>
433     *
434     * <p>This is a helper method that will produce the same result as:</p>
435     *
436     * <pre>
437     * int offset = 0;
438     * int seed = 104729;
439     * long hash = MurmurHash3.hash64(ByteBuffer.allocate(8)
440     *                                          .putLong(data)
441     *                                          .array(), offset, 8, seed);
442     * </pre>
443     *
444     * @param data The long to hash
445     * @return The 64-bit hash
446     * @see #hash64(byte[], int, int, int)
447     * @deprecated Not part of the MurmurHash3 implementation.
448     * Use half of the hash bytes from {@link #hash128x64(byte[])} with the bytes from the {@code long}.
449     */
450    @Deprecated
451    public static long hash64(final long data) {
452        long hash = DEFAULT_SEED;
453        long k = Long.reverseBytes(data);
454        // mix functions
455        k *= C1;
456        k = Long.rotateLeft(k, R1);
457        k *= C2;
458        hash ^= k;
459        hash = Long.rotateLeft(hash, R2) * M + N1;
460        // finalization
461        hash ^= Long.BYTES;
462        hash = fmix64(hash);
463        return hash;
464    }
465
466    /**
467     * Generates 64-bit hash from an int with a default seed.
468     *
469     * <p><strong>This is not part of the original MurmurHash3 {@code c++} implementation.</strong></p>
470     *
471     * <p>This is a Murmur3-like 64-bit variant.
472     * The method does not produce the same result as either half of the hash bytes from
473     * {@linkplain #hash128x64(byte[])} with the same byte data from the {@code int}.
474     * This method will be removed in a future release.</p>
475     *
476     * <p>Note: The sign extension bug in {@link #hash64(byte[], int, int, int)} does not effect
477     * this result as the default seed is positive.</p>
478     *
479     * <p>This is a helper method that will produce the same result as:</p>
480     *
481     * <pre>
482     * int offset = 0;
483     * int seed = 104729;
484     * long hash = MurmurHash3.hash64(ByteBuffer.allocate(4)
485     *                                          .putInt(data)
486     *                                          .array(), offset, 4, seed);
487     * </pre>
488     *
489     * @param data The int to hash
490     * @return The 64-bit hash
491     * @see #hash64(byte[], int, int, int)
492     * @deprecated Not part of the MurmurHash3 implementation.
493     * Use half of the hash bytes from {@link #hash128x64(byte[])} with the bytes from the {@code int}.
494     */
495    @Deprecated
496    public static long hash64(final int data) {
497        long k1 = Integer.reverseBytes(data) & (-1L >>> 32);
498        long hash = DEFAULT_SEED;
499        k1 *= C1;
500        k1 = Long.rotateLeft(k1, R1);
501        k1 *= C2;
502        hash ^= k1;
503        // finalization
504        hash ^= Integer.BYTES;
505        hash = fmix64(hash);
506        return hash;
507    }
508
509    /**
510     * Generates 64-bit hash from a short with a default seed.
511     *
512     * <p><strong>This is not part of the original MurmurHash3 {@code c++} implementation.</strong></p>
513     *
514     * <p>This is a Murmur3-like 64-bit variant.
515     * The method does not produce the same result as either half of the hash bytes from
516     * {@linkplain #hash128x64(byte[])} with the same byte data from the {@code short}.
517     * This method will be removed in a future release.</p>
518     *
519     * <p>Note: The sign extension bug in {@link #hash64(byte[], int, int, int)} does not effect
520     * this result as the default seed is positive.</p>
521     *
522     * <p>This is a helper method that will produce the same result as:</p>
523     *
524     * <pre>
525     * int offset = 0;
526     * int seed = 104729;
527     * long hash = MurmurHash3.hash64(ByteBuffer.allocate(2)
528     *                                          .putShort(data)
529     *                                          .array(), offset, 2, seed);
530     * </pre>
531     *
532     * @param data The short to hash
533     * @return The 64-bit hash
534     * @see #hash64(byte[], int, int, int)
535     * @deprecated Not part of the MurmurHash3 implementation.
536     * Use half of the hash bytes from {@link #hash128x64(byte[])} with the bytes from the {@code short}.
537     */
538    @Deprecated
539    public static long hash64(final short data) {
540        long hash = DEFAULT_SEED;
541        long k1 = 0;
542        k1 ^= ((long) data & 0xff) << 8;
543        k1 ^= ((long) ((data & 0xFF00) >> 8) & 0xff);
544        k1 *= C1;
545        k1 = Long.rotateLeft(k1, R1);
546        k1 *= C2;
547        hash ^= k1;
548
549        // finalization
550        hash ^= Short.BYTES;
551        hash = fmix64(hash);
552        return hash;
553    }
554
555    /**
556     * Generates 64-bit hash from a byte array with a default seed.
557     *
558     * <p><strong>This is not part of the original MurmurHash3 {@code c++} implementation.</strong></p>
559     *
560     * <p>This is a Murmur3-like 64-bit variant.
561     * The method does not produce the same result as either half of the hash bytes from
562     * {@linkplain #hash128x64(byte[])} with the same byte data.
563     * This method will be removed in a future release.</p>
564     *
565     * <p>Note: The sign extension bug in {@link #hash64(byte[], int, int, int)} does not effect
566     * this result as the default seed is positive.</p>
567     *
568     * <p>This is a helper method that will produce the same result as:</p>
569     *
570     * <pre>
571     * int offset = 0;
572     * int seed = 104729;
573     * long hash = MurmurHash3.hash64(data, offset, data.length, seed);
574     * </pre>
575     *
576     * @param data The input byte array
577     * @return The 64-bit hash
578     * @see #hash64(byte[], int, int, int)
579     * @deprecated Not part of the MurmurHash3 implementation.
580     * Use half of the hash bytes from {@link #hash128x64(byte[])}.
581     */
582    @Deprecated
583    public static long hash64(final byte[] data) {
584        return hash64(data, 0, data.length, DEFAULT_SEED);
585    }
586
587    /**
588     * Generates 64-bit hash from a byte array with the given offset and length and a default seed.
589     *
590     * <p><strong>This is not part of the original MurmurHash3 {@code c++} implementation.</strong></p>
591     *
592     * <p>This is a Murmur3-like 64-bit variant.
593     * The method does not produce the same result as either half of the hash bytes from
594     * {@linkplain #hash128x64(byte[])} with the same byte data.
595     * This method will be removed in a future release.</p>
596     *
597     * <p>Note: The sign extension bug in {@link #hash64(byte[], int, int, int)} does not effect
598     * this result as the default seed is positive.</p>
599     *
600     * <p>This is a helper method that will produce the same result as:</p>
601     *
602     * <pre>
603     * int seed = 104729;
604     * long hash = MurmurHash3.hash64(data, offset, length, seed);
605     * </pre>
606     *
607     * @param data The input byte array
608     * @param offset The offset of data
609     * @param length The length of array
610     * @return The 64-bit hash
611     * @see #hash64(byte[], int, int, int)
612     * @deprecated Not part of the MurmurHash3 implementation.
613     * Use half of the hash bytes from {@link #hash128x64(byte[], int, int, int)}.
614     */
615    @Deprecated
616    public static long hash64(final byte[] data, final int offset, final int length) {
617        return hash64(data, offset, length, DEFAULT_SEED);
618    }
619
620    /**
621     * Generates 64-bit hash from a byte array with the given offset, length and seed.
622     *
623     * <p><strong>This is not part of the original MurmurHash3 {@code c++} implementation.</strong></p>
624     *
625     * <p>This is a Murmur3-like 64-bit variant.
626     * This method will be removed in a future release.</p>
627     *
628     * <p>This implementation contains a sign-extension bug in the seed initialization.
629     * This manifests if the seed is negative.</p>
630     *
631     * <p>This algorithm processes 8 bytes chunks of data in a manner similar to the 16 byte chunks
632     * of data processed in the MurmurHash3 {@code MurmurHash3_x64_128} method. However the hash
633     * is not mixed with a hash chunk from the next 8 bytes of data. The method will not return
634     * the same value as the first or second 64-bits of the function
635     * {@link #hash128(byte[], int, int, int)}.</p>
636     *
637     * <p>Use of this method is not advised. Use the first long returned from
638     * {@link #hash128x64(byte[], int, int, int)}.</p>
639     *
640     * @param data The input byte array
641     * @param offset The offset of data
642     * @param length The length of array
643     * @param seed The initial seed value
644     * @return The 64-bit hash
645     * @deprecated Not part of the MurmurHash3 implementation.
646     * Use half of the hash bytes from {@link #hash128x64(byte[], int, int, int)}.
647     */
648    @Deprecated
649    public static long hash64(final byte[] data, final int offset, final int length, final int seed) {
650        //
651        // Note: This fails to apply masking using 0xffffffffL to the seed.
652        //
653        long hash = seed;
654        final int nblocks = length >> 3;
655
656        // body
657        for (int i = 0; i < nblocks; i++) {
658            final int index = offset + (i << 3);
659            long k = getLittleEndianLong(data, index);
660
661            // mix functions
662            k *= C1;
663            k = Long.rotateLeft(k, R1);
664            k *= C2;
665            hash ^= k;
666            hash = Long.rotateLeft(hash, R2) * M + N1;
667        }
668
669        // tail
670        long k1 = 0;
671        final int index = offset + (nblocks << 3);
672        switch (offset + length - index) {
673        case 7:
674            k1 ^= ((long) data[index + 6] & 0xff) << 48;
675        case 6:
676            k1 ^= ((long) data[index + 5] & 0xff) << 40;
677        case 5:
678            k1 ^= ((long) data[index + 4] & 0xff) << 32;
679        case 4:
680            k1 ^= ((long) data[index + 3] & 0xff) << 24;
681        case 3:
682            k1 ^= ((long) data[index + 2] & 0xff) << 16;
683        case 2:
684            k1 ^= ((long) data[index + 1] & 0xff) << 8;
685        case 1:
686            k1 ^= ((long) data[index] & 0xff);
687            k1 *= C1;
688            k1 = Long.rotateLeft(k1, R1);
689            k1 *= C2;
690            hash ^= k1;
691        }
692
693        // finalization
694        hash ^= length;
695        hash = fmix64(hash);
696
697        return hash;
698    }
699
700    /**
701     * Generates 128-bit hash from the byte array with a default seed.
702     * This is a helper method that will produce the same result as:
703     *
704     * <pre>
705     * int offset = 0;
706     * int seed = 104729;
707     * int hash = MurmurHash3.hash128(data, offset, data.length, seed);
708     * </pre>
709     *
710     * <p>Note: The sign extension bug in {@link #hash128(byte[], int, int, int)} does not effect
711     * this result as the default seed is positive.</p>
712     *
713     * @param data The input byte array
714     * @return The 128-bit hash (2 longs)
715     * @see #hash128(byte[], int, int, int)
716     */
717    public static long[] hash128(final byte[] data) {
718        return hash128(data, 0, data.length, DEFAULT_SEED);
719    }
720
721    /**
722     * Generates 128-bit hash from the byte array with a seed of zero.
723     * This is a helper method that will produce the same result as:
724     *
725     * <pre>
726     * int offset = 0;
727     * int seed = 0;
728     * int hash = MurmurHash3.hash128x64(data, offset, data.length, seed);
729     * </pre>
730     *
731     * @param data The input byte array
732     * @return The 128-bit hash (2 longs)
733     * @see #hash128x64(byte[], int, int, int)
734     * @since 1.14
735     */
736    public static long[] hash128x64(final byte[] data) {
737        return hash128x64(data, 0, data.length, 0);
738    }
739
740    /**
741     * Generates 128-bit hash from a string with a default seed.
742     * <p>
743     * Before 1.14 the string was converted using default encoding.
744     * Since 1.14 the string is converted to bytes using UTF-8 encoding.
745     * </p>
746     * <p>
747     * This is a helper method that will produce the same result as:
748     * </p>
749     *
750     * <pre>
751     * int offset = 0;
752     * int seed = 104729;
753     * byte[] bytes = data.getBytes(StandardCharsets.UTF_8);
754     * int hash = MurmurHash3.hash128(bytes, offset, bytes.length, seed);
755     * </pre>
756     *
757     * <p>Note: The sign extension bug in {@link #hash128(byte[], int, int, int)} does not effect
758     * this result as the default seed is positive.</p>
759     *
760     * @param data The input String
761     * @return The 128-bit hash (2 longs)
762     * @see #hash128(byte[], int, int, int)
763     * @deprecated Use {@link #hash128x64(byte[])} using the bytes returned from
764     * {@link String#getBytes(java.nio.charset.Charset)}.
765     */
766    @Deprecated
767    public static long[] hash128(final String data) {
768        final byte[] bytes = StringUtils.getBytesUtf8(data);
769        return hash128(bytes, 0, bytes.length, DEFAULT_SEED);
770    }
771
772    /**
773     * Generates 128-bit hash from the byte array with the given offset, length and seed.
774     *
775     * <p>This is an implementation of the 128-bit hash function {@code MurmurHash3_x64_128}
776     * from Austin Appleby's original MurmurHash3 {@code c++} code in SMHasher.</p>
777     *
778     * <p>This implementation contains a sign-extension bug in the seed initialization.
779     * This manifests if the seed is negative.</p>
780     *
781     * @param data The input byte array
782     * @param offset The first element of array
783     * @param length The length of array
784     * @param seed The initial seed value
785     * @return The 128-bit hash (2 longs)
786     * @deprecated Use {@link #hash128x64(byte[], int, int, int)}. This corrects the seed initialization.
787     */
788    @Deprecated
789    public static long[] hash128(final byte[] data, final int offset, final int length, final int seed) {
790        // ************
791        // Note: This deliberately fails to apply masking using 0xffffffffL to the seed
792        // to maintain behavioral compatibility with the original version.
793        // The implicit conversion to a long will extend a negative sign
794        // bit through the upper 32-bits of the long seed. These should be zero.
795        // ************
796        return hash128x64Internal(data, offset, length, seed);
797    }
798
799    /**
800     * Generates 128-bit hash from the byte array with the given offset, length and seed.
801     *
802     * <p>This is an implementation of the 128-bit hash function {@code MurmurHash3_x64_128}
803     * from Austin Appleby's original MurmurHash3 {@code c++} code in SMHasher.</p>
804     *
805     * @param data The input byte array
806     * @param offset The first element of array
807     * @param length The length of array
808     * @param seed The initial seed value
809     * @return The 128-bit hash (2 longs)
810     * @since 1.14
811     */
812    public static long[] hash128x64(final byte[] data, final int offset, final int length, final int seed) {
813        // Use an unsigned 32-bit integer as the seed
814        return hash128x64Internal(data, offset, length, seed & 0xffffffffL);
815    }
816
817    /**
818     * Generates 128-bit hash from the byte array with the given offset, length and seed.
819     *
820     * <p>This is an implementation of the 128-bit hash function {@code MurmurHash3_x64_128}
821     * from Austin Appleby's original MurmurHash3 {@code c++} code in SMHasher.</p>
822     *
823     * @param data The input byte array
824     * @param offset The first element of array
825     * @param length The length of array
826     * @param seed The initial seed value
827     * @return The 128-bit hash (2 longs)
828     */
829    private static long[] hash128x64Internal(final byte[] data, final int offset, final int length, final long seed) {
830        long h1 = seed;
831        long h2 = seed;
832        final int nblocks = length >> 4;
833
834        // body
835        for (int i = 0; i < nblocks; i++) {
836            final int index = offset + (i << 4);
837            long k1 = getLittleEndianLong(data, index);
838            long k2 = getLittleEndianLong(data, index + 8);
839
840            // mix functions for k1
841            k1 *= C1;
842            k1 = Long.rotateLeft(k1, R1);
843            k1 *= C2;
844            h1 ^= k1;
845            h1 = Long.rotateLeft(h1, R2);
846            h1 += h2;
847            h1 = h1 * M + N1;
848
849            // mix functions for k2
850            k2 *= C2;
851            k2 = Long.rotateLeft(k2, R3);
852            k2 *= C1;
853            h2 ^= k2;
854            h2 = Long.rotateLeft(h2, R1);
855            h2 += h1;
856            h2 = h2 * M + N2;
857        }
858
859        // tail
860        long k1 = 0;
861        long k2 = 0;
862        final int index = offset + (nblocks << 4);
863        switch (offset + length - index) {
864        case 15:
865            k2 ^= ((long) data[index + 14] & 0xff) << 48;
866        case 14:
867            k2 ^= ((long) data[index + 13] & 0xff) << 40;
868        case 13:
869            k2 ^= ((long) data[index + 12] & 0xff) << 32;
870        case 12:
871            k2 ^= ((long) data[index + 11] & 0xff) << 24;
872        case 11:
873            k2 ^= ((long) data[index + 10] & 0xff) << 16;
874        case 10:
875            k2 ^= ((long) data[index + 9] & 0xff) << 8;
876        case 9:
877            k2 ^= data[index + 8] & 0xff;
878            k2 *= C2;
879            k2 = Long.rotateLeft(k2, R3);
880            k2 *= C1;
881            h2 ^= k2;
882
883        case 8:
884            k1 ^= ((long) data[index + 7] & 0xff) << 56;
885        case 7:
886            k1 ^= ((long) data[index + 6] & 0xff) << 48;
887        case 6:
888            k1 ^= ((long) data[index + 5] & 0xff) << 40;
889        case 5:
890            k1 ^= ((long) data[index + 4] & 0xff) << 32;
891        case 4:
892            k1 ^= ((long) data[index + 3] & 0xff) << 24;
893        case 3:
894            k1 ^= ((long) data[index + 2] & 0xff) << 16;
895        case 2:
896            k1 ^= ((long) data[index + 1] & 0xff) << 8;
897        case 1:
898            k1 ^= data[index] & 0xff;
899            k1 *= C1;
900            k1 = Long.rotateLeft(k1, R1);
901            k1 *= C2;
902            h1 ^= k1;
903        }
904
905        // finalization
906        h1 ^= length;
907        h2 ^= length;
908
909        h1 += h2;
910        h2 += h1;
911
912        h1 = fmix64(h1);
913        h2 = fmix64(h2);
914
915        h1 += h2;
916        h2 += h1;
917
918        return new long[] { h1, h2 };
919    }
920
921    /**
922     * Gets the little-endian long from 8 bytes starting at the specified index.
923     *
924     * @param data The data
925     * @param index The index
926     * @return The little-endian long
927     */
928    private static long getLittleEndianLong(final byte[] data, final int index) {
929        return (((long) data[index    ] & 0xff)      ) |
930               (((long) data[index + 1] & 0xff) <<  8) |
931               (((long) data[index + 2] & 0xff) << 16) |
932               (((long) data[index + 3] & 0xff) << 24) |
933               (((long) data[index + 4] & 0xff) << 32) |
934               (((long) data[index + 5] & 0xff) << 40) |
935               (((long) data[index + 6] & 0xff) << 48) |
936               (((long) data[index + 7] & 0xff) << 56);
937    }
938
939    /**
940     * Gets the little-endian int from 4 bytes starting at the specified index.
941     *
942     * @param data The data
943     * @param index The index
944     * @return The little-endian int
945     */
946    private static int getLittleEndianInt(final byte[] data, final int index) {
947        return ((data[index    ] & 0xff)      ) |
948               ((data[index + 1] & 0xff) <<  8) |
949               ((data[index + 2] & 0xff) << 16) |
950               ((data[index + 3] & 0xff) << 24);
951    }
952
953    /**
954     * Performs the intermediate mix step of the 32-bit hash function {@code MurmurHash3_x86_32}.
955     *
956     * @param k The data to add to the hash
957     * @param hash The current hash
958     * @return The new hash
959     */
960    private static int mix32(int k, int hash) {
961        k *= C1_32;
962        k = Integer.rotateLeft(k, R1_32);
963        k *= C2_32;
964        hash ^= k;
965        return Integer.rotateLeft(hash, R2_32) * M_32 + N_32;
966    }
967
968    /**
969     * Performs the final avalanche mix step of the 32-bit hash function {@code MurmurHash3_x86_32}.
970     *
971     * @param hash The current hash
972     * @return The final hash
973     */
974    private static int fmix32(int hash) {
975        hash ^= (hash >>> 16);
976        hash *= 0x85ebca6b;
977        hash ^= (hash >>> 13);
978        hash *= 0xc2b2ae35;
979        hash ^= (hash >>> 16);
980        return hash;
981    }
982
983    /**
984     * Performs the final avalanche mix step of the 64-bit hash function {@code MurmurHash3_x64_128}.
985     *
986     * @param hash The current hash
987     * @return The final hash
988     */
989    private static long fmix64(long hash) {
990        hash ^= (hash >>> 33);
991        hash *= 0xff51afd7ed558ccdL;
992        hash ^= (hash >>> 33);
993        hash *= 0xc4ceb9fe1a85ec53L;
994        hash ^= (hash >>> 33);
995        return hash;
996    }
997
998    /**
999     * Generates 32-bit hash from input bytes. Bytes can be added incrementally and the new
1000     * hash computed.
1001     *
1002     * <p>This is an implementation of the 32-bit hash function {@code MurmurHash3_x86_32}
1003     * from Austin Appleby's original MurmurHash3 {@code c++} code in SMHasher.</p>
1004     *
1005     * @since 1.14
1006     */
1007    public static class IncrementalHash32x86 {
1008
1009        /** The size of byte blocks that are processed together. */
1010        private static final int BLOCK_SIZE = 4;
1011
1012        /** Up to 3 unprocessed bytes from input data. */
1013        private final byte[] unprocessed = new byte[3];
1014
1015        /** The number of unprocessed bytes in the tail data. */
1016        private int unprocessedLength;
1017
1018        /** The total number of input bytes added since the start. */
1019        private int totalLen;
1020
1021        /**
1022         * The current running hash.
1023         * This must be finalised to generate the 32-bit hash value.
1024         */
1025        private int hash;
1026
1027        /**
1028         * Starts a new incremental hash.
1029         *
1030         * @param seed The initial seed value
1031         */
1032        public final void start(final int seed) {
1033            // Reset
1034            unprocessedLength = totalLen = 0;
1035            this.hash = seed;
1036        }
1037
1038        /**
1039         * Adds the byte array to the current incremental hash.
1040         *
1041         * @param data The input byte array
1042         * @param offset The offset of data
1043         * @param length The length of array
1044         */
1045        public final void add(final byte[] data, final int offset, final int length) {
1046            if (length <= 0) {
1047                // Nothing to add
1048                return;
1049            }
1050            totalLen += length;
1051
1052            // Process the bytes in blocks of 4.
1053            // New bytes must be added to any current unprocessed bytes,
1054            // then processed in blocks of 4 and the remaining bytes saved:
1055            //
1056            //    |--|---------------------------|--|
1057            // unprocessed
1058            //                main block
1059            //                                remaining
1060
1061            // Check if the unprocessed bytes and new bytes can fill a block of 4.
1062            // Make this overflow safe in the event that length is Integer.MAX_VALUE.
1063            // Equivalent to: (unprocessedLength + length < BLOCK_SIZE)
1064            if (unprocessedLength + length - BLOCK_SIZE < 0) {
1065                // Not enough so add to the unprocessed bytes
1066                System.arraycopy(data, offset, unprocessed, unprocessedLength, length);
1067                unprocessedLength += length;
1068                return;
1069            }
1070
1071            // Combine unprocessed bytes with new bytes.
1072            final int newOffset;
1073            final int newLength;
1074            if (unprocessedLength > 0) {
1075                int k = -1;
1076                switch (unprocessedLength) {
1077                case 1:
1078                    k = orBytes(unprocessed[0], data[offset], data[offset + 1], data[offset + 2]);
1079                    break;
1080                case 2:
1081                    k = orBytes(unprocessed[0], unprocessed[1], data[offset], data[offset + 1]);
1082                    break;
1083                case 3:
1084                    k = orBytes(unprocessed[0], unprocessed[1], unprocessed[2], data[offset]);
1085                    break;
1086                default:
1087                    throw new IllegalStateException("Unprocessed length should be 1, 2, or 3: " + unprocessedLength);
1088                }
1089                hash = mix32(k, hash);
1090                // Update the offset and length
1091                final int consumed = BLOCK_SIZE - unprocessedLength;
1092                newOffset = offset + consumed;
1093                newLength = length - consumed;
1094            } else {
1095                newOffset = offset;
1096                newLength = length;
1097            }
1098
1099            // Main processing of blocks of 4 bytes
1100            final int nblocks = newLength >> 2;
1101
1102            for (int i = 0; i < nblocks; i++) {
1103                final int index = newOffset + (i << 2);
1104                final int k = getLittleEndianInt(data, index);
1105                hash = mix32(k, hash);
1106            }
1107
1108            // Save left-over unprocessed bytes
1109            final int consumed = (nblocks << 2);
1110            unprocessedLength = newLength - consumed;
1111            if (unprocessedLength != 0) {
1112                System.arraycopy(data, newOffset + consumed, unprocessed, 0, unprocessedLength);
1113            }
1114        }
1115
1116        /**
1117         * Generates the 32-bit hash value. Repeat calls to this method with no additional data
1118         * will generate the same hash value.
1119         *
1120         * @return The 32-bit hash
1121         */
1122        public final int end() {
1123            // Allow calling end() again after adding no data to return the same result.
1124            return finalise(hash, unprocessedLength, unprocessed, totalLen);
1125        }
1126
1127        /**
1128         * Finalizes the running hash to the output 32-bit hash by processing remaining bytes
1129         * and performing final mixing.
1130         *
1131         * @param hash The running hash
1132         * @param unprocessedLength The number of unprocessed bytes in the tail data.
1133         * @param unprocessed Up to 3 unprocessed bytes from input data.
1134         * @param totalLen The total number of input bytes added since the start.
1135         * @return The 32-bit hash
1136         */
1137        int finalise(final int hash, final int unprocessedLength, final byte[] unprocessed, final int totalLen) {
1138            int result = hash;
1139            int k1 = 0;
1140            switch (unprocessedLength) {
1141            case 3:
1142                k1 ^= (unprocessed[2] & 0xff) << 16;
1143            case 2:
1144                k1 ^= (unprocessed[1] & 0xff) << 8;
1145            case 1:
1146                k1 ^= (unprocessed[0] & 0xff);
1147
1148                // mix functions
1149                k1 *= C1_32;
1150                k1 = Integer.rotateLeft(k1, R1_32);
1151                k1 *= C2_32;
1152                result ^= k1;
1153            }
1154
1155            // finalization
1156            result ^= totalLen;
1157            return fmix32(result);
1158        }
1159
1160        /**
1161         * Combines the bytes using an Or operation ({@code | } in a little-endian representation
1162         * of a 32-bit integer; byte 1 will be the least significant byte, byte 4 the most
1163         * significant.
1164         *
1165         * @param b1 The first byte
1166         * @param b2 The second byte
1167         * @param b3 The third byte
1168         * @param b4 The fourth byte
1169         * @return The 32-bit integer
1170         */
1171        private static int orBytes(final byte b1, final byte b2, final byte b3, final byte b4) {
1172            return (b1 & 0xff) | ((b2 & 0xff) << 8) | ((b3 & 0xff) << 16) | ((b4 & 0xff) << 24);
1173        }
1174    }
1175
1176    /**
1177     * Generates 32-bit hash from input bytes. Bytes can be added incrementally and the new
1178     * hash computed.
1179     *
1180     * <p>This is an implementation of the 32-bit hash function {@code MurmurHash3_x86_32}
1181     * from Austin Appleby's original MurmurHash3 {@code c++} code in SMHasher.</p>
1182     *
1183     * <p>This implementation contains a sign-extension bug in the finalization step of
1184     * any bytes left over from dividing the length by 4. This manifests if any of these
1185     * bytes are negative.</p>
1186     *
1187     * @deprecated Use IncrementalHash32x86. This corrects the processing of trailing bytes.
1188     */
1189    @Deprecated
1190    public static class IncrementalHash32 extends IncrementalHash32x86 {
1191
1192        /**
1193         * {@inheritDoc}
1194         *
1195         * <p>This implementation contains a sign-extension bug in the finalization step of
1196         * any bytes left over from dividing the length by 4. This manifests if any of these
1197         * bytes are negative.<p>
1198         *
1199         * @deprecated Use IncrementalHash32x86. This corrects the processing of trailing bytes.
1200         */
1201        @Override
1202        @Deprecated
1203        int finalise(final int hash, final int unprocessedLength, final byte[] unprocessed, final int totalLen) {
1204            int result = hash;
1205            // ************
1206            // Note: This fails to apply masking using 0xff to the 3 remaining bytes.
1207            // ************
1208            int k1 = 0;
1209            switch (unprocessedLength) {
1210            case 3:
1211                k1 ^= unprocessed[2] << 16;
1212            case 2:
1213                k1 ^= unprocessed[1] << 8;
1214            case 1:
1215                k1 ^= unprocessed[0];
1216
1217                // mix functions
1218                k1 *= C1_32;
1219                k1 = Integer.rotateLeft(k1, R1_32);
1220                k1 *= C2_32;
1221                result ^= k1;
1222            }
1223
1224            // finalization
1225            result ^= totalLen;
1226            return fmix32(result);
1227        }
1228    }
1229}