MurmurHash2.java

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.commons.codec.digest;

import org.apache.commons.codec.binary.StringUtils;

/**
 * Implements the MurmurHash2 32-bit and 64-bit hash functions.
 *
 * <p>MurmurHash is a non-cryptographic hash function suitable for general
 * hash-based lookup. The name comes from two basic operations, multiply (MU)
 * and rotate (R), used in its inner loop. Unlike cryptographic hash functions,
 * it is not specifically designed to be difficult to reverse by an adversary,
 * making it unsuitable for cryptographic purposes.</p>
 *
 * <p>This contains a Java port of the 32-bit hash function {@code MurmurHash2}
 * and the 64-bit hash function {@code MurmurHash64A} from Austin Appleby's
 * original {@code c++} code in SMHasher.</p>
 *
 * <p>This is a re-implementation of the original C code plus some additional
 * features.</p>
 *
 * <p>This is public domain code with no copyrights. From home page of
 * <a href="https://github.com/aappleby/smhasher">SMHasher</a>:</p>
 *
 * <blockquote>
 * "All MurmurHash versions are public domain software, and the author
 * disclaims all copyright to their code."
 * </blockquote>
 *
 * @see <a href="https://en.wikipedia.org/wiki/MurmurHash">MurmurHash</a>
 * @see <a href="https://github.com/aappleby/smhasher/blob/master/src/MurmurHash2.cpp">
 *   Original MurmurHash2 c++ code</a>
 * @since 1.13
 */
public final class MurmurHash2 {

    // Constants for 32-bit variant
    private static final int M32 = 0x5bd1e995;
    private static final int R32 = 24;

    // Constants for 64-bit variant
    private static final long M64 = 0xc6a4a7935bd1e995L;
    private static final int R64 = 47;

    /**
     * Gets the little-endian int from 4 bytes starting at the specified index.
     *
     * @param data The data
     * @param index The index
     * @return The little-endian int
     */
    private static int getLittleEndianInt(final byte[] data, final int index) {
        return data[index    ] & 0xff |
               (data[index + 1] & 0xff) <<  8 |
               (data[index + 2] & 0xff) << 16 |
               (data[index + 3] & 0xff) << 24;
    }

    /**
     * Gets the little-endian long from 8 bytes starting at the specified index.
     *
     * @param data The data
     * @param index The index
     * @return The little-endian long
     */
    private static long getLittleEndianLong(final byte[] data, final int index) {
        return (long) data[index    ] & 0xff |
               ((long) data[index + 1] & 0xff) <<  8 |
               ((long) data[index + 2] & 0xff) << 16 |
               ((long) data[index + 3] & 0xff) << 24 |
               ((long) data[index + 4] & 0xff) << 32 |
               ((long) data[index + 5] & 0xff) << 40 |
               ((long) data[index + 6] & 0xff) << 48 |
               ((long) data[index + 7] & 0xff) << 56;
    }

    /**
     * Generates a 32-bit hash from byte array with the given length and a default seed value.
     * This is a helper method that will produce the same result as:
     *
     * <pre>
     * int seed = 0x9747b28c;
     * int hash = MurmurHash2.hash32(data, length, seed);
     * </pre>
     *
     * @param data The input byte array
     * @param length The length of the array
     * @return The 32-bit hash
     * @see #hash32(byte[], int, int)
     */
    public static int hash32(final byte[] data, final int length) {
        return hash32(data, length, 0x9747b28c);
    }

    /**
     * Generates a 32-bit hash from byte array with the given length and seed.
     *
     * @param data The input byte array
     * @param length The length of the array
     * @param seed The initial seed value
     * @return The 32-bit hash
     */
    public static int hash32(final byte[] data, final int length, final int seed) {
        // Initialize the hash to a random value
        int h = seed ^ length;

        // Mix 4 bytes at a time into the hash
        final int nblocks = length >> 2;

        // body
        for (int i = 0; i < nblocks; i++) {
            final int index = i << 2;
            int k = getLittleEndianInt(data, index);
            k *= M32;
            k ^= k >>> R32;
            k *= M32;
            h *= M32;
            h ^= k;
        }

        // Handle the last few bytes of the input array
        final int index = nblocks << 2;
        switch (length - index) {
        case 3:
            h ^= (data[index + 2] & 0xff) << 16;
        case 2:
            h ^= (data[index + 1] & 0xff) << 8;
        case 1:
            h ^= data[index] & 0xff;
            h *= M32;
        }

        // Do a few final mixes of the hash to ensure the last few
        // bytes are well-incorporated.
        h ^= h >>> 13;
        h *= M32;
        h ^= h >>> 15;

        return h;
    }

    /**
     * Generates a 32-bit hash from a string with a default seed.
     * <p>
     * Before 1.14 the string was converted using default encoding.
     * Since 1.14 the string is converted to bytes using UTF-8 encoding.
     * </p>
     * This is a helper method that will produce the same result as:
     *
     * <pre>
     * int seed = 0x9747b28c;
     * byte[] bytes = data.getBytes(StandardCharsets.UTF_8);
     * int hash = MurmurHash2.hash32(bytes, bytes.length, seed);
     * </pre>
     *
     * @param text The input string
     * @return The 32-bit hash
     * @see #hash32(byte[], int, int)
     */
    public static int hash32(final String text) {
        final byte[] bytes = StringUtils.getBytesUtf8(text);
        return hash32(bytes, bytes.length);
    }

    /**
     * Generates a 32-bit hash from a substring with a default seed value.
     * The string is converted to bytes using the default encoding.
     * This is a helper method that will produce the same result as:
     *
     * <pre>
     * int seed = 0x9747b28c;
     * byte[] bytes = text.substring(from, from + length).getBytes(StandardCharsets.UTF_8);
     * int hash = MurmurHash2.hash32(bytes, bytes.length, seed);
     * </pre>
     *
     * @param text The input string
     * @param from The starting index
     * @param length The length of the substring
     * @return The 32-bit hash
     * @see #hash32(byte[], int, int)
     */
    public static int hash32(final String text, final int from, final int length) {
        return hash32(text.substring(from, from + length));
    }

    /**
     * Generates a 64-bit hash from byte array with given length and a default seed value.
     * This is a helper method that will produce the same result as:
     *
     * <pre>
     * int seed = 0xe17a1465;
     * int hash = MurmurHash2.hash64(data, length, seed);
     * </pre>
     *
     * @param data The input byte array
     * @param length The length of the array
     * @return The 64-bit hash
     * @see #hash64(byte[], int, int)
     */
    public static long hash64(final byte[] data, final int length) {
        return hash64(data, length, 0xe17a1465);
    }

    /**
     * Generates a 64-bit hash from byte array of the given length and seed.
     *
     * @param data The input byte array
     * @param length The length of the array
     * @param seed The initial seed value
     * @return The 64-bit hash of the given array
     */
    public static long hash64(final byte[] data, final int length, final int seed) {
        long h = seed & 0xffffffffL ^ length * M64;

        final int nblocks = length >> 3;

        // body
        for (int i = 0; i < nblocks; i++) {
            final int index = i << 3;
            long k = getLittleEndianLong(data, index);

            k *= M64;
            k ^= k >>> R64;
            k *= M64;

            h ^= k;
            h *= M64;
        }

        final int index = nblocks << 3;
        switch (length - index) {
        case 7:
            h ^= ((long) data[index + 6] & 0xff) << 48;
        case 6:
            h ^= ((long) data[index + 5] & 0xff) << 40;
        case 5:
            h ^= ((long) data[index + 4] & 0xff) << 32;
        case 4:
            h ^= ((long) data[index + 3] & 0xff) << 24;
        case 3:
            h ^= ((long) data[index + 2] & 0xff) << 16;
        case 2:
            h ^= ((long) data[index + 1] & 0xff) << 8;
        case 1:
            h ^= (long) data[index] & 0xff;
            h *= M64;
        }

        h ^= h >>> R64;
        h *= M64;
        h ^= h >>> R64;

        return h;
    }

    /**
     * Generates a 64-bit hash from a string with a default seed.
     * <p>
     * Before 1.14 the string was converted using default encoding.
     * Since 1.14 the string is converted to bytes using UTF-8 encoding.
     * </p>
     * <p>
     * This is a helper method that will produce the same result as:
     * </p>
     *
     * <pre>
     * int seed = 0xe17a1465;
     * byte[] bytes = data.getBytes(StandardCharsets.UTF_8);
     * int hash = MurmurHash2.hash64(bytes, bytes.length, seed);
     * </pre>
     *
     * @param text The input string
     * @return The 64-bit hash
     * @see #hash64(byte[], int, int)
     */
    public static long hash64(final String text) {
        final byte[] bytes = StringUtils.getBytesUtf8(text);
        return hash64(bytes, bytes.length);
    }

    /**
     * Generates a 64-bit hash from a substring with a default seed value.
     * The string is converted to bytes using the default encoding.
     * This is a helper method that will produce the same result as:
     *
     * <pre>
     * int seed = 0xe17a1465;
     * byte[] bytes = text.substring(from, from + length).getBytes(StandardCharsets.UTF_8);
     * int hash = MurmurHash2.hash64(bytes, bytes.length, seed);
     * </pre>
     *
     * @param text The input string
     * @param from The starting index
     * @param length The length of the substring
     * @return The 64-bit hash
     * @see #hash64(byte[], int, int)
     */
    public static long hash64(final String text, final int from, final int length) {
        return hash64(text.substring(from, from + length));
    }

    /** No instance methods. */
    private MurmurHash2() {
    }
}