murmurhash2 Python实现(复现开源项目主页上的 Murmurhash2)


  • 1. 简介
  • 1. 原始版本的murmurhash2算法(Google Code 的 Murmurhash 开源项目主页上的 Murmurhash2)
  • 2. Python版

1. 简介

Murmur哈希(MurmurHash)是一种非加密散列函数,适用于一般的基于散列的查找。它由Austin Appleby于2008年创建,目前与其名为“SMHasher”的测试套件一起托管在GitHub上。

1. 原始版本的murmurhash2算法(Google Code 的 Murmurhash 开源项目主页上的 Murmurhash2)

/*
 * MurmurHash2, 32-bit variant.
 *
 * key  - pointer to the bytes to hash
 * len  - number of bytes at key
 * seed - initial hash seed
 *
 * Returns the 32-bit hash. All arithmetic is unsigned and wraps modulo 2^32.
 */
uint32_t MurmurHash2 ( const void * key, int len, uint32_t seed )
{
  // 'm' and 'r' are mixing constants generated offline.
  // They're not really 'magic', they just happen to work well.
  const uint32_t m = 0x5bd1e995;
  const int r = 24;

  // Initialize the hash to a 'random' value
  uint32_t h = seed ^ len;

  // Mix 4 bytes at a time into the hash
  const unsigned char * data = (const unsigned char *)key;

  while(len >= 4)
  {
    // Reads the next 4 bytes in the host's native byte order.
    uint32_t k = *(uint32_t*)data;

    k *= m;
    k ^= k >> r;
    k *= m;

    h *= m;
    h ^= k;

    data += 4;
    len -= 4;
  }

  // Handle the last few bytes of the input array.
  // The cases intentionally fall through so that a 3-byte tail also mixes
  // in bytes 1 and 0, and every non-empty tail ends with the multiply.
  switch(len)
  {
  case 3: h ^= data[2] << 16;  /* fall through */
  case 2: h ^= data[1] << 8;   /* fall through */
  case 1: h ^= data[0];
      h *= m;
  };

  // Do a few final mixes of the hash to ensure the last few
  // bytes are well-incorporated.
  h ^= h >> 13;
  h *= m;
  h ^= h >> 15;

  return h;
}

2. Python版

import ctypes

def int_overflow(val):
    """Wrap ``val`` into the signed 32-bit range with two's-complement wrap-around.

    Values already inside ``[-2**31, 2**31 - 1]`` are returned unchanged;
    anything outside is reduced modulo ``2**32`` back into that range.
    """
    upper = 0x7FFFFFFF
    lower = -upper - 1
    if lower <= val <= upper:
        return val
    span = upper + 1
    # Shift into [0, 2**32), reduce, then shift back to the signed range.
    return (val + span) % (2 * span) - upper - 1

# Logical (unsigned) right shift; r defaults to 24 (not used here).
def unsigned_right_shitf(n, r = 24):
    """Shift ``n`` right by ``r`` bits, treating negative ``n`` as uint32.

    A negative ``n`` is first reinterpreted as an unsigned 32-bit value.
    For ``r >= 0`` the value is shifted right and wrapped to the signed
    32-bit range. For ``r < 0`` the value is shifted *left* by ``abs(r)``,
    wrapped, and then negated.
    """
    # Negative inputs become their unsigned 32-bit bit pattern.
    value = ctypes.c_uint32(n).value if n < 0 else n
    # Shift counts are normally positive; a negative count is accepted for
    # compatibility with JS-style callers and turns into a left shift.
    # NOTE(review): the negation of the left-shift result looks unusual -
    # confirm it is intended.
    shifted = -int_overflow(value << abs(r)) if r < 0 else int_overflow(value >> r)
    return shifted

# Wrapping multiplication; the default factor is the MurmurHash2 mixing
# constant m = 0x5bd1e995 = 1540483477.
def int_overflow_multiplication(a, m = 1540483477):
    """Return ``a * m`` wrapped to the signed 32-bit range via ``int_overflow``."""
    return int_overflow(a * m)

# The default seed is arbitrary; callers may pass their own.
def murmurhash(origin_string, seed = 111111):
    """Return the 32-bit MurmurHash2 of ``origin_string``.

    Every intermediate value is kept as an unsigned 32-bit integer, so the
    right shifts are logical shifts and the multiplications wrap modulo
    ``2**32``, matching the ``uint32_t`` arithmetic of the C reference.
    (Wrapping to a *signed* range and then using Python's ``>>`` would
    sign-extend and produce different hashes whenever bit 31 is set.)

    Args:
        origin_string: Text to hash; it is encoded with ``str.encode()``
            (UTF-8) first. ``bytes``, ``bytearray`` and ``memoryview``
            objects are hashed as-is.
        seed: Hash seed. Only its low 32 bits take part in the hash.

    Returns:
        The hash as an ``int`` in ``range(2**32)``.
    """
    mask = 0xFFFFFFFF
    m = 0x5BD1E995  # mixing multiplier of MurmurHash2
    r = 24          # mixing shift of MurmurHash2

    if isinstance(origin_string, (bytes, bytearray, memoryview)):
        data = bytes(origin_string)
    else:
        data = origin_string.encode()

    length = len(data)
    h = (seed ^ length) & mask

    # Mix the input four bytes at a time, read as little-endian words.
    body_end = length - length % 4
    for offset in range(0, body_end, 4):
        k = int.from_bytes(data[offset:offset + 4], "little")
        k = (k * m) & mask
        k ^= k >> r
        k = (k * m) & mask
        h = (h * m) & mask
        h ^= k

    # Fold in the 1-3 trailing bytes. XOR-ing the little-endian value of
    # the tail equals XOR-ing byte 0, byte 1 << 8 and byte 2 << 16 one by one.
    tail = data[body_end:]
    if tail:
        h ^= int.from_bytes(tail, "little")
        h = (h * m) & mask

    # Final avalanche so the last bytes are well mixed into every bit.
    h ^= h >> 13
    h = (h * m) & mask
    h ^= h >> 15

    return h

if __name__ == '__main__':
