问题:两个海量字符串数据集,在内存有限、不能生成中间文件的前提下,如何求出它们的交集?
答:将字符串转换成数字处理。本文包含6种一次hash冲突概率极低的hash函数,其性能比较为 CityHash64>FarmHash64>Murmur3>Murmur2_64A>Murmur2_64B>Blizzard_MPQ
#include <cassert>
#include <cctype>
#include <cstdint>
#include <cstring>
#include <iostream>
#include <string>
#include <utility>
using namespace std;
using ui = unsigned int;
using ul = unsigned long;
using uc = unsigned char;
// Hash-type selector that main() passes to MPQHash.
#define Type 0
// Lookup table filled by init(): five consecutive slices of 0x100 entries.
// Every entry fits in 32 bits (two 16-bit halves OR-ed together).
ul MagicTable[0x500];

// Populates MagicTable from a fixed seed with a small linear congruential
// generator. Must run once before the first MPQHash call.
void init(){
    ul seed = 0x00100001;
    for (ul column = 0; column < 0x100; ++column) {
        // Visit entry `column` of each of the five slices, in slice order.
        for (ul slot = column; slot < 0x500; slot += 0x100) {
            seed = (seed * 125 + 3) % 0x2AAAAB;
            const ul high_half = (seed & 0xFFFF) << 0x10;
            seed = (seed * 125 + 3) % 0x2AAAAB;
            const ul low_half = seed & 0xFFFF;
            MagicTable[slot] = high_half | low_half;
        }
    }
}

// Blizzard-style string hash.
//
// str      NUL-terminated input; each byte is upper-cased before mixing, so
//          the hash is case-insensitive for the characters toupper() maps.
// HashType selects the table slice; the index used is (HashType << 8) + byte,
//          so HashType must be in 0..4 to stay inside MagicTable.
// Returns a 32-bit hash value (always <= 0xFFFFFFFF).
//
// The two running seeds are kept in uint32_t so every addition wraps at
// 32 bits. With `unsigned long` seeds the sums kept carrying into the upper
// half on LP64 platforms, so the result differed between 32-bit/LLP64 and
// LP64 builds.
ul MPQHash(const char* str, ul HashType){
    const uc* key = reinterpret_cast<const uc*>(str);
    uint32_t seed1 = 0x7FED7FED;
    uint32_t seed2 = 0xEEEEEEEE;
    for (; *key != 0; ++key) {
        const uint32_t ch = static_cast<uint32_t>(std::toupper(*key));
        seed1 = static_cast<uint32_t>(MagicTable[(HashType << 8) + ch]) ^ (seed1 + seed2);
        seed2 = ch + seed1 + seed2 + (seed2 << 5) + 3;
    }
    return seed1;
}
int main(int argc, char**argv){
init();
string str = "最強|Hash #6";
ul x = MPQHash(str.c_str(), Type);
cout<< x <<endl;
return 0;
}
#include
using namespace std;
using ui = unsigned int;
using ul = unsigned long;
using uc = unsigned char;
using ull = unsigned long long;
// Seed that main() passes to MurmurHash64B.
#define SEED 0xEE6B27EB

// MurmurHash2 variant that produces a 64-bit result from two 32-bit lanes.
//
// key   start of the input bytes (any alignment)
// len   number of input bytes
// seed  initial value mixed into the first lane
// Returns (h1 << 32) | h2 after the final cross-mixing of both lanes.
//
// Input words are read with memcpy instead of dereferencing a casted
// `const ui*`: the cast-and-dereference is undefined behaviour for buffers
// that are not 4-byte aligned and violates strict aliasing. Words are still
// taken in host byte order, so results are unchanged wherever the old code
// worked.
ull MurmurHash64B(const void * key, int len, ui seed){
    const ui m = 0x5bd1e995;
    const int r = 24;
    ui h1 = seed ^ len;
    ui h2 = 0;
    const uc* p = static_cast<const uc*>(key);

    // Loads the next input word, advances the cursor and pre-mixes the word.
    auto next_mixed_word = [&p, m, r]() -> ui {
        ui k;
        std::memcpy(&k, p, sizeof k);
        p += sizeof k;
        k *= m;
        k ^= k >> r;
        k *= m;
        return k;
    };

    // Main loop: two words per round, one for each lane.
    while (len >= 8) {
        h1 *= m; h1 ^= next_mixed_word();
        h2 *= m; h2 ^= next_mixed_word();
        len -= 8;
    }
    // One leftover whole word goes to the first lane.
    if (len >= 4) {
        h1 *= m; h1 ^= next_mixed_word();
        len -= 4;
    }
    // Remaining 1..3 bytes go to the second lane; cases fall through on purpose.
    switch (len) {
    case 3: h2 ^= p[2] << 16; // fall through
    case 2: h2 ^= p[1] << 8;  // fall through
    case 1: h2 ^= p[0];
            h2 *= m;
    }

    // Final avalanche: each lane is folded into the other twice.
    h1 ^= h2 >> 18; h1 *= m;
    h2 ^= h1 >> 22; h2 *= m;
    h1 ^= h2 >> 17; h1 *= m;
    h2 ^= h1 >> 19; h2 *= m;

    return (static_cast<ull>(h1) << 32) | h2;
}
int main(int argc, char**argv){
string str = "最強|Hash #5";
ull x = MurmurHash64B(str.c_str(), str.size(), SEED);
cout<< x <<endl;
return 0;
}
#include
using namespace std;
using ui = unsigned int;
using ul = unsigned long;
using uc = unsigned char;
using ull = unsigned long long;
// Seed that main() passes to MurmurHash64A.
#define SEED 16

// MurmurHash2, 64-bit variant that consumes the input 8 bytes at a time.
//
// key   start of the input bytes (any alignment)
// len   number of input bytes
// seed  initial value mixed into the state
// Returns the 64-bit hash.
//
// Input words are read with memcpy instead of dereferencing a casted
// `const ull*`: that dereference is undefined behaviour for buffers that are
// not 8-byte aligned and violates strict aliasing. Words are still taken in
// host byte order, so results are unchanged wherever the old code worked.
ull MurmurHash64A( const void * key, int len, ui seed){
    const ull m = 0xc6a4a7935bd1e995;
    const int r = 47;
    ull h = seed ^ (len * m);

    const uc* p = static_cast<const uc*>(key);
    const uc* const body_end = p + (len / 8) * 8;
    for (; p != body_end; p += 8) {
        ull k;
        std::memcpy(&k, p, sizeof k);
        k *= m;
        k ^= k >> r;
        k *= m;
        h ^= k;
        h *= m;
    }

    // Trailing 1..7 bytes; cases fall through on purpose.
    switch (len & 7) {
    case 7: h ^= ull(p[6]) << 48; // fall through
    case 6: h ^= ull(p[5]) << 40; // fall through
    case 5: h ^= ull(p[4]) << 32; // fall through
    case 4: h ^= ull(p[3]) << 24; // fall through
    case 3: h ^= ull(p[2]) << 16; // fall through
    case 2: h ^= ull(p[1]) << 8;  // fall through
    case 1: h ^= ull(p[0]);
            h *= m;
    }

    // Final avalanche.
    h ^= h >> r;
    h *= m;
    h ^= h >> r;
    return h;
}
int main(int argc, char**argv){
string str = "最強|Hash #4";
ull x = MurmurHash64A(str.c_str(), str.size(), SEED);
cout<< x <<endl;
return 0;
}
#include
using namespace std;
using ui = unsigned int;
using ul = unsigned long;
using uc = unsigned char;
using ull = unsigned long long;
#define FORCE_INLINE inline
// Appends the unsigned long long suffix to a 64-bit literal.
#define BIG_CONSTANT(x) (x##LLU)

// Rotates x left by r bits. The shift counts are reduced modulo 64, so
// r == 0 returns x unchanged instead of shifting by the full width
// (which is undefined behaviour).
static FORCE_INLINE uint64_t rotl64 ( uint64_t x, int8_t r )
{
    const unsigned left = static_cast<unsigned>(r) & 63u;
    const unsigned right = (64u - left) & 63u;
    return (x << left) | (x >> right);
}

// Returns the i-th 64-bit word starting at p, in host byte order.
// The word is copied out with memcpy so p does not have to be 8-byte
// aligned (a direct p[i] read is undefined for misaligned buffers).
FORCE_INLINE uint64_t getblock64 ( const uint64_t * p, int i )
{
    uint64_t block;
    std::memcpy(&block, p + i, sizeof block);
    return block;
}

// Finalization mix: alternates xor-shifts by 33 with two multiplications
// so that every input bit can influence every output bit.
FORCE_INLINE uint64_t fmix64 ( uint64_t k )
{
    k ^= k >> 33;
    k *= BIG_CONSTANT(0xff51afd7ed558ccd);
    k ^= k >> 33;
    k *= BIG_CONSTANT(0xc4ceb9fe1a85ec53);
    k ^= k >> 33;
    return k;
}

// Seed that main() passes to MurmurHash3_x64_128.
#define SEED 42
#define ROTL64(x,y) rotl64(x,y)
// Runs the x64 128-bit MurmurHash3 mixing over `len` bytes at `key` and
// returns the first 64-bit half of the state (h1); the second half is only
// used internally.
uint64_t MurmurHash3_x64_128 ( const void * key, const int len, const uint32_t seed)
{
    const uint8_t * const bytes = static_cast<const uint8_t*>(key);
    const int block_count = len / 16;

    const uint64_t c1 = BIG_CONSTANT(0x87c37b91114253d5);
    const uint64_t c2 = BIG_CONSTANT(0x4cf5ad432745937f);

    uint64_t h1 = seed;
    uint64_t h2 = seed;

    // Body: consume the input 16 bytes (two 64-bit words) per round.
    const uint64_t * const words = reinterpret_cast<const uint64_t *>(bytes);
    for (int blk = 0; blk < block_count; ++blk)
    {
        uint64_t lo = getblock64(words, blk*2+0);
        uint64_t hi = getblock64(words, blk*2+1);

        lo *= c1;
        lo = ROTL64(lo,31);
        lo *= c2;
        h1 ^= lo;
        h1 = ROTL64(h1,27);
        h1 += h2;
        h1 = h1*5+0x52dce729;

        hi *= c2;
        hi = ROTL64(hi,33);
        hi *= c1;
        h2 ^= hi;
        h2 = ROTL64(h2,31);
        h2 += h1;
        h2 = h2*5+0x38495ab5;
    }

    // Tail: the 0..15 bytes left after the last whole block. Bytes 8..14
    // form the high word (mixed first), bytes 0..7 form the low word.
    const uint8_t * const tail = bytes + block_count*16;
    const int rest = len & 15;
    if (rest > 8)
    {
        uint64_t hi = 0;
        for (int i = rest - 1; i >= 8; --i)
            hi ^= static_cast<uint64_t>(tail[i]) << ((i - 8) * 8);
        hi *= c2;
        hi = ROTL64(hi,33);
        hi *= c1;
        h2 ^= hi;
    }
    if (rest > 0)
    {
        uint64_t lo = 0;
        const int lo_bytes = rest < 8 ? rest : 8;
        for (int i = lo_bytes - 1; i >= 0; --i)
            lo ^= static_cast<uint64_t>(tail[i]) << (i * 8);
        lo *= c1;
        lo = ROTL64(lo,31);
        lo *= c2;
        h1 ^= lo;
    }

    // Finalization: fold in the length, cross-add, avalanche, cross-add.
    h1 ^= len;
    h2 ^= len;
    h1 += h2;
    h2 += h1;
    h1 = fmix64(h1);
    h2 = fmix64(h2);
    h1 += h2;
    h2 += h1;
    return h1;
}
int main(int argc, char**argv){
string str = "最強|Hash #3";
uint64_t x = MurmurHash3_x64_128(str.c_str(), str.size(), SEED);
cout<< x <<endl;
return 0;
}
//Google FarmHash
#include
// A 128-bit value stored as a pair: first = low 64 bits, second = high 64 bits.
using uint128_t = std::pair<uint64_t, uint64_t>;

// Low 64 bits of x.
inline uint64_t Uint128Low64(const uint128_t x) {
    return x.first;
}

// High 64 bits of x.
inline uint64_t Uint128High64(const uint128_t x) {
    return x.second;
}

// Packs a low and a high word into a uint128_t.
inline uint128_t Uint128(uint64_t lo, uint64_t hi) {
    uint128_t packed;
    packed.first = lo;
    packed.second = hi;
    return packed;
}
#define STATIC_INLINE static inline
using namespace std;
using ui = unsigned int;
using ul = unsigned long;
using uc = unsigned char;
using ull = unsigned long long;
// Multiplier constants shared by the hashing helpers (described upstream as
// primes between 2^63 and 2^64).
static const uint64_t k0 = 0xc3a5c85c97cb3127ULL;
static const uint64_t k1 = 0xb492b66fbe98f273ULL;
static const uint64_t k2 = 0x9ae16a3b2f90404fULL;
// Byte-order adapters; both are the identity here, so loads are used as read.
#define uint32_in_expected_order(x) (x)
#define uint64_in_expected_order(x) (x)

// Loads 8 bytes starting at p; p may have any alignment.
STATIC_INLINE uint64_t Fetch64(const char *p) {
    uint64_t word;
    memcpy(&word, p, sizeof(word));
    return uint64_in_expected_order(word);
}

// Loads 4 bytes starting at p; p may have any alignment.
STATIC_INLINE uint32_t Fetch32(const char *p) {
    uint32_t word;
    memcpy(&word, p, sizeof(word));
    return uint32_in_expected_order(word);
}
// Reverses the order of the four bytes of x.
STATIC_INLINE uint32_t bswap_32(const uint32_t x) {
    return ((x & 0x000000ffU) << 24) |
           ((x & 0x0000ff00U) << 8)  |
           ((x & 0x00ff0000U) >> 8)  |
           ((x & 0xff000000U) >> 24);
}

// Reverses the order of the eight bytes of x.
STATIC_INLINE uint64_t bswap_64(const uint64_t x) {
    uint64_t reversed = 0;
    uint64_t remaining = x;
    // Peel bytes off the low end and push them onto the result.
    for (int byte = 0; byte < 8; ++byte) {
        reversed = (reversed << 8) | (remaining & 0xff);
        remaining >>= 8;
    }
    return reversed;
}

// Forwarding wrappers around the two byte-swap routines.
STATIC_INLINE uint32_t Bswap32(uint32_t val) {
    return bswap_32(val);
}
STATIC_INLINE uint64_t Bswap64(uint64_t val) {
    return bswap_64(val);
}
// Bitwise right-rotation helpers.

// Rotates val right by shift bits; shift == 0 is answered directly because
// the complementary left shift would be by the full 32-bit width.
STATIC_INLINE uint32_t BasicRotate32(uint32_t val, int shift) {
    if (shift == 0) {
        return val;
    }
    const uint32_t kept = val >> shift;
    const uint32_t wrapped = val << (32 - shift);
    return kept | wrapped;
}

// Rotates val right by shift bits; shift == 0 is answered directly because
// the complementary left shift would be by the full 64-bit width.
STATIC_INLINE uint64_t BasicRotate64(uint64_t val, int shift) {
    if (shift == 0) {
        return val;
    }
    const uint64_t kept = val >> shift;
    const uint64_t wrapped = val << (64 - shift);
    return kept | wrapped;
}

// Public names used by the hash routines; both forward to the Basic versions.
STATIC_INLINE uint32_t Rotate32(uint32_t val, int shift) {
    const uint32_t rotated = BasicRotate32(val, shift);
    return rotated;
}
STATIC_INLINE uint64_t Rotate64(uint64_t val, int shift) {
    const uint64_t rotated = BasicRotate64(val, shift);
    return rotated;
}
// Reduces a 128-bit value to 64 bits with two multiply / xor-shift rounds
// (Murmur-inspired). Upstream notes that the exact function may change
// between versions and platforms.
STATIC_INLINE uint64_t Hash128to64(uint128_t x) {
    const uint64_t kMul = 0x9ddfea08eb382d69ULL;
    const uint64_t low = Uint128Low64(x);
    const uint64_t high = Uint128High64(x);

    uint64_t round1 = (low ^ high) * kMul;
    round1 ^= (round1 >> 47);

    uint64_t round2 = (high ^ round1) * kMul;
    round2 ^= (round2 >> 47);

    return round2 * kMul;
}
// Folds the top 17 bits of val into its low bits (val xor val >> 47).
STATIC_INLINE uint64_t ShiftMix(uint64_t val) {
    const uint64_t top_bits = val >> 47;
    return val ^ top_bits;
}
STATIC_INLINE uint64_t HashLen16(uint64_t u, uint64_t v) {
return Hash128to64(Uint128(u, v));
}
// Hashes two 64-bit words into one using a caller-supplied multiplier:
// two multiply / xor-shift rounds followed by a final multiply
// (Murmur-inspired).
STATIC_INLINE uint64_t HashLen16(uint64_t u, uint64_t v, uint64_t mul) {
    uint64_t round1 = (u ^ v) * mul;
    round1 ^= (round1 >> 47);

    uint64_t round2 = (v ^ round1) * mul;
    round2 ^= (round2 >> 47);

    return round2 * mul;
}
// Hashes inputs of 0 to 16 bytes. The length picks one of four recipes:
// empty, 1..3 bytes, 4..7 bytes, or 8 bytes and more.
STATIC_INLINE uint64_t HashLen0to16(const char *s, size_t len) {
    if (len == 0) {
        return k2;
    }
    if (len < 4) {
        // First, middle and last byte (they may coincide for tiny inputs).
        const uint8_t first = s[0];
        const uint8_t middle = s[len >> 1];
        const uint8_t last = s[len - 1];
        const uint32_t y = static_cast<uint32_t>(first) + (static_cast<uint32_t>(middle) << 8);
        const uint32_t z = len + (static_cast<uint32_t>(last) << 2);
        return ShiftMix(y * k2 ^ z * k0) * k2;
    }

    const uint64_t mul = k2 + len * 2;
    if (len < 8) {
        // Two possibly overlapping 4-byte loads: front and back.
        const uint64_t head = Fetch32(s);
        return HashLen16(len + (head << 3), Fetch32(s + len - 4), mul);
    }

    // Two possibly overlapping 8-byte loads: front and back.
    const uint64_t head = Fetch64(s) + k2;
    const uint64_t tail = Fetch64(s + len - 8);
    const uint64_t c = Rotate64(tail, 37) * mul + head;
    const uint64_t d = (Rotate64(head, 25) + tail) * mul;
    return HashLen16(c, d, mul);
}
// Hashes inputs of 17 to 32 bytes from four 8-byte loads: two at the front
// and two at the back. The loads at s + len - 16 and s + 8 require
// len >= 16. (Upstream remarks that 16-byte inputs would probably work too.)
STATIC_INLINE uint64_t HashLen17to32(const char *s, size_t len) {
    const uint64_t mul = k2 + len * 2;
    const uint64_t front0 = Fetch64(s) * k1;
    const uint64_t front1 = Fetch64(s + 8);
    const uint64_t back0 = Fetch64(s + len - 8) * mul;
    const uint64_t back1 = Fetch64(s + len - 16) * k2;

    const uint64_t u = Rotate64(front0 + front1, 43) + Rotate64(back0, 30) + back1;
    const uint64_t v = front0 + Rotate64(front1 + k2, 18) + back0;
    return HashLen16(u, v, mul);
}
// Quick 16-byte hash of 48 bytes: four data words (w, x, y, z) plus two
// seed words (a, b). Upstream advises "random-looking" values for a and b.
STATIC_INLINE pair<uint64_t, uint64_t> WeakHashLen32WithSeeds(
    uint64_t w, uint64_t x, uint64_t y, uint64_t z, uint64_t a, uint64_t b) {
    const uint64_t a_with_w = a + w;
    const uint64_t a_full = a_with_w + x + y;
    const uint64_t b_mixed = Rotate64(b + a_with_w + z, 21) + Rotate64(a_full, 44);
    return pair<uint64_t, uint64_t>(a_full + z, b_mixed + a_with_w);
}
// Quick 16-byte hash of the 32 bytes s[0] ... s[31] together with the seed
// words a and b: loads four 8-byte words and defers to the word overload.
STATIC_INLINE pair<uint64_t, uint64_t> WeakHashLen32WithSeeds(
    const char* s, uint64_t a, uint64_t b) {
    const uint64_t word0 = Fetch64(s);
    const uint64_t word1 = Fetch64(s + 8);
    const uint64_t word2 = Fetch64(s + 16);
    const uint64_t word3 = Fetch64(s + 24);
    return WeakHashLen32WithSeeds(word0, word1, word2, word3, a, b);
}
// Return an 8-byte hash for 33 to 64 bytes.
// Every load is an 8-byte Fetch64 at a fixed offset from the start of the
// input or from its end. The lowest end-relative offset is s + len - 32, so
// len must be at least 32; FarmHash64 only routes lengths 33..64 here.
STATIC_INLINE uint64_t HashLen33to64(const char *s, size_t len) {
// Length-dependent multiplier used throughout.
uint64_t mul = k2 + len * 2;
// First stage: same recipe as HashLen17to32 over the first and last 16 bytes
// (except that the first word is scaled by k2).
uint64_t a = Fetch64(s) * k2;
uint64_t b = Fetch64(s + 8);
uint64_t c = Fetch64(s + len - 8) * mul;
uint64_t d = Fetch64(s + len - 16) * k2;
uint64_t y = Rotate64(a + b, 43) + Rotate64(c, 30) + d;
uint64_t z = HashLen16(y, a + Rotate64(b + k2, 18) + c, mul);
// Second stage: bytes 16..31 from the front and the 16 bytes starting at
// len - 32 from the back, seeded with the first-stage results y and z.
uint64_t e = Fetch64(s + 16) * mul;
uint64_t f = Fetch64(s + 24);
uint64_t g = (y + Fetch64(s + len - 32)) * mul;
uint64_t h = (z + Fetch64(s + len - 24)) * mul;
return HashLen16(Rotate64(e + f, 43) + Rotate64(g, 30) + h,
e + Rotate64(f + a, 18) + g, mul);
}
// Returns a 64-bit hash of the len bytes starting at s.
// Inputs of up to 64 bytes are delegated to the length-specific helpers
// (0..16, 17..32, 33..64). Longer inputs are consumed in 64-byte chunks by
// the loop below, followed by one extra round over the last 64 bytes.
uint64_t FarmHash64(const char *s, size_t len) {
// Fixed seed for the long-input path.
const uint64_t seed = 81;
if (len <= 32) {
if (len <= 16) {
return HashLen0to16(s, len);
} else {
return HashLen17to32(s, len);
}
} else if (len <= 64) {
return HashLen33to64(s, len);
}
// For strings over 64 bytes we loop. Internal state consists of
// 56 bytes: v, w, x, y, and z.
uint64_t x = seed;
uint64_t y = seed * k1 + 113;
uint64_t z = ShiftMix(y * k2 + 113) * k2;
pair<uint64_t, uint64_t> v = make_pair(0, 0);
pair<uint64_t, uint64_t> w = make_pair(0, 0);
x = x * k2 + Fetch64(s);
// Set end so that after the loop we have 1 to 64 bytes left to process.
const char* end = s + ((len - 1) / 64) * 64;
// Start of the final 64 bytes of the input; it may overlap bytes the loop
// has already consumed.
const char* last64 = end + ((len - 1) & 63) - 63;
assert(s + len - 64 == last64);
// len > 64 here, so end > s and the loop body runs at least once.
do {
x = Rotate64(x + y + v.first + Fetch64(s + 8), 37) * k1;
y = Rotate64(y + v.second + Fetch64(s + 48), 42) * k1;
x ^= w.second;
y += v.first + Fetch64(s + 40);
z = Rotate64(z + w.first, 33) * k1;
v = WeakHashLen32WithSeeds(s, v.second * k1, x + w.first);
w = WeakHashLen32WithSeeds(s + 32, z + w.second, y + Fetch64(s + 16));
std::swap(z, x);
s += 64;
} while (s != end);
// The final round uses a multiplier derived from the low byte of z.
uint64_t mul = k1 + ((z & 0xff) << 1);
// Make s point to the last 64 bytes of input.
s = last64;
// Fold the number of leftover bytes (minus one) into the state.
w.first += ((len - 1) & 63);
v.first += w.first;
w.first += v.first;
// Same round as the loop body, but with mul instead of k1 and two extra
// factors of 9.
x = Rotate64(x + y + v.first + Fetch64(s + 8), 37) * mul;
y = Rotate64(y + v.second + Fetch64(s + 48), 42) * mul;
x ^= w.second * 9;
y += v.first * 9 + Fetch64(s + 40);
z = Rotate64(z + w.first, 33) * mul;
v = WeakHashLen32WithSeeds(s, v.second * mul, x + w.first);
w = WeakHashLen32WithSeeds(s + 32, z + w.second, y + Fetch64(s + 16));
std::swap(z, x);
// Collapse the 56 bytes of state into the 64-bit result.
return HashLen16(HashLen16(v.first, w.first, mul) + ShiftMix(y) * k0 + z,
HashLen16(v.second, w.second, mul) + x,
mul);
}
int main(int argc, char** argv){
string str = "最強|Hash #2";
uint64_t x = FarmHash64(str.c_str(), str.size());
cout<< x <<endl;
return 0;
}
//Google CityHash
#include
using namespace std;
// A 128-bit value stored as a pair: first = low 64 bits, second = high 64 bits.
using uint128_t = std::pair<uint64_t, uint64_t>;
#define STATIC_INLINE static inline

// Low 64 bits of x.
STATIC_INLINE uint64_t Uint128Low64(const uint128_t& x) {
    return x.first;
}

// High 64 bits of x.
STATIC_INLINE uint64_t Uint128High64(const uint128_t& x) {
    return x.second;
}

// Predefined seed words and their 128-bit combination (low = kSeed0,
// high = kSeed1).
static const uint64_t kSeed0 = 1234567;
static const uint64_t kSeed1 = 0xc3a5c85c97cb3127ULL;
static const uint128_t kSeed128(kSeed0, kSeed1);
// Copies 8 bytes starting at p into a 64-bit word; p may have any alignment.
STATIC_INLINE uint64_t UNALIGNED_LOAD64(const char *p) {
    uint64_t word;
    memcpy(&word, p, sizeof(word));
    return word;
}

// Copies 4 bytes starting at p into a 32-bit word; p may have any alignment.
STATIC_INLINE uint32_t UNALIGNED_LOAD32(const char *p) {
    uint32_t word;
    memcpy(&word, p, sizeof(word));
    return word;
}
// Byte-order adapters; both are the identity here, so loads are used as read.
#define uint32_in_expected_order(x) (x)
#define uint64_in_expected_order(x) (x)

// Reverses the order of the four bytes of x.
STATIC_INLINE uint32_t bswap_32(const uint32_t x) {
    const uint32_t byte0 = x & 0xffU;
    const uint32_t byte1 = (x >> 8) & 0xffU;
    const uint32_t byte2 = (x >> 16) & 0xffU;
    const uint32_t byte3 = (x >> 24) & 0xffU;
    return (byte0 << 24) | (byte1 << 16) | (byte2 << 8) | byte3;
}

// Reverses the order of the eight bytes of x.
STATIC_INLINE uint64_t bswap_64(const uint64_t x) {
    uint64_t reversed = 0;
    // Byte i of the input becomes byte 7 - i of the result.
    for (int i = 0; i < 8; ++i) {
        const uint64_t byte = (x >> (8 * i)) & 0xffULL;
        reversed |= byte << (8 * (7 - i));
    }
    return reversed;
}
// Reduces a 128-bit value to 64 bits with two multiply / xor-shift rounds
// followed by a final multiply (Murmur-inspired).
STATIC_INLINE uint64_t Hash128to64(const uint128_t& x) {
    const uint64_t kMul = 0x9ddfea08eb382d69ULL;
    const uint64_t low = Uint128Low64(x);
    const uint64_t high = Uint128High64(x);

    uint64_t round1 = (low ^ high) * kMul;
    round1 ^= (round1 >> 47);

    uint64_t round2 = (high ^ round1) * kMul;
    round2 ^= (round2 >> 47);

    return round2 * kMul;
}
// Loads 8 bytes at p and passes them through the 64-bit byte-order adapter.
STATIC_INLINE uint64_t Fetch64(const char *p) {
    const uint64_t raw = UNALIGNED_LOAD64(p);
    return uint64_in_expected_order(raw);
}

// Loads 4 bytes at p and passes them through the 32-bit byte-order adapter.
STATIC_INLINE uint32_t Fetch32(const char *p) {
    const uint32_t raw = UNALIGNED_LOAD32(p);
    return uint32_in_expected_order(raw);
}
// Bitwise right rotate of a 64-bit value. A zero shift is answered directly
// because the complementary left shift would be by 64, which is undefined.
STATIC_INLINE uint64_t Rotate(uint64_t val, int shift) {
    if (shift == 0) {
        return val;
    }
    const uint64_t kept = val >> shift;
    const uint64_t wrapped = val << (64 - shift);
    return kept | wrapped;
}
// Folds the top 17 bits of val into its low bits (val xor val >> 47).
STATIC_INLINE uint64_t ShiftMix(uint64_t val) {
    const uint64_t top_bits = val >> 47;
    return val ^ top_bits;
}
STATIC_INLINE uint64_t HashLen16(uint64_t u, uint64_t v) {
return Hash128to64(uint128_t(u, v));
}
// Hashes two 64-bit words into one using a caller-supplied multiplier:
// two multiply / xor-shift rounds followed by a final multiply
// (Murmur-inspired).
STATIC_INLINE uint64_t HashLen16(uint64_t u, uint64_t v, uint64_t mul) {
    uint64_t round1 = (u ^ v) * mul;
    round1 ^= (round1 >> 47);

    uint64_t round2 = (v ^ round1) * mul;
    round2 ^= (round2 >> 47);

    return round2 * mul;
}
// Multiplier constants shared by the hashing helpers (described upstream as
// primes between 2^63 and 2^64).
static const uint64_t k0 = 0xc3a5c85c97cb3127ULL;
static const uint64_t k1 = 0xb492b66fbe98f273ULL;
static const uint64_t k2 = 0x9ae16a3b2f90404fULL;

// Hashes inputs of 0 to 16 bytes. The length picks one of four recipes:
// empty, 1..3 bytes, 4..7 bytes, or 8 bytes and more.
STATIC_INLINE uint64_t HashLen0to16(const char *s, size_t len) {
    if (len == 0) {
        return k2;
    }
    if (len < 4) {
        // First, middle and last byte (they may coincide for tiny inputs).
        const uint8_t first = s[0];
        const uint8_t middle = s[len >> 1];
        const uint8_t last = s[len - 1];
        const uint32_t y = static_cast<uint32_t>(first) + (static_cast<uint32_t>(middle) << 8);
        const uint32_t z = len + (static_cast<uint32_t>(last) << 2);
        return ShiftMix(y * k2 ^ z * k0) * k2;
    }

    const uint64_t mul = k2 + len * 2;
    if (len < 8) {
        // Two possibly overlapping 4-byte loads: front and back.
        const uint64_t head = Fetch32(s);
        return HashLen16(len + (head << 3), Fetch32(s + len - 4), mul);
    }

    // Two possibly overlapping 8-byte loads: front and back.
    const uint64_t head = Fetch64(s) + k2;
    const uint64_t tail = Fetch64(s + len - 8);
    const uint64_t c = Rotate(tail, 37) * mul + head;
    const uint64_t d = (Rotate(head, 25) + tail) * mul;
    return HashLen16(c, d, mul);
}
// Hashes inputs of 17 to 32 bytes from four 8-byte loads: two at the front
// and two at the back. The loads at s + len - 16 and s + 8 require
// len >= 16. (Upstream remarks that 16-byte inputs would probably work too.)
STATIC_INLINE uint64_t HashLen17to32(const char *s, size_t len) {
    const uint64_t mul = k2 + len * 2;
    const uint64_t front0 = Fetch64(s) * k1;
    const uint64_t front1 = Fetch64(s + 8);
    const uint64_t back0 = Fetch64(s + len - 8) * mul;
    const uint64_t back1 = Fetch64(s + len - 16) * k2;

    const uint64_t u = Rotate(front0 + front1, 43) + Rotate(back0, 30) + back1;
    const uint64_t v = front0 + Rotate(front1 + k2, 18) + back0;
    return HashLen16(u, v, mul);
}
// Quick 16-byte hash of 48 bytes: four data words (w, x, y, z) plus two
// seed words (a, b). Upstream advises "random-looking" values for a and b.
STATIC_INLINE pair<uint64_t, uint64_t> WeakHashLen32WithSeeds(
    uint64_t w, uint64_t x, uint64_t y, uint64_t z, uint64_t a, uint64_t b) {
    const uint64_t a_with_w = a + w;
    const uint64_t a_full = a_with_w + x + y;
    const uint64_t b_mixed = Rotate(b + a_with_w + z, 21) + Rotate(a_full, 44);
    return pair<uint64_t, uint64_t>(a_full + z, b_mixed + a_with_w);
}
// Quick 16-byte hash of the 32 bytes s[0] ... s[31] together with the seed
// words a and b: loads four 8-byte words and defers to the word overload.
STATIC_INLINE pair<uint64_t, uint64_t> WeakHashLen32WithSeeds(
    const char* s, uint64_t a, uint64_t b) {
    const uint64_t word0 = Fetch64(s);
    const uint64_t word1 = Fetch64(s + 8);
    const uint64_t word2 = Fetch64(s + 16);
    const uint64_t word3 = Fetch64(s + 24);
    return WeakHashLen32WithSeeds(word0, word1, word2, word3, a, b);
}
// Return an 8-byte hash for 33 to 64 bytes.
// Reads eight 8-byte words: four at fixed offsets from the start (0, 8, 16,
// 24) and four at fixed offsets from the end (len - 32 .. len - 8), so len
// must be at least 32; CityHash64 only routes lengths 33..64 here.
STATIC_INLINE uint64_t HashLen33to64(const char *s, size_t len) {
// Length-dependent multiplier used throughout.
uint64_t mul = k2 + len * 2;
// Load phase: a, b, e, f come from the front; c, d, g, h from the back.
uint64_t a = Fetch64(s) * k2;
uint64_t b = Fetch64(s + 8);
uint64_t c = Fetch64(s + len - 24);
uint64_t d = Fetch64(s + len - 32);
uint64_t e = Fetch64(s + 16) * k2;
uint64_t f = Fetch64(s + 24) * 9;
uint64_t g = Fetch64(s + len - 8);
uint64_t h = Fetch64(s + len - 16) * mul;
// Mix phase: rotations, multiplications and byte swaps combine the words;
// each value below depends on the ones computed before it.
uint64_t u = Rotate(a + g, 43) + (Rotate(b, 30) + c) * 9;
uint64_t v = ((a + g) ^ d) + f + 1;
uint64_t w = bswap_64((u + v) * mul) + h;
uint64_t x = Rotate(e + f, 42) + c;
uint64_t y = (bswap_64((v + w) * mul) + g) * mul;
uint64_t z = e + f + c;
// a and b are reused as scratch for the last two mixing steps.
a = bswap_64((x + z) * mul + y) + b;
b = ShiftMix((z + a) * mul + d + h) * mul;
return b + x;
}
// Returns a 64-bit hash of the len bytes starting at s.
// Inputs of up to 64 bytes are delegated to the length-specific helpers
// (0..16, 17..32, 33..64). Longer inputs seed the state from the last 64
// bytes and then consume the input front-to-back in 64-byte chunks.
uint64_t CityHash64(const char *s, size_t len) {
if (len <= 32) {
if (len <= 16) {
return HashLen0to16(s, len);
} else {
return HashLen17to32(s, len);
}
} else if (len <= 64) {
return HashLen33to64(s, len);
}
// For strings over 64 bytes we hash the end first, and then as we
// loop we keep 56 bytes of state: v, w, x, y, and z.
uint64_t x = Fetch64(s + len - 40);
uint64_t y = Fetch64(s + len - 16) + Fetch64(s + len - 56);
uint64_t z = HashLen16(Fetch64(s + len - 48) + len, Fetch64(s + len - 24));
pair<uint64_t, uint64_t> v = WeakHashLen32WithSeeds(s + len - 64, len, z);
pair<uint64_t, uint64_t> w = WeakHashLen32WithSeeds(s + len - 32, y + k1, x);
x = x * k1 + Fetch64(s);
// Decrease len to the nearest multiple of 64, and operate on 64-byte chunks.
// (len - 1 is used, so an exact multiple of 64 drops one whole chunk; the
// trailing bytes were already mixed into the initial state above.)
len = (len - 1) & ~static_cast<size_t>(63);
// len > 64 on entry, so len is at least 64 here and the loop terminates.
do {
x = Rotate(x + y + v.first + Fetch64(s + 8), 37) * k1;
y = Rotate(y + v.second + Fetch64(s + 48), 42) * k1;
x ^= w.second;
y += v.first + Fetch64(s + 40);
z = Rotate(z + w.first, 33) * k1;
v = WeakHashLen32WithSeeds(s, v.second * k1, x + w.first);
w = WeakHashLen32WithSeeds(s + 32, z + w.second, y + Fetch64(s + 16));
std::swap(z, x);
s += 64;
len -= 64;
} while (len != 0);
// Collapse the 56 bytes of state into the 64-bit result.
return HashLen16(HashLen16(v.first, w.first) + ShiftMix(y) * k1 + z,
HashLen16(v.second, w.second) + x);
}
// Seeded variant: hashes the input with CityHash64, subtracts seed0 from the
// result and combines that with seed1 through HashLen16.
uint64_t CityHash64WithSeeds(const char *s, size_t len,
                             uint64_t seed0, uint64_t seed1) {
    const uint64_t unseeded = CityHash64(s, len);
    return HashLen16(unseeded - seed0, seed1);
}

// Single-seed variant: uses k2 as the first seed and `seed` as the second.
uint64_t CityHash64WithSeed(const char *s, size_t len, uint64_t seed) {
    const uint64_t first_seed = k2;
    return CityHash64WithSeeds(s, len, first_seed, seed);
}
int main(int argc, char** argv){
string str = "×îŠ|Hash #1";
uint64_t x = CityHash64(str.c_str(), str.size());
cout<< x <<endl;
return 0;
}