time33 哈希函数,又叫 DJBX33A,Bernstein's hash
转载自:
http://www.cnblogs.com/napoleon_liu/articles/1911571.html
php, apache, perl, bsddb都使用time33哈希.
最简单的版本
uint32_t time33(
char
const
*
str,
int
len)
{
unsigned long hash = 0;
for (int i = 0; i < len; i++) {
hash = hash *33 + (unsigned long) str[i];
}
return hash;
}
{
unsigned long hash = 0;
for (int i = 0; i < len; i++) {
hash = hash *33 + (unsigned long) str[i];
}
return hash;
}
这个版本最可以体现time33的算法思路,够简单。
把乘法操作换成位操作
unsigned
long
time33(
char
const
*
str,
int
len)
{
unsigned long hash = 0;
for (int i = 0; i < len; i++) {
hash = ((hash <<5) + hash) + (unsigned long) str[i];
}
return hash;
}
{
unsigned long hash = 0;
for (int i = 0; i < len; i++) {
hash = ((hash <<5) + hash) + (unsigned long) str[i];
}
return hash;
}
59个字符1000 0000次运行(gcc没有开启优化,因为开了优化后两个函数的实际代码会一样)
第一个:
real 0m4.389s
user 0m4.388s
sys 0m0.000s
user 0m4.388s
sys 0m0.000s
第二个:
real 0m4.137s
user 0m4.120s
sys 0m0.000s
user 0m4.120s
sys 0m0.000s
gcc –O2优化后是
real 0m1.367s
user 0m1.360s
sys 0m0.000s
user 0m1.360s
sys 0m0.000s
php版本
inline unsigned time33(char
const
*
str
,
int len)
{
unsigned long hash = 5381 ;
/* variant with the hash unrolled eight times */
for (; len >= 8 ; len -= 8 ) {
hash = ((hash << 5 ) + hash) + * str ++ ;
hash = ((hash << 5 ) + hash) + * str ++ ;
hash = ((hash << 5 ) + hash) + * str ++ ;
hash = ((hash << 5 ) + hash) + * str ++ ;
hash = ((hash << 5 ) + hash) + * str ++ ;
hash = ((hash << 5 ) + hash) + * str ++ ;
hash = ((hash << 5 ) + hash) + * str ++ ;
hash = ((hash << 5 ) + hash) + * str ++ ;
}
switch (len) {
case 7 : hash = ((hash << 5 ) + hash) + * str ++ ; /* fallthrough */
case 6 : hash = ((hash << 5 ) + hash) + * str ++ ; /* fallthrough */
case 5 : hash = ((hash << 5 ) + hash) + * str ++ ; /* fallthrough */
case 4 : hash = ((hash << 5 ) + hash) + * str ++ ; /* fallthrough */
case 3 : hash = ((hash << 5 ) + hash) + * str ++ ; /* fallthrough */
case 2 : hash = ((hash << 5 ) + hash) + * str ++ ; /* fallthrough */
case 1 : hash = ((hash << 5 ) + hash) + * str ++ ; break ;
case 0 : break ;
}
return hash;
}
{
unsigned long hash = 5381 ;
/* variant with the hash unrolled eight times */
for (; len >= 8 ; len -= 8 ) {
hash = ((hash << 5 ) + hash) + * str ++ ;
hash = ((hash << 5 ) + hash) + * str ++ ;
hash = ((hash << 5 ) + hash) + * str ++ ;
hash = ((hash << 5 ) + hash) + * str ++ ;
hash = ((hash << 5 ) + hash) + * str ++ ;
hash = ((hash << 5 ) + hash) + * str ++ ;
hash = ((hash << 5 ) + hash) + * str ++ ;
hash = ((hash << 5 ) + hash) + * str ++ ;
}
switch (len) {
case 7 : hash = ((hash << 5 ) + hash) + * str ++ ; /* fallthrough */
case 6 : hash = ((hash << 5 ) + hash) + * str ++ ; /* fallthrough */
case 5 : hash = ((hash << 5 ) + hash) + * str ++ ; /* fallthrough */
case 4 : hash = ((hash << 5 ) + hash) + * str ++ ; /* fallthrough */
case 3 : hash = ((hash << 5 ) + hash) + * str ++ ; /* fallthrough */
case 2 : hash = ((hash << 5 ) + hash) + * str ++ ; /* fallthrough */
case 1 : hash = ((hash << 5 ) + hash) + * str ++ ; break ;
case 0 : break ;
}
return hash;
}
59个字符,1000 0000次
real 0m1.088s
user 0m1.068s
sys 0m0.000s
user 0m1.068s
sys 0m0.000s
速度提升主要在循环展开, 对于短字符,这个是不明显的。
php版本的hash初始值是5381, 这个
Magic Constant
5381
:
1 . odd number
2 . prime number
3 . deficient number
4 . 001 / 010 / 100 / 000 / 101 b
1 . odd number
2 . prime number
3 . deficient number
4 . 001 / 010 / 100 / 000 / 101 b
Apache版本
unsigned
long
time33(
char
const
*
str,
int
*
len)
{
unsigned long hash = 0;
const char *p=str;
if (*len<=0) {
for(p = str; *p; p++) {
hash = hash * 33 + *p;
}
*len = p - str;
}
else {
int i = *len;
for (p = str;i; i--, p++) {
hash = hash * 33 + *p;
}
}
return hash;
}
{
unsigned long hash = 0;
const char *p=str;
if (*len<=0) {
for(p = str; *p; p++) {
hash = hash * 33 + *p;
}
*len = p - str;
}
else {
int i = *len;
for (p = str;i; i--, p++) {
hash = hash * 33 + *p;
}
}
return hash;
}
测试结果
real 0m1.418s
user 0m1.412s
sys 0m0.004s
user 0m1.412s
sys 0m0.004s
综上,我的改进版本
#define
likely(x) __builtin_expect((x),1)
#define unlikely(x) __builtin_expect((x),0)
// php版本
unsigned long time33( char const * str, int len =- 1 )
{
unsigned long hash = 5381;
/**//* variant with the hash unrolled eight times */
char const *p = str;
if (unlikely(len<0)) {
for(; *p; p++) {
hash = hash * 33 + *p;
}
return hash;
}
#define TIME33_HASH_MIXED_CH() hash = ((hash<<5)+hash) + *p++
for (; len >= 8; len -= 8) {
TIME33_HASH_MIXED_CH(); //1
TIME33_HASH_MIXED_CH(); //2
TIME33_HASH_MIXED_CH(); //3
TIME33_HASH_MIXED_CH(); //4
TIME33_HASH_MIXED_CH(); //5
TIME33_HASH_MIXED_CH(); //6
TIME33_HASH_MIXED_CH(); //7
TIME33_HASH_MIXED_CH(); //8
}
switch (len) {
case 7: TIME33_HASH_MIXED_CH(); /**//* fallthrough */
case 6: TIME33_HASH_MIXED_CH(); /**//* fallthrough */
case 5: TIME33_HASH_MIXED_CH(); /**//* fallthrough */
case 4: TIME33_HASH_MIXED_CH(); /**//* fallthrough */
case 3: TIME33_HASH_MIXED_CH(); /**//* fallthrough */
case 2: TIME33_HASH_MIXED_CH(); /**//* fallthrough */
case 1: TIME33_HASH_MIXED_CH(); break;
case 0: break;
}
return hash;
}
#undef TIME33_HASH_MIXED_CH
#define unlikely(x) __builtin_expect((x),0)
// php版本
unsigned long time33( char const * str, int len =- 1 )
{
unsigned long hash = 5381;
/**//* variant with the hash unrolled eight times */
char const *p = str;
if (unlikely(len<0)) {
for(; *p; p++) {
hash = hash * 33 + *p;
}
return hash;
}
#define TIME33_HASH_MIXED_CH() hash = ((hash<<5)+hash) + *p++
for (; len >= 8; len -= 8) {
TIME33_HASH_MIXED_CH(); //1
TIME33_HASH_MIXED_CH(); //2
TIME33_HASH_MIXED_CH(); //3
TIME33_HASH_MIXED_CH(); //4
TIME33_HASH_MIXED_CH(); //5
TIME33_HASH_MIXED_CH(); //6
TIME33_HASH_MIXED_CH(); //7
TIME33_HASH_MIXED_CH(); //8
}
switch (len) {
case 7: TIME33_HASH_MIXED_CH(); /**//* fallthrough */
case 6: TIME33_HASH_MIXED_CH(); /**//* fallthrough */
case 5: TIME33_HASH_MIXED_CH(); /**//* fallthrough */
case 4: TIME33_HASH_MIXED_CH(); /**//* fallthrough */
case 3: TIME33_HASH_MIXED_CH(); /**//* fallthrough */
case 2: TIME33_HASH_MIXED_CH(); /**//* fallthrough */
case 1: TIME33_HASH_MIXED_CH(); break;
case 0: break;
}
return hash;
}
#undef TIME33_HASH_MIXED_CH
测试结果
real 0m1.072s
user 0m1.064s
sys 0m0.000s
测试过, 重复率在 1/2000。
user 0m1.064s
sys 0m0.000s
为什么是33的倍数, PHP中注释是
DJBX33A (Daniel J. Bernstein, Times
33
with Addition)
This is Daniel J. Bernstein ' s popular `times 33 ' hash function as
posted by him years ago on comp.lang.c. It basically uses a function
like ``hash(i) = hash(i - 1 ) * 33 + str[i] '' . This is one of the best
known hash functions for strings. Because it is both computed very
fast and distributes very well.
The magic of number 33 , i.e. why it works better than many other
constants, prime or not, has never been adequately explained by
anyone. So I try an explanation: if one experimentally tests all
multipliers between 1 and 256 ( as RSE did now) one detects that even
numbers are not useable at all. The remaining 128 odd numbers
(except for the number 1 ) work more or less all equally well. They
all distribute in an acceptable way and this way fill a hash table
with an average percent of approx. 86 % .
If one compares the Chi ^ 2 values of the variants, the number 33 not
even has the best value. But the number 33 and a few other equally
good numbers like 17 , 31 , 63 , 127 and 129 have nevertheless a great
advantage to the remaining numbers in the large set of possible
multipliers: their multiply operation can be replaced by a faster
operation based on just one shift plus either a single addition
or subtraction operation. And because a hash function has to both
distribute good _and_ has to be very fast to compute, those few
numbers should be preferred and seems to be the reason why Daniel J.
Bernstein also preferred it.
-- Ralf S. Engelschall [email protected]
This is Daniel J. Bernstein ' s popular `times 33 ' hash function as
posted by him years ago on comp.lang.c. It basically uses a function
like ``hash(i) = hash(i - 1 ) * 33 + str[i] '' . This is one of the best
known hash functions for strings. Because it is both computed very
fast and distributes very well.
The magic of number 33 , i.e. why it works better than many other
constants, prime or not, has never been adequately explained by
anyone. So I try an explanation: if one experimentally tests all
multipliers between 1 and 256 ( as RSE did now) one detects that even
numbers are not useable at all. The remaining 128 odd numbers
(except for the number 1 ) work more or less all equally well. They
all distribute in an acceptable way and this way fill a hash table
with an average percent of approx. 86 % .
If one compares the Chi ^ 2 values of the variants, the number 33 not
even has the best value. But the number 33 and a few other equally
good numbers like 17 , 31 , 63 , 127 and 129 have nevertheless a great
advantage to the remaining numbers in the large set of possible
multipliers: their multiply operation can be replaced by a faster
operation based on just one shift plus either a single addition
or subtraction operation. And because a hash function has to both
distribute good _and_ has to be very fast to compute, those few
numbers should be preferred and seems to be the reason why Daniel J.
Bernstein also preferred it.
-- Ralf S. Engelschall [email protected]
其它倍数
Ngix使用的是 time31
Tokyo Cabinet使用的是 time37
Bob在他的文章说,小写英文词汇适合33, 大小写混合使用65。time33比较适合的是英文词汇的hash.