最近需要研究下文本搜索和字符串匹配算法,想到哈希的搜索性能不错,于是查找有关哈希搜索方面的算法,有幸见到rainleaf的大作,确实不错,转载至此供大家学习进步!
原文如下:(原文地址:http://blog.csdn.net/eaglewood2005/archive/2009/07/30/4394583.aspx )
近期由于需要,研究了魔兽文件打包管理器的相关算法,重点对其文件索引表的生成和查找进行了研究:采用哈希表进行,在冲突处理方面,采用线性探测再散列。在添加和查找过程中进行了三次哈希,第一个哈希值用来查找,后两个哈希值用来校验,这样可以大大减少冲突的几率。
这里对其进行了简单的封装,扩展时,仅仅需要对结构体进行扩展即可。更为详细的说明,参考代码:【转载请保留版权,谢谢】
一、类声明头文件
-
-
-
-
-
-
-
-
-
-
-
- #define MAXFILENAME 255 // 最大文件名长度
- #define MAXTABLELEN 1024 // 默认哈希索引表大小
-
-
-
- #define DEBUGTEST 1
-
-
-
- typedef struct
- {
- long nHashA;
- long nHashB;
- bool bExists;
- char test_filename[MAXFILENAME];
-
- } MPQHASHTABLE;
-
-
-
- class CHashAlgo
- {
- public :
-
- #if DEBUGTEST
- long testid;
- #endif
-
- CHashAlgo( const long nTableLength = MAXTABLELEN )
- {
- prepareCryptTable();
- m_tablelength = nTableLength;
-
- m_HashIndexTable = new MPQHASHTABLE[nTableLength];
- for ( int i = 0; i < nTableLength; i++ )
- {
- m_HashIndexTable[i].nHashA = -1;
- m_HashIndexTable[i].nHashB = -1;
- m_HashIndexTable[i].bExists = false ;
- m_HashIndexTable[i].test_filename[0] = '/0' ;
- }
- }
-
- void prepareCryptTable();
-
- unsigned long HashString( char *lpszFileName, unsigned long dwHashType);
- long GetHashTablePos( char *lpszString );
- bool SetHashTable( char *lpszString );
-
- unsigned long GetTableLength( void );
- void SetTableLength( const unsigned long nLength );
-
- ~CHashAlgo()
- {
- if ( NULL != m_HashIndexTable )
- {
- delete []m_HashIndexTable;
- m_HashIndexTable = NULL;
- m_tablelength = 0;
- }
- }
- protected :
-
- private :
- unsigned long cryptTable[0x500];
- unsigned long m_tablelength;
- MPQHASHTABLE *m_HashIndexTable;
- };
view plain copy to clipboard print ?
-
-
-
-
-
-
-
-
-
-
- #define MAXFILENAME 255 // 最大文件名长度
- #define MAXTABLELEN 1024 // 默认哈希索引表大小
-
-
- #define DEBUGTEST 1
-
-
- typedef struct
- {
- long nHashA;
- long nHashB;
- bool bExists;
- char test_filename[MAXFILENAME];
-
- } MPQHASHTABLE;
-
-
- class CHashAlgo
- {
- public :
- #if DEBUGTEST
- long testid;
- #endif
- CHashAlgo( const long nTableLength = MAXTABLELEN )
- {
- prepareCryptTable();
- m_tablelength = nTableLength;
-
- m_HashIndexTable = new MPQHASHTABLE[nTableLength];
- for ( int i = 0; i < nTableLength; i++ )
- {
- m_HashIndexTable[i].nHashA = -1;
- m_HashIndexTable[i].nHashB = -1;
- m_HashIndexTable[i].bExists = false ;
- m_HashIndexTable[i].test_filename[0] = '/0' ;
- }
- }
- void prepareCryptTable();
- unsigned long HashString( char *lpszFileName, unsigned long dwHashType);
- long GetHashTablePos( char *lpszString );
- bool SetHashTable( char *lpszString );
- unsigned long GetTableLength( void );
- void SetTableLength( const unsigned long nLength );
- ~CHashAlgo()
- {
- if ( NULL != m_HashIndexTable )
- {
- delete []m_HashIndexTable;
- m_HashIndexTable = NULL;
- m_tablelength = 0;
- }
- }
- protected :
- private :
- unsigned long cryptTable[0x500];
- unsigned long m_tablelength;
- MPQHASHTABLE *m_HashIndexTable;
- };
/////////////////////////////////////////////////////////////////////////////
// Name:        HashAlgo.h
// Purpose:     Fills and searches an index table using the Warcraft (MPQ)
//              hash algorithm.
// Author:      陈相礼
// Modified by:
// Created:     07/30/09
// RCS-ID:      $Id: treetest.h 43021 2009-07-30 16:36:51Z VZ $
// Copyright:   (C) Copyright 2009, TSong Corporation, All Rights Reserved.
// Licence:
/////////////////////////////////////////////////////////////////////////////
#ifndef HASHALGO_H
#define HASHALGO_H

#define MAXFILENAME 255     // maximum file name length
#define MAXTABLELEN 1024    // default size of the hash index table

//////////////////////////////////////////////////////////////////////////
// Test switch; turn off for production use
#define DEBUGTEST 1

//////////////////////////////////////////////////////////////////////////
// Hash index table entry
typedef struct
{
    long nHashA;                      // first value compared when a lookup probes this slot
    long nHashB;                      // second value compared when a lookup probes this slot
    bool bExists;                     // true once a string has been stored in this slot
    char test_filename[MAXFILENAME];  // copy of the string stored in this slot
    // ...... (extend the entry here)
} MPQHASHTABLE;

//////////////////////////////////////////////////////////////////////////
// Wraps the hash index table algorithms.  The object owns the slot array
// allocated in its constructor and releases it in its destructor.
class CHashAlgo
{
public:
#if DEBUGTEST
    long testid;    // for testing: slot used by the last SetHashTable(), or -1
#endif

    // Creates a hash index table of the given size; without an argument the
    // default size is used.  A non-positive length falls back to MAXTABLELEN
    // so the probing code never takes a modulus by zero.
    CHashAlgo( const long nTableLength = MAXTABLELEN )
        : m_tablelength( nTableLength > 0 ? nTableLength : MAXTABLELEN ),
          m_HashIndexTable( 0 )
    {
#if DEBUGTEST
        testid = -1;
#endif
        prepareCryptTable();

        m_HashIndexTable = new MPQHASHTABLE[m_tablelength];
        for ( unsigned long i = 0; i < m_tablelength; i++ )
        {
            m_HashIndexTable[i].nHashA = -1;
            m_HashIndexTable[i].nHashB = -1;
            m_HashIndexTable[i].bExists = false;
            // NUL terminator: the slot starts out holding an empty string.
            m_HashIndexTable[i].test_filename[0] = '\0';
        }
    }

    void prepareCryptTable();                                                 // pre-computes cryptTable
    unsigned long HashString( char *lpszFileName, unsigned long dwHashType ); // computes a hash value
    long GetHashTablePos( char *lpszString );                                 // position in the fixed-length table, -1 if absent
    bool SetHashTable( char *lpszString );                                    // hashes the string into the table

    unsigned long GetTableLength( void );
    void SetTableLength( const unsigned long nLength );

    ~CHashAlgo()
    {
        // delete[] on a null pointer is a no-op, so no explicit check is needed.
        delete [] m_HashIndexTable;
        m_HashIndexTable = 0;
        m_tablelength = 0;
    }

private:
    // The class owns a raw array, so a member-wise copy would free it twice.
    // Declared but intentionally not defined to forbid copying.
    CHashAlgo( const CHashAlgo & );
    CHashAlgo &operator=( const CHashAlgo & );

    unsigned long cryptTable[0x500];
    unsigned long m_tablelength;        // length of the hash index table
    MPQHASHTABLE *m_HashIndexTable;
};

#endif // HASHALGO_H
二、类实现文件
-
-
-
-
-
-
-
-
-
-
-
- #include "windows.h"
- #include "HashAlgo.h"
-
-
-
- void CHashAlgo::prepareCryptTable()
- {
- unsigned long seed = 0x00100001, index1 = 0, index2 = 0, i;
-
- for ( index1 = 0; index1 < 0x100; index1++ )
- {
- for ( index2 = index1, i = 0; i < 5; i++, index2 += 0x100 )
- {
- unsigned long temp1, temp2;
- seed = (seed * 125 + 3) % 0x2AAAAB;
- temp1 = (seed & 0xFFFF) << 0x10;
- seed = (seed * 125 + 3) % 0x2AAAAB;
- temp2 = (seed & 0xFFFF);
- cryptTable[index2] = ( temp1 | temp2 );
- }
- }
- }
-
-
-
- unsigned long CHashAlgo::HashString( char *lpszFileName, unsigned long dwHashType)
- {
- unsigned char *key = (unsigned char *)lpszFileName;
- unsigned long seed1 = 0x7FED7FED, seed2 = 0xEEEEEEEE;
- int ch;
-
- while (*key != 0)
- {
- ch = toupper(*key++);
-
- seed1 = cryptTable[(dwHashType << 8) + ch] ^ (seed1 + seed2);
- seed2 = ch + seed1 + seed2 + (seed2 << 5) + 3;
- }
- return seed1;
- }
-
-
-
- long CHashAlgo::GetHashTablePos( char *lpszString)
-
- {
- const unsigned long HASH_OFFSET = 0, HASH_A = 1, HASH_B = 2;
- unsigned long nHash = HashString(lpszString, HASH_OFFSET);
- unsigned long nHashA = HashString(lpszString, HASH_A);
- unsigned long nHashB = HashString(lpszString, HASH_B);
- unsigned long nHashStart = nHash % m_tablelength,
- nHashPos = nHashStart;
-
- while ( m_HashIndexTable[nHashPos].bExists)
- {
- if (m_HashIndexTable[nHashPos].nHashA == nHashA && m_HashIndexTable[nHashPos].nHashB == nHash)
- return nHashPos;
- else
- nHashPos = (nHashPos + 1) % m_tablelength;
-
- if (nHashPos == nHashStart)
- break ;
- }
-
- return -1;
- }
-
-
- bool CHashAlgo::SetHashTable( char *lpszString )
- {
- const unsigned long HASH_OFFSET = 0, HASH_A = 1, HASH_B = 2;
- unsigned long nHash = HashString(lpszString, HASH_OFFSET);
- unsigned long nHashA = HashString(lpszString, HASH_A);
- unsigned long nHashB = HashString(lpszString, HASH_B);
- unsigned long nHashStart = nHash % m_tablelength,
- nHashPos = nHashStart;
-
- while ( m_HashIndexTable[nHashPos].bExists)
- {
- nHashPos = (nHashPos + 1) % m_tablelength;
- if (nHashPos == nHashStart)
- {
-
- #if DEBUGTEST
- testid = -1;
- #endif
-
- return false ;
- }
- }
- m_HashIndexTable[nHashPos].bExists = true ;
- m_HashIndexTable[nHashPos].nHashA = nHashA;
- m_HashIndexTable[nHashPos].nHashB = nHash;
- strcpy( m_HashIndexTable[nHashPos].test_filename, lpszString );
-
- #if DEBUGTEST
- testid = nHashPos;
- #endif
-
- return true ;
- }
-
-
-
- unsigned long CHashAlgo::GetTableLength( void )
- {
- return m_tablelength;
- }
-
-
-
- void CHashAlgo::SetTableLength( const unsigned long nLength )
- {
- m_tablelength = nLength;
- return ;
- }
view plain copy to clipboard print ?
-
-
-
-
-
-
-
-
-
-
- #include "windows.h"
- #include "HashAlgo.h"
-
-
- void CHashAlgo::prepareCryptTable()
- {
- unsigned long seed = 0x00100001, index1 = 0, index2 = 0, i;
- for ( index1 = 0; index1 < 0x100; index1++ )
- {
- for ( index2 = index1, i = 0; i < 5; i++, index2 += 0x100 )
- {
- unsigned long temp1, temp2;
- seed = (seed * 125 + 3) % 0x2AAAAB;
- temp1 = (seed & 0xFFFF) << 0x10;
- seed = (seed * 125 + 3) % 0x2AAAAB;
- temp2 = (seed & 0xFFFF);
- cryptTable[index2] = ( temp1 | temp2 );
- }
- }
- }
-
-
- unsigned long CHashAlgo::HashString( char *lpszFileName, unsigned long dwHashType)
- {
- unsigned char *key = (unsigned char *)lpszFileName;
- unsigned long seed1 = 0x7FED7FED, seed2 = 0xEEEEEEEE;
- int ch;
- while (*key != 0)
- {
- ch = toupper(*key++);
- seed1 = cryptTable[(dwHashType << 8) + ch] ^ (seed1 + seed2);
- seed2 = ch + seed1 + seed2 + (seed2 << 5) + 3;
- }
- return seed1;
- }
-
-
- long CHashAlgo::GetHashTablePos( char *lpszString)
- {
- const unsigned long HASH_OFFSET = 0, HASH_A = 1, HASH_B = 2;
- unsigned long nHash = HashString(lpszString, HASH_OFFSET);
- unsigned long nHashA = HashString(lpszString, HASH_A);
- unsigned long nHashB = HashString(lpszString, HASH_B);
- unsigned long nHashStart = nHash % m_tablelength,
- nHashPos = nHashStart;
- while ( m_HashIndexTable[nHashPos].bExists)
- {
- if (m_HashIndexTable[nHashPos].nHashA == nHashA && m_HashIndexTable[nHashPos].nHashB == nHash)
- return nHashPos;
- else
- nHashPos = (nHashPos + 1) % m_tablelength;
- if (nHashPos == nHashStart)
- break ;
- }
- return -1;
- }
-
-
- bool CHashAlgo::SetHashTable( char *lpszString )
- {
- const unsigned long HASH_OFFSET = 0, HASH_A = 1, HASH_B = 2;
- unsigned long nHash = HashString(lpszString, HASH_OFFSET);
- unsigned long nHashA = HashString(lpszString, HASH_A);
- unsigned long nHashB = HashString(lpszString, HASH_B);
- unsigned long nHashStart = nHash % m_tablelength,
- nHashPos = nHashStart;
- while ( m_HashIndexTable[nHashPos].bExists)
- {
- nHashPos = (nHashPos + 1) % m_tablelength;
- if (nHashPos == nHashStart)
- {
- #if DEBUGTEST
- testid = -1;
- #endif
- return false ;
- }
- }
- m_HashIndexTable[nHashPos].bExists = true ;
- m_HashIndexTable[nHashPos].nHashA = nHashA;
- m_HashIndexTable[nHashPos].nHashB = nHash;
- strcpy( m_HashIndexTable[nHashPos].test_filename, lpszString );
- #if DEBUGTEST
- testid = nHashPos;
- #endif
- return true ;
- }
-
-
- unsigned long CHashAlgo::GetTableLength( void )
- {
- return m_tablelength;
- }
-
-
- void CHashAlgo::SetTableLength( const unsigned long nLength )
- {
- m_tablelength = nLength;
- return ;
- }
///////////////////////////////////////////////////////////////////////////// // Name: HashAlgo.cpp // Purpose: 使用魔兽Hash算法,实现索引表的填充和查找功能。 // Author: 陈相礼 // Modified by: // Created: 07/30/09 // RCS-ID: $Id: treetest.h 43021 2009-07-30 16:36:51Z VZ $ // Copyright: (C) Copyright 2009, TSong Corporation, All Rights Reserved. // Licence: ///////////////////////////////////////////////////////////////////////////// #include "windows.h" #include "HashAlgo.h" ////////////////////////////////////////////////////////////////////////// // 预处理 void CHashAlgo::prepareCryptTable() { unsigned long seed = 0x00100001, index1 = 0, index2 = 0, i; for( index1 = 0; index1 < 0x100; index1++ ) { for( index2 = index1, i = 0; i < 5; i++, index2 += 0x100 ) { unsigned long temp1, temp2; seed = (seed * 125 + 3) % 0x2AAAAB; temp1 = (seed & 0xFFFF) << 0x10; seed = (seed * 125 + 3) % 0x2AAAAB; temp2 = (seed & 0xFFFF); cryptTable[index2] = ( temp1 | temp2 ); } } } ////////////////////////////////////////////////////////////////////////// // 求取哈希值 unsigned long CHashAlgo::HashString(char *lpszFileName, unsigned long dwHashType) { unsigned char *key = (unsigned char *)lpszFileName; unsigned long seed1 = 0x7FED7FED, seed2 = 0xEEEEEEEE; int ch; while(*key != 0) { ch = toupper(*key++); seed1 = cryptTable[(dwHashType << 8) + ch] ^ (seed1 + seed2); seed2 = ch + seed1 + seed2 + (seed2 << 5) + 3; } return seed1; } ////////////////////////////////////////////////////////////////////////// // 得到在定长表中的位置 long CHashAlgo::GetHashTablePos(char *lpszString) { const unsigned long HASH_OFFSET = 0, HASH_A = 1, HASH_B = 2; unsigned long nHash = HashString(lpszString, HASH_OFFSET); unsigned long nHashA = HashString(lpszString, HASH_A); unsigned long nHashB = HashString(lpszString, HASH_B); unsigned long nHashStart = nHash % m_tablelength, nHashPos = nHashStart; while ( m_HashIndexTable[nHashPos].bExists) { if (m_HashIndexTable[nHashPos].nHashA == nHashA && m_HashIndexTable[nHashPos].nHashB == nHash) return nHashPos; 
else nHashPos = (nHashPos + 1) % m_tablelength; if (nHashPos == nHashStart) break; } return -1; //没有找到 } ////////////////////////////////////////////////////////////////////////// // 通过传入字符串,将相应的表项散列到索引表相应位置中去 bool CHashAlgo::SetHashTable( char *lpszString ) { const unsigned long HASH_OFFSET = 0, HASH_A = 1, HASH_B = 2; unsigned long nHash = HashString(lpszString, HASH_OFFSET); unsigned long nHashA = HashString(lpszString, HASH_A); unsigned long nHashB = HashString(lpszString, HASH_B); unsigned long nHashStart = nHash % m_tablelength, nHashPos = nHashStart; while ( m_HashIndexTable[nHashPos].bExists) { nHashPos = (nHashPos + 1) % m_tablelength; if (nHashPos == nHashStart) { #if DEBUGTEST testid = -1; #endif return false; } } m_HashIndexTable[nHashPos].bExists = true; m_HashIndexTable[nHashPos].nHashA = nHashA; m_HashIndexTable[nHashPos].nHashB = nHash; strcpy( m_HashIndexTable[nHashPos].test_filename, lpszString ); #if DEBUGTEST testid = nHashPos; #endif return true; } ////////////////////////////////////////////////////////////////////////// // 取得哈希索引表长 unsigned long CHashAlgo::GetTableLength(void) { return m_tablelength; } ////////////////////////////////////////////////////////////////////////// // 设置哈希索引表长 void CHashAlgo::SetTableLength( const unsigned long nLength ) { m_tablelength = nLength; return; }
三、测试主文件
-
-
-
-
-
-
-
-
-
-
-
-
-
- #define TESTNUM 32
-
- #include <iostream>
- #include <fstream>
- #include "HashAlgo.h"
-
- using namespace std;
-
-
-
- int main( int argc, char **argv )
- {
- CHashAlgo hash_test( TESTNUM );
-
- cout << "取得初始化散列索引表长为:" << hash_test.GetTableLength() << endl;
-
- bool is_success = hash_test.SetHashTable( "test" );
- if ( is_success )
- {
- cout << "散列结果一: 成功!" << endl;
- }
- else
- {
- cout << "散列结果一: 失败!" << endl;
- }
-
- is_success = hash_test.SetHashTable( " 测试" );
- if ( is_success )
- {
- cout << "散列结果二: 成功!" << endl;
- }
- else
- {
- cout << "散列结果二: 失败!" << endl;
- }
-
- long pos = hash_test.GetHashTablePos( "test" );
- cout << "查找测试字符 串:/"test/" 的散列位置:" << pos << endl;
- pos = hash_test.GetHashTablePos( "测试" );
- cout << "查找测试字符串:“测 试” 的散列位置:" << pos << endl;
-
-
-
- for ( int i = 0; i < TESTNUM; i++ )
- {
- char buff[32];
- sprintf(buff, "abcdefg%d." , i);
- is_success = hash_test.SetHashTable(buff);
- is_success ? cout << buff << "散列结果:成功!位置:" << hash_test.testid << endl : cout << buff << "散列结果:失败!" << endl;
- }
- system( "pause" );
-
-
- for ( int i = 0; i < TESTNUM; i++ )
- {
- char buff[32];
- sprintf(buff, "abcdefg%d." , i);
- pos = hash_test.GetHashTablePos( buff );
- pos != -1 ? cout << "查找测试字 符串:" << buff << " 的散列位置:" << pos << endl : cout << buff << "存在冲突!" << endl;
- }
-
- system( "pause" );
- return 0;
- }