本文将着重探索hash_set比set快速高效的原因。阅读本文前,推荐先阅读本文的姊妹篇《STL系列之六 set与hash_set》。
#pragma once
#include <cstddef> // NULL (instead of redefining the standard macro locally)

// Simplified hash_table: separate chaining over int values.
// by MoreWindows( http://blog.csdn.net/MoreWindows )

// One element of a bucket's singly linked chain.
struct Node
{
    int val;     // value stored in this node
    Node *next;  // next node of the same bucket, NULL at the end of the chain

    // Builds a detached node holding n.
    Node(int n) : val(n), next(NULL) {}
};

// Fixed-size hash table of ints; each bucket is a chain of Node.
class hash_table
{
public:
    // Allocates a table with ntablesize buckets.
    hash_table(const int ntablesize);
    ~hash_table();

    // Adds n unless it is already stored; returns true when n is in the
    // table after the call.
    bool insert(int n);
    // Inserts every value of the half-open range [pFirst, pLast).
    void insert(int *pFirst, int *pLast);
    // Returns true if n is stored in the table.
    bool find(int n);
    // Number of distinct values stored.
    int size();
    // Maps n to a bucket index.
    int HashFun(int n);

public:
    int m_nTableSize;       // number of buckets
    int m_nTableDataCount;  // number of stored values
    Node** m_ppTable;       // bucket array; each entry is the head of a chain

private:
    // The table owns m_ppTable through a raw pointer, so a member-wise copy
    // would end with two objects freeing the same memory. Copying is
    // therefore disabled (declared, intentionally never defined).
    hash_table(const hash_table &);
    hash_table &operator=(const hash_table &);
};
//简化版hash_table //by MoreWindows( http://blog.csdn.net/MoreWindows ) #include "hash_table.h" #include <malloc.h> #include <memory.h> hash_table::hash_table(const int ntablesize) { m_nTableSize = ntablesize; m_ppTable = (Node**)malloc(sizeof(Node*) * m_nTableSize); if (m_ppTable == NULL) return ; m_nTableDataCount = 0; memset(m_ppTable, 0, sizeof(Node*) * m_nTableSize); } hash_table::~hash_table() { free(m_ppTable); m_ppTable = NULL; m_nTableDataCount = 0; m_nTableSize = 0; } int inline hash_table::HashFun(int n) { return (n ^ 0xdeadbeef) % m_nTableSize; } int hash_table::size() { return m_nTableDataCount; } bool hash_table::insert(int n) { int key = HashFun(n); //在该链表中查找该数是否已经存在 for (Node *p = m_ppTable[key]; p != NULL; p = p->next) if (p->val == n) return true; //在链表的头部插入 Node *pNode = new Node(n); if (pNode == NULL) return false; pNode->next = m_ppTable[key]; m_ppTable[key] = pNode; m_nTableDataCount++; return true; } bool hash_table::find(int n) { int key = HashFun(n); for (Node *pNode = m_ppTable[key]; pNode != NULL; pNode = pNode->next) if (pNode->val == n) return true; return false; } void hash_table::insert(int *pFirst, int *pLast) { for (int *p = pFirst; p != pLast; p++) this->insert(*p); }
//测试set,hash_set及简化版hash_table // by MoreWindows( http://blog.csdn.net/MoreWindows ) #include <set> #include <hash_set> #include "hash_table.h" #include <iostream> #include <ctime> #include <cstdio> #include <cstdlib> using namespace std; using namespace stdext; //hash_set void PrintfContainerElapseTime(char *pszContainerName, char *pszOperator, long lElapsetime) { printf("%s 的 %s操作 用时 %d毫秒\n", pszContainerName, pszOperator, lElapsetime); } // MAXN个数据 MAXQUERY次查询 const int MAXN = 5000000, MAXQUERY = 5000000; int a[MAXN], query[MAXQUERY]; int main() { printf("set VS hash_set VS hash_table(简化版) 性能测试\n"); printf("数据容量 %d个 查询次数 %d次\n", MAXN, MAXQUERY); const int MAXNUM = MAXN * 4; const int MAXQUERYNUM = MAXN * 4; printf("容器中数据范围 [0, %d) 查询数据范围[0, %d)\n", MAXNUM, MAXQUERYNUM); printf("--by MoreWindows( http://blog.csdn.net/MoreWindows ) --\n\n"); //随机生成在[0, MAXNUM)范围内的MAXN个数 int i; srand((unsigned int)time(NULL)); for (i = 0; i < MAXN; ++i) a[i] = (rand() * rand()) % MAXNUM; //随机生成在[0, MAXQUERYNUM)范围内的MAXQUERY个数 srand((unsigned int)time(NULL)); for (i = 0; i < MAXQUERY; ++i) query[i] = (rand() * rand()) % MAXQUERYNUM; set<int> nset; hash_set<int> nhashset; hash_table nhashtable(MAXN + 123); clock_t clockBegin, clockEnd; //insert printf("-----插入数据-----------\n"); clockBegin = clock(); nset.insert(a, a + MAXN); clockEnd = clock(); printf("set中有数据%d个\n", nset.size()); PrintfContainerElapseTime("set", "insert", clockEnd - clockBegin); clockBegin = clock(); nhashset.insert(a, a + MAXN); clockEnd = clock(); printf("hash_set中有数据%d个\n", nhashset.size()); PrintfContainerElapseTime("hash_set", "insert", clockEnd - clockBegin); clockBegin = clock(); for (i = 0; i < MAXN; i++) nhashtable.insert(a[i]); clockEnd = clock(); printf("hash_table中有数据%d个\n", nhashtable.size()); PrintfContainerElapseTime("Hash_table", "insert", clockEnd - clockBegin); //find printf("-----查询数据-----------\n"); int nFindSucceedCount, nFindFailedCount; nFindSucceedCount = nFindFailedCount = 0; clockBegin = 
clock(); for (i = 0; i < MAXQUERY; ++i) if (nset.find(query[i]) != nset.end()) ++nFindSucceedCount; else ++nFindFailedCount; clockEnd = clock(); PrintfContainerElapseTime("set", "find", clockEnd - clockBegin); printf("查询成功次数: %d 查询失败次数: %d\n", nFindSucceedCount, nFindFailedCount); nFindSucceedCount = nFindFailedCount = 0; clockBegin = clock(); for (i = 0; i < MAXQUERY; ++i) if (nhashset.find(query[i]) != nhashset.end()) ++nFindSucceedCount; else ++nFindFailedCount; clockEnd = clock(); PrintfContainerElapseTime("hash_set", "find", clockEnd - clockBegin); printf("查询成功次数: %d 查询失败次数: %d\n", nFindSucceedCount, nFindFailedCount); nFindSucceedCount = nFindFailedCount = 0; clockBegin = clock(); for (i = 0; i < MAXQUERY; ++i) if (nhashtable.find(query[i])) ++nFindSucceedCount; else ++nFindFailedCount; clockEnd = clock(); PrintfContainerElapseTime("hash_table", "find", clockEnd - clockBegin); printf("查询成功次数: %d 查询失败次数: %d\n", nFindSucceedCount, nFindFailedCount); return 0; }
// by MoreWindows( http://blog.csdn.net/MoreWindows ) void StatisticHashTable(hash_table &ht) { const int MAXLISTLINE = 100; int i, a[MAXLISTLINE], nExtendListNum; nExtendListNum = 0; memset(a, 0, sizeof(a[0]) * MAXLISTLINE); for (i = 0; i < ht.m_nTableSize; i++) { int sum = 0; for (Node *p = ht.m_ppTable[i]; p != NULL; p = p->next) ++sum; if (sum >= MAXLISTLINE) nExtendListNum++; else a[sum]++; } printf("hash_table中链表长度统计:\n"); for (i = 0; i < MAXLISTLINE; i++) if (a[i] > 0) { printf(" 长度为%d的链表有%d个 这些链表中数据占总数据的%.2lf%%\n", i, a[i], (a[i] * i * 100.0) / ht.size()); } printf(" 长度超过%d的链表有%d个\n", MAXLISTLINE, nExtendListNum); }
hash table -- list with vector of iterators for quick access.
1. _Hash类使用的list为双向链表,但在哈希表中使用普通的单链表就可以了。因此使用STL中的vector再加入《STL系列之八 slist单链表》一文中的slist来实现强化版的hash_table。
//使用vector< slist<T> >为容器的hash_table // by MoreWindows( http://blog.csdn.net/MoreWindows ) template< class T, class container = vector<slist<T>> > class hash_table { public: hash_table(); hash_table(const int ntablesize); ~hash_table(); void clear(); bool insert(T &n); void insert(T *pFirst, T *pLast); bool erase(T &n); void resize(int nNewTableSize); bool find(T &n); int size(); int HashFun(T &n); private: static int findNextPrime(int curPrime); public: int m_nDataCount; int m_nTableSize; container m_Table; static const unsigned int m_primes[50]; }; //素数表 template< class T, class container> const unsigned int hash_table<T, container>::m_primes[50] = { 53, 97, 193, 389, 769, 1453, 3079, 6151, 1289, 24593, 49157, 98317, 196613, 393241, 786433, 1572869, 3145739, 6291469, 12582917, 25165843, 50331653, 100663319, 201326611, -1 }; template< class T, class container> int inline hash_table<T, container>::HashFun(T &n) { return (n ^ 0xdeadbeef) % m_nTableSize; } template< class T, class container> hash_table<T, container>::hash_table() { m_nDataCount = 0; m_nTableSize = m_primes[0]; m_Table.resize(m_nTableSize); } template< class T, class container> hash_table<T, container>::hash_table(const int ntablesize) { m_nDataCount = 0; m_nTableSize = ntablesize; m_Table.resize(m_nTableSize); } template< class T, class container> hash_table<T, container>::~hash_table() { clear(); } template< class T, class container> void hash_table<T, container>::clear() { for (int i = 0; i < m_nTableSize; i++) m_Table[i].clear(); m_nDataCount = 0; } template< class T, class container> bool hash_table<T, container>::insert(T &n) { int key = HashFun(n); if (!m_Table[key].find(n)) { m_nDataCount++; m_Table[key].push_front(n); if (m_nDataCount >= m_nTableSize) resize(findNextPrime(m_nTableSize)); } return true; } template< class T, class container> bool hash_table<T, container>::erase(T &n) { int key = HashFun(n); if (m_Table[key].remove(n)) { m_nDataCount--; return true; } else { return false; } } 
template< class T, class container> void hash_table<T, container>::insert(T *pFirst, T *pLast) { for (T *p = pFirst; p != pLast; p++) this->insert(*p); } template< class T, class container> void hash_table<T, container>::resize(int nNewTableSize) { if (nNewTableSize <= m_nTableSize) return; int nOldTableSize = m_nTableSize; m_nTableSize = nNewTableSize; container tempTable(m_nTableSize); //创建一个更大的表 for (int i = 0; i < nOldTableSize; i++)//将原表中数据重新插入到新表中 { Node<T> *cur = m_Table[i].m_head; while (cur != NULL) { int key = HashFun(cur->val); Node<T> *pNext = cur->next; cur->next = tempTable[key].m_head; tempTable[key].m_head = cur; cur = pNext; } m_Table[i].m_head = NULL; } m_Table.swap(tempTable); } template< class T, class container> int hash_table<T, container>::size() { return m_nDataCount; } template< class T, class container> bool hash_table<T, container>::find(T &n) { int key = HashFun(n); return m_Table[key].find(n); } //在素数表中找到比当前数大的最小数 template< class T, class container> int hash_table<T, container>::findNextPrime(int curPrime) { unsigned int *pStart = (unsigned int *)m_primes; while (*pStart <= curPrime) ++pStart; return *pStart; }
此外,本文所示范的哈希表也与最近流行的NoSQL数据库颇有渊源。NoSQL数据库也是通过Key-Value方式来访问数据的(访问数据的方式上非常类似哈希表),其查找效率与传统的数据库相比,也正如本文中hash_set与set的比较。正因为NoSQL数据库在基础数据结构上的天然优势,所以它完全可以胜任需要查询、修改海量数据且对操作性能要求很高的场合,如微博等。