近期工作中要处理100W条记录,前一个同事使用SQLLite数据作为数据结构存储,采用数据库查询一条记录,时间当然不成问题.后来,我将数据库的数据导出来,发现由原来的150M多变成20M 多,而装载数据库存需要25S,这种用空间换时间的做法未免牺牲太大.所以我想到不用数据库.而用普通的文件存储,要解决查询一条记录肯定要使用某种数据结构.首先想到的是STL中的hash_set,它采用hash_table作为底层数据结构,查询肯定不成问题,可惜初始化所需时间无法接受,我想可能hash_table的内部表格重建过于复杂,而我所要的并不复杂,于是我想写一个简单的hash_table,只要能进行插入,查询就可以,而且只针对字符串就行了,所以查看一些资料,写了一个简单的Hash_Table.
从底层来看,Hash_Table与stl中的hash_table的区别在于,内存管理不一样,hash_table采用std::alloc,Hash_Table直接使用new.还有一点不同的表格重建的策略不一样.
template
class CMyHashTable
{
public:
typedef const Value* ValPointer;
typedef const Value& ValRef;
struct ValNode
{
Value Data;
DWORD Key;
ValNode* pNext;
};
typedef ValNode* NodePointer;
typedef const ValNode* NodeCPointer;
typedef ValNode& NodeRef;
typedef const ValNode& NodeCRef;
protected:
const int* m_boxSize;
int m_boxCount;
int m_level;
int m_nbox; //桶的大小,初始为1024
int m_limh;
int m_liml;
NodePointer* m_boxes;
int m_size; //元素的问个数
public:
CMyHashTable(int boxCount, const int* boxSize, int initLevel);
~CMyHashTable();
public:
ValPointer find(ValRef value) const
{
ValNode s;
_fill_rec(s, value);
return &_find(s)->Data;
}
bool insert(ValRef value)
{
ValNode s;
_fill_rec(s, value);
if (_find(s) != NULL)
return false;
_insert_rec(s);
return true;
}
void insert_force(ValRef value)
{
ValNode s;
_fill_rec(s, value);
_insert_rec(s);
}
bool remove(ValRef value)
{
ValNode s;
_fill_rec(s, value);
bool rtv = _find_remove(s);
if (rtv)
{
--m_size;
if (m_size < m_liml)
decrease_level();
}
return rtv;
}
int size() const { return m_size; }
int capacity() const { return m_limh; }
void for_each(void (*fp)(ValRef));
protected:
inline DWORD _short_key(DWORD key) const
{
return key & (m_nbox - 1);
}
inline static DWORD _short_key(DWORD key, DWORD nbox)
{
return key & (nbox - 1);
}
NodeCPointer _find(NodeCRef) const;
bool _find_remove(NodeCRef);
void _insert_rec(NodeCRef rec)
{
++m_size;
NodePointer pnew = new ValNode;
*pnew = rec;
DWORD short_key = _short_key(rec.Key);
pnew->pNext = m_boxes[short_key];
m_boxes[short_key] = pnew;
if (m_size >= m_limh)
increase_level();
}
void switch_level(int new_level);
void increase_level()
{
if (m_level < m_boxCount - 1)
switch_level(m_level + 1);
else
m_limh = 0x20000000; // Unable to increase level
}
void decrease_level()
{
if (m_level > 0)
switch_level(m_level - 1);
else
m_liml = 0; // Unable to decrease level
}
static void _fill_rec(NodeRef s, ValRef value)
{
s.Data = value;
s.Key= HashFun()(value);
s.pNext = NULL;
}
};
template
CMyHashTable ::CMyHashTable(int boxCount, const int* boxSize, int initLevel)
:m_boxCount(boxCount), m_boxSize(boxSize)
{
m_level = initLevel;
m_limh = m_nbox = m_boxSize[m_level];
m_liml = initLevel > 0 ? m_boxSize[initLevel - 1] / 2 : 0;
m_boxes = new NodePointer[m_nbox];
m_size = 0;
memset(m_boxes, 0, sizeof(NodePointer) * m_nbox);
}
template
CMyHashTable ::~CMyHashTable()
{
for (int i = 0; i < m_nbox; ++i)
{
NodePointer p = m_boxes[i];
while (p != NULL)
{
NodePointer q = p->pNext;
delete p;
p = q;
}
}
delete[] m_boxes;
}
template
typename CMyHashTable ::NodeCPointer CMyHashTable ::_find(NodeCRef r) const
{
DWORD short_key = _short_key(r.Key);
NodePointer p = m_boxes[short_key];
while (p != NULL)
{
if (p->Key == r.Key)
{
if (EqualKey()(p->Data, r.Data))
return p;
}
p = p->pNext;
}
return NULL;
}
template
bool CMyHashTable ::_find_remove(NodeCRef r)
{
DWORD short_key = _short_key(r.Key);
NodePointer* p = &m_boxes[short_key];
while (*p != NULL)
{
if ((*p)->Key == r.Key)
{
if (EqualKey()((*p)->value, r.value))
{
NodePointer q = *p;
*p = q->next;
delete q;
return true;
}
}
p = &(*p)->next;
}
return false;
}
template
void CMyHashTable ::switch_level(int new_level)
{
int new_n = m_boxSize[m_level = new_level];
NodePointer* new_boxes = new NodePointer[new_n];
memset(new_boxes, 0, sizeof(NodePointer) * new_n);
for (int i = 0; i < m_nbox; ++i)
{
NodePointer p = m_boxes[i], q;
while (p != NULL)
{
q = p->pNext;
DWORD sht_key = _short_key(p->Key, new_n);
p->pNext = new_boxes[sht_key];
new_boxes[sht_key] = p;
p = q;
}
}
m_limh = m_nbox = new_n;
m_liml = m_level > 0 ? m_boxSize[m_level - 1] / 2 : 0;
delete[] m_boxes;
m_boxes = new_boxes;
}
另外使用这个的两个仿函数
struct EPSTR
{
bool operator()(const string& s1, const string& s2) const
{
return strcmp(s1.c_str(), s2.c_str()) == 0;
}
};
struct HASH_FUN
{
size_t operator()(const string& str) const
{
const char* psz = str.c_str();
unsigned long _hash = 0;
for (; *psz != 0; ++psz)
{
_hash *= 16777619;
_hash ^= (unsigned long) (unsigned short&)*psz;
}
return _hash;
}
};
我使用这个HASH_TABLE装载100W条数据,只要16S,查询一条记录在1MS以内.应该说比使用数据库要好得多.