搜狗细胞词库处理代码(可用于scel转txt)
2010-07-05 18:46
http://hi.baidu.com/guangbinw/blog/item/86d61009de07f28fd0581b11.html
今天先贴个简单代码,稍后再详细叙述……
基于 Qt 实现,主要是考虑到 Unicode 字符处理比较方便,
稍加修改即可用于 C 或 C++ 语言。
// 取连续两字节,转换为short类型的值,字节顺序是低字节-高字节
inline unsigned short GetUShort(QByteArray &fData, int startPos)
{
unsigned char low = fData.at(startPos);
unsigned char high = fData.at(startPos + 1);
unsigned short st = low + (high * 256);
return st;
}
// Decodes a run of 16-bit little-endian code units into a QString.
//
// fData    - buffer holding the encoded text.
// startPos - byte index of the first code unit.
// len      - maximum number of bytes to decode.
//
// Decoding stops at the first zero code unit, which is treated as a
// terminator and is NOT included in the result. The range is clamped to the
// bytes actually present in fData, and a trailing odd byte is ignored, so no
// read ever goes past the end of the buffer.
//
// Returns the decoded text; empty when the range is empty or invalid.
inline QString GetStrvalue(QByteArray &fData, int startPos, int len)
{
    QString text;
    if (startPos < 0 || len <= 0)
    {
        return text;
    }
    // Never decode beyond the end of the buffer. When startPos is already
    // past the end this becomes negative and the loop below does not run.
    const int available = static_cast<int>(fData.size()) - startPos;
    if (len > available)
    {
        len = available;
    }
    // "i + 1 < len" guarantees both bytes of the code unit are inside the range.
    for (int i = 0; i + 1 < len; i += 2)
    {
        const unsigned short unit = GetUShort(fData, startPos + i);
        if (unit == 0)
        {
            break;
        }
        text.append(QChar(unit));
    }
    return text;
}
//临时保存结果
class CHanzi
{
public:
void SetPy(QByteArray &py)
{
m_py = py;
}
void SetHz(QByteArray &hz)
{
m_hz = GetStrvalue(hz,0,hz.size());
}
void Debug()
{
QString py = "";
for (int i = 0 ; i < m_py.length() ; i += 2)
{
py += pyList.at(GetUShort(m_py,i));
}
qDebug() << m_hz << ":" << py;
}
private:
QByteArray m_py;
QString m_hz;
};
// Step 1: read the pinyin table.
//
// fData  - complete contents of the dictionary file.
// pyList - receives the pinyin strings in the order they appear in the file.
//
// The table is expected at byte offset 0x1540 and must begin with the four
// bytes 9D 01 00 00; otherwise a failure message is logged and pyList is left
// untouched. Each entry that follows consists of a 16-bit index, a 16-bit byte
// length, and that many bytes of 16-bit-per-letter text. Reading stops after
// the entry "zuo" (the last pinyin), at the end of the data, or at the first
// entry that is malformed (odd length or extending past the end of the file).
inline void ReadPyTable(QByteArray &fData,QList<QString> &pyList)
{
    const int startPos = 0x1540;
    // Compare the actual bytes with an explicit length: the signature contains
    // NUL bytes, and comparing data() pointers would never match anything.
    const QByteArray fFlag = fData.mid(startPos, 4);
    if (fFlag != QByteArray("\x9D\x01\x00\x00", 4))
    {
        qDebug() << "读取词库拼音表失败!";
        return;
    }
    const int total = fData.length();
    int pos = startPos + 4;
    // Each entry needs at least its 4-byte header (index + length).
    while (pos + 4 <= total)
    {
        // 16-bit index number identifying this pinyin.
        const unsigned short n = GetUShort(fData, pos);
        // 16-bit length of the pinyin in bytes (twice the number of letters).
        const unsigned short len = GetUShort(fData, pos + 2);
        pos += 4;
        // A valid entry holds whole 16-bit units and fits inside the file.
        if (len % 2 != 0 || pos + len > total)
        {
            break;
        }
        // Every letter occupies 16 bits.
        QString py = GetStrvalue(fData, pos, len);
        qDebug() << n << ":" << py;
        pyList.push_back(py);
        pos += len;
        // "zuo" is the last pinyin in the table.
        if (py == "zuo")
        {
            break;
        }
    }
}
// Step 2: read the word table.
//
// fData  - complete contents of the dictionary file.
// hzList - receives one CHanzi per record that is read.
//
// Records start at byte offset 0x2628. Each record is read as:
//   16-bit count    - used only to compute where the next record starts
//   16-bit len1     - byte length of the pinyin index list
//   len1 bytes      - 16-bit indices into the pinyin table
//   16-bit len2     - byte length of the word
//   len2 bytes      - the word, 16 bits per character
// followed by an area that is skipped without being decoded.
//
// Parsing stops at the end of the data or at the first record that is
// malformed (zero count, odd length, or extending past the end of the file),
// so truncated or corrupt input cannot cause out-of-range reads.
inline void ReadHzTable(QByteArray &fData,QList<CHanzi> &hzList)
{
    const int total = fData.length();
    int pos = 0x2628;
    // Each record needs at least its 4-byte header (count + len1).
    while (pos + 4 <= total)
    {
        const unsigned short count = GetUShort(fData, pos);
        const unsigned short len1 = GetUShort(fData, pos + 2);
        pos += 4;
        // count == 0 would make "count - 1" wrap around; the pinyin list must
        // hold whole 16-bit indices and leave room for the 16-bit len2 field.
        if (count == 0 || len1 % 2 != 0 || pos + len1 + 2 > total)
        {
            break;
        }
        // Named locals: a temporary cannot bind to a non-const reference.
        QByteArray pyBytes = fData.mid(pos, len1);
        pos += len1;

        const unsigned short len2 = GetUShort(fData, pos);
        pos += 2;
        if (len2 % 2 != 0 || pos + len2 > total)
        {
            break;
        }
        QByteArray hzBytes = fData.mid(pos, len2);
        pos += len2;

        CHanzi hanzi;
        hanzi.SetPy(pyBytes);
        hanzi.SetHz(hzBytes);
        hzList.push_back(hanzi);

        // Distance to the next record: 12 bytes after the word, plus
        // (12 + len2 + 2) bytes for each unit of count beyond the first.
        // The skipped area has not been analysed; it may hold word frequency
        // or similar information.
        // NOTE(review): it presumably also holds further words sharing this
        // pinyin, which are not added to hzList — confirm against the format.
        // Computed in 64 bits because the product can exceed the range of int.
        const long long skip =
            12LL + static_cast<long long>(count - 1) * (12 + len2 + 2);
        if (skip >= total - pos)
        {
            break;
        }
        pos += static_cast<int>(skip);
    }
}
// Step 3: read and log the dictionary name, category, description and sample.
//
// fData - complete contents of the dictionary file.
//
// The file must start with the eight bytes 40 15 00 00 44 43 53 01 and be
// long enough to contain every field read below; otherwise a hint is logged
// and nothing else is printed. Each field is decoded from a fixed offset as
// 16-bit-per-character text, reading at most 128 bytes.
inline void ReadFileInfo(QByteArray &fData)
{
    const int fieldBytes = 128;
    const int lastFieldEnd = 0xd40 + fieldBytes;
    // Compare the actual bytes with an explicit length: the signature contains
    // NUL bytes, and comparing data() pointers would never match anything.
    const QByteArray fFlag = fData.mid(0, 8);
    if (fData.size() < lastFieldEnd ||
        fFlag != QByteArray("\x40\x15\x00\x00\x44\x43\x53\x01", 8))
    {
        qDebug() << "确认你选择的是搜狗(.scel)词库?";
        return ;
    }
    QString strName;
    // Dictionary name.
    strName = GetStrvalue(fData, 0x130, fieldBytes);
    qDebug() << "词库名:" << strName;
    // Dictionary category.
    strName = GetStrvalue(fData, 0x338, fieldBytes);
    qDebug() << "词库类型:" << strName;
    // Description.
    strName = GetStrvalue(fData, 0x540, fieldBytes);
    qDebug() << "描述信息:" << strName;
    // Sample words.
    strName = GetStrvalue(fData, 0xd40, fieldBytes);
    qDebug() << "词库示例:" << strName;
}