搜狗细胞词库处理代码(可用于scel转txt)

搜狗细胞词库处理代码(可用于scel转txt)

2010-07-05 18:46

http://hi.baidu.com/guangbinw/blog/item/86d61009de07f28fd0581b11.html

今天先贴个简单代码,稍后再详细叙述…… 

基于Qt实现,主要是考虑Unicode字符处理的方便,

稍加处理(替换QByteArray、QString等Qt类型)即可用于不依赖Qt的C或C++程序。

 

// 取连续两字节,转换为short类型的值,字节顺序是低字节-高字节

inline unsigned short GetUShort(QByteArray &fData, int startPos)

{

unsigned char low = fData.at(startPos);

unsigned char high = fData.at(startPos + 1);

unsigned short st = low + (high * 256);

 

return st;

}

 

// Converts a run of 16-bit little-endian code units into a QString.
//
// fData    - buffer holding the encoded text.
// startPos - byte offset of the first code unit.
// len      - maximum number of BYTES to decode (two bytes per code unit).
//
// Decoding stops at the first zero code unit, at len bytes, or at the end of
// the buffer, whichever comes first. The terminating zero is NOT included in
// the returned string, so the result compares equal to ordinary literals.
inline QString GetStrvalue(QByteArray &fData, int startPos, int len)
{
    QString text;

    if (startPos < 0)
    {
        return text;
    }

    // Clamp the requested length so that a bogus len from a damaged file
    // cannot push the reads past the end of the buffer.
    const int available = fData.size() - startPos;
    if (len > available)
    {
        len = available;
    }

    // "i + 1 < len" ensures a complete two-byte unit is available; a trailing
    // odd byte is ignored rather than paired with a byte outside the range.
    for (int i = 0; i + 1 < len; i += 2)
    {
        const unsigned short unit = GetUShort(fData, startPos + i);

        // Check for the terminator before appending so it never ends up
        // inside the string.
        if (unit == 0)
        {
            break;
        }

        text.append(QChar(unit));
    }

    return text;
}

 

//临时保存结果

class CHanzi

{

public:

void SetPy(QByteArray &py)

{

m_py = py;

}

 

void SetHz(QByteArray &hz)

{

m_hz = GetStrvalue(hz,0,hz.size());

}

 

void Debug()

{

QString py = "";

 

for (int i = 0 ; i < m_py.length() ; i += 2)

{

py += pyList.at(GetUShort(m_py,i));

}

 

qDebug() << m_hz << ":" << py;

}

 

private:

QByteArray m_py;

QString m_hz;

};

 

// 1. Reads the pinyin table that starts at file offset 0x1540.
//
// fData  - complete contents of the dictionary file.
// pyList - receives the pinyin strings in file order (appended).
//
// The table must begin with the 4 bytes 9D 01 00 00; otherwise an error is
// logged and pyList is left untouched. Each following entry is:
//   16-bit index, 16-bit byte length, then that many bytes of 16-bit text.
// Reading stops after the entry "zuo" (the last pinyin) or when the data runs
// out.
inline void ReadPyTable(QByteArray &fData,QList<QString> &pyList)
{
    const int startPos = 0x1540;

    // Compare the actual bytes. The previous check compared two pointers
    // (always unequal), looked at the wrong buffer, spelled the escapes as
    // "/x" and reported failure on a match, so it could never reject a file.
    // The explicit size is required because the flag contains NUL bytes.
    const QByteArray expectedFlag("\x9D\x01\x00\x00", 4);
    if (fData.mid(startPos, 4) != expectedFlag)
    {
        qDebug() << "读取词库拼音表失败!";
        return;
    }

    int pos = 4;

    // Every entry needs at least its two 16-bit header fields.
    while (startPos + pos + 4 <= fData.length())
    {
        // 16-bit index identifying this pinyin.
        const unsigned short n = GetUShort(fData, startPos + pos);
        pos += 2;

        // 16-bit length of the pinyin text in bytes (two bytes per letter).
        const unsigned short len = GetUShort(fData, startPos + pos);
        pos += 2;

        // A length that runs past the end of the file means the table is
        // truncated; stop instead of decoding a partial entry.
        if (startPos + pos + len > fData.length())
        {
            break;
        }

        const QString py = GetStrvalue(fData, startPos + pos, len);
        qDebug() << n << ":" << py;
        pyList.push_back(py);
        pos += len;

        // "zuo" is the final pinyin of the table.
        if (py == "zuo")
        {
            break;
        }
    }
}

 

// 2. Reads the word table that starts at file offset 0x2628.
//
// fData  - complete contents of the dictionary file.
// hzList - receives one CHanzi per word (appended).
//
// Record layout as parsed here:
//   16-bit word count        number of words stored in this record
//   16-bit pinyin length     in bytes
//   pinyin index bytes       16-bit indexes into the pinyin table
//   then, for each word:
//     16-bit word length     in bytes
//     word text              16-bit code units
//     16-bit trailer length  followed by that many bytes, skipped here
//
// NOTE(review): the field meanings follow the commonly described .scel layout
// (words in one record share the same pinyin; the trailer presumably holds
// frequency-like data) - confirm against sample files. When the trailer is
// 10 bytes and all words of a record have the same length, this lands on the
// same next-record position as the former formula
// "12 + (count - 1) * (12 + len + 2)", but it also returns the additional
// words that used to be skipped and does not wrap around when count is 0.
inline void ReadHzTable(QByteArray &fData,QList<CHanzi> &hzList)
{
    const int total = fData.length();
    int pos = 0x2628;

    // Every record needs at least its two 16-bit header fields.
    while (pos + 4 <= total)
    {
        const unsigned short wordCount = GetUShort(fData, pos);
        pos += 2;

        const unsigned short pyLen = GetUShort(fData, pos);
        pos += 2;

        if (pos + pyLen > total)
        {
            return; // truncated record
        }

        // Named local: SetPy must not be handed a temporary.
        QByteArray pyBytes = fData.mid(pos, pyLen);
        pos += pyLen;

        for (unsigned short w = 0; w < wordCount; ++w)
        {
            if (pos + 2 > total)
            {
                return; // truncated record
            }

            const unsigned short hzLen = GetUShort(fData, pos);
            pos += 2;

            // The word text and the 16-bit trailer length must both be present.
            if (pos + hzLen + 2 > total)
            {
                return; // truncated record
            }

            QByteArray hzBytes = fData.mid(pos, hzLen);
            pos += hzLen;

            // Skip the trailer using its stored length rather than a
            // hard-coded size.
            const unsigned short extLen = GetUShort(fData, pos);
            pos += 2 + extLen;

            CHanzi hanzi;
            hanzi.SetPy(pyBytes);
            hanzi.SetHz(hzBytes);
            hzList.push_back(hanzi);
        }
    }
}

 

// 3. Prints the dictionary name, category, description and example words.
//
// fData - complete contents of the dictionary file.
//
// The file must start with the 8 bytes 40 15 00 00 44 43 53 01 and be long
// enough to contain all four text fields; otherwise a hint is logged and
// nothing else is printed. Each field is read as at most 128 bytes of 16-bit
// text from a fixed offset.
inline void ReadFileInfo(QByteArray &fData)
{
    const int fieldBytes = 128;
    const int nameOffset = 0x130;
    const int typeOffset = 0x338;
    const int descOffset = 0x540;
    const int sampleOffset = 0xd40;

    // Compare the actual bytes. The previous check compared two pointers
    // (always unequal), spelled the escapes as "/x" and reported an error on
    // a match, so it could never reject a file. The explicit size is required
    // because the signature contains NUL bytes.
    const QByteArray expectedMagic("\x40\x15\x00\x00\x44\x43\x53\x01", 8);
    const bool tooShort = fData.length() < sampleOffset + fieldBytes;
    if (fData.mid(0, 8) != expectedMagic || tooShort)
    {
        qDebug() << "确认你选择的是搜狗(.scel)词库?";
        return;
    }

    QString strName;

    strName = GetStrvalue(fData, nameOffset, fieldBytes);
    qDebug() << "词库名:" << strName;

    strName = GetStrvalue(fData, typeOffset, fieldBytes);
    qDebug() << "词库类型:" << strName;

    strName = GetStrvalue(fData, descOffset, fieldBytes);
    qDebug() << "描述信息:" << strName;

    strName = GetStrvalue(fData, sampleOffset, fieldBytes);
    qDebug() << "词库示例:" << strName;
}

你可能感兴趣的:(c,Class,语言,qt,搜狗,2010)