//细胞词库解码C源程序,输出为UNICODE的TXT格式
/*
 * 说明: sogo输入法细胞词库解析源程序dis_sogo_cell.c,将.scel文件解码输出unicode格式的文本文件,
 * 可以提取出大量的中文词语,输出拼音及其中文词语,每一条记录一行,
 * 每条记录的拼音和中文词语用分号分隔,同音词之间以逗号分隔,格式如下所示:
 *   bei di ;北地,北堤,
 *   bei gou ;北沟,杯勾,
 *   cai feng xiao qu ;彩俸小区,彩凤小区,彩风小区,
 *   chang cheng shu dian ;长城书店,
 *   chang ge zhuang cun ;常各庄村,
 *   chang he da sha ;长和大厦,长河大厦,
 *
 * 使用方法(Linux下):
 * 1.编译:gcc dis_sogo_cell.c -o dis_sogo_cell 或直接make
 * 2.使用:./dis_sogo_cell sogo_scel_file.scel > sogo_scel_file.scel.txt
 * 3.用shell脚本批量处理,将当前目录下的.scel文件转换为TXT文件:
 *     #!/bin/bash
 *     for scel_file in `ls *.scel`
 *     do
 *       ./dis_sogo_cell ${scel_file} > ${scel_file}.txt
 *     done
 * 4.使用程序包内提供的cell2txt.sh,在shell下直接执行即可将当前目录下的所有.scel文件提取为文本文件(UNICODE格式的)。如:
 *     # ./cell2txt.sh
 * 5.如需将生成的unicode 的文本文件转为ANSI编码的TXT文件,可以利用Linux的iconv命令,如:
 *     # iconv -futf-16 -tGB18030 sogo_scel_file.scel.txt -o sogo_scel_file.scel_ANSI.txt
 *   转换编码后的文件sogo_scel_file.scel_ANSI.txt将比原来的sogo_scel_file.scel.txt文件的字节数减小约50%,
 *   可极大节省存储空间,同时便于使用不支持UNICODE的文本编辑器查看输出的结果。
 *
 * C代码如下:
 */
/*
* dis_sogo_cell.c
*
* snallieATtomDOTcom
* Sun Nov 7 06:30:00 CST 2014
*
* decoding sogo .scel file,
* output Chinese PinYin string and Chinese word in unicode
*
* Example of output data:
* bei di ;北地,北堤,
* bei gou ;北沟,杯勾,
* cai feng xiao qu ;彩俸小区,彩凤小区,彩风小区,
* chang cheng shu dian ;长城书店,
* chang ge zhuang cun ;常各庄村,
* chang he da sha ;长和大厦,长河大厦,
*
* to make under Linux: # gcc dis_sogo_cell.c -o dis_sogo_cell
* usage : ./dis_sogo_cell sogo_scel_file.scel > sogo_scel_file.scel.txt
*
* !! sogo_scel_file.scel.txt is a TXT file in unicode !!
*
* to invoke in shell script:
* for scel_file in `ls *.scel` ; do ./dis_sogo_cell ${scel_file} > ${scel_file}.txt ; done
*
*/
/*
* 搜狗的scel词库就是保存的文本的unicode编码,每两个字节一个字符(中文汉字或者英文字母)
* 找出其每部分的偏移位置即可
* 主要两部分
* 1.全局拼音表,貌似是所有的拼音组合,字典序
* 格式为(index,len,pinyin)的列表
* index: 两个字节的整数 代表这个拼音的索引
* len: 两个字节的整数 拼音的字节长度
* pinyin: 当前的拼音,每个字符两个字节,总长len
*
* 2.汉语词组表
* 格式为(same_pronounce_num,py_table_len,py_table,{word_len,word,ext_len,ext})的一个列表
* same_pronounce_num: 两个字节 整数 同音词数量
* py_table_len: 两个字节 整数
* py_table: 整数列表,每个整数两个字节,每个整数代表一个拼音的索引
*
* word_len:两个字节 整数 代表中文词组字节数长度
* word: 中文词组,每个中文汉字两个字节,总长度word_len
* ext_len: 两个字节 整数 代表扩展信息的长度,好像都是10
* ext: 扩展信息 前两个字节是一个整数(不知道是不是词频) 后八个字节全是0
*
* {word_len,word,ext_len,ext} 一共重复same_pronounce_num次 同音词 相同拼音表
*/
#include <errno.h>
#include <stddef.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
extern int errno;
typedef enum bool_t { false, true } bool;
#include <stdint.h>
/*
int8_t = 1, uint8_t = 1
int16_t = 2, uint16_t = 2
int32_t = 4, uint32_t = 4
int64_t = 8, uint64_t = 8
int_least8_t = 1, uint_least8_t = 1
int_least16_t = 2, uint_least16_t = 2
int_least32_t = 4, uint_least32_t = 4
int_least64_t = 8, uint_least64_t = 8
int_fast8_t = 1, uint_fast8_t = 1
int_fast16_t = 4, uint_fast16_t = 4
int_fast32_t = 4, uint_fast32_t = 4
int_fast64_t = 8, uint_fast64_t = 8
*/
/* File offset handed to read_py_item() as the base of the pinyin table. */
int start_PY = 0x1540;
/* File offset at which main() starts reading word records; main() resets it
 * to 0x2628 or 0x26c4 depending on byte 4 of the file header. */
int startChinese = 0x2628; // 0x26c4
/* Value returned by the first read_py_item() call in main(); main() uses it
 * as the number of pinyin items to load and as the modulus for table lookups. */
int count_py;
/* Index of the cel_py_tab slot that read_py_item() fills next; incremented
 * after each stored entry. */
int py_cel_idx = 0;
/* One entry of the in-memory pinyin table filled by read_py_item(). */
typedef struct py_t {
int16_t index; /* index value read from the file for this entry */
int16_t len; /* byte length of the pinyin text as read from the file */
char pinyin[30]; /* raw pinyin bytes copied by put_py_tab(); no terminator is written */
} py_tab;
/* The pinyin table itself: 0x280 (640) slots, filled in file order. */
py_tab cel_py_tab[0x280];
/* Input .scel stream; opened by main() and read by read_py_item(). */
FILE *in_file;
/* NOTE(review): errno is already declared earlier in this file; this repeat looks redundant. */
extern int errno;
/* Scratch storage that read_py_item() reads one pinyin item into:
 * the item's index, its byte length, and its raw bytes. */
int16_t index3;
int16_t len;
char pinyin[30];
/* Write a CR LF line break to stdout, each character as two bytes (low byte first). */
void print_unicode_crlr()
{
    static const unsigned char line_break[] = { 0x0d, 0x00, 0x0a, 0x00 };

    fwrite(line_break, 1, sizeof line_break, stdout);
}
/* Write a single space to stdout as two bytes: ' ' followed by a zero byte. */
void print_unicode_space()
{
    putchar(' ');
    putchar('\0');
}
/*
 * Write one character to stdout as two bytes: the low seven bits of `a`
 * followed by a zero byte.
 */
void print_ascii_in_unicode(unsigned char a)
{
    const int low_bits = a & 0x7f;

    putchar(low_bits);
    putchar(0);
}
/*
 * Write a NUL-terminated byte string to stdout, passing every byte before
 * the terminator through print_ascii_in_unicode().
 */
void print_ascii_str_in_unicode(unsigned char *ascii_str)
{
    unsigned char *cursor;

    for (cursor = ascii_str; *cursor != 0; cursor++) {
        print_ascii_in_unicode(*cursor);
    }
}
/*
 * Write the raw pinyin bytes of one table entry to stdout.
 *
 * idx is reduced modulo count_py to select the entry. Nothing is printed
 * when the table is empty (count_py <= 0) or when the selected slot lies
 * outside cel_py_tab, so a bad index can no longer divide by zero or read
 * outside the table. The byte count is capped at the size of the entry's
 * pinyin buffer in case the stored length is larger than the buffer.
 */
void print_py(int idx)
{
    const int capacity = (int)(sizeof cel_py_tab / sizeof cel_py_tab[0]);
    int slot;
    int nbytes;
    int i;

    if (count_py <= 0) {
        return;
    }
    slot = idx % count_py;
    if (slot < 0) {
        /* C's % keeps the sign of the dividend; fold back into range. */
        slot += count_py;
    }
    if (slot >= capacity) {
        return;
    }
    nbytes = cel_py_tab[slot].len;
    if (nbytes > (int)sizeof cel_py_tab[slot].pinyin) {
        nbytes = (int)sizeof cel_py_tab[slot].pinyin;
    }
    for (i = 0; i < nbytes; i++) {
        printf("%c", cel_py_tab[slot].pinyin[i]);
    }
}
/*
 * Write `count` raw bytes starting at start_pos to stdout, unchanged.
 * A count of zero or less writes nothing. update_py_tab is accepted but
 * not used.
 */
void print_char2(unsigned char *start_pos, int count, bool update_py_tab)
{
    (void)update_py_tab;

    if (count > 0) {
        fwrite(start_pos, 1, (size_t)count, stdout);
    }
}
/*
 * Copy `count` bytes from start_pos into the pinyin buffer of the table
 * entry currently selected by py_cel_idx. The destination offset wraps
 * around at the buffer size, so bytes past the end overwrite the start.
 */
void put_py_tab(unsigned char *start_pos, int count)
{
    char *dest = cel_py_tab[py_cel_idx].pinyin;
    int n = 0;

    while (n < count) {
        dest[n % sizeof cel_py_tab[0].pinyin] = start_pos[n];
        n++;
    }
}
/*
 * Read one (index, len, pinyin) item of the pinyin table at file offset
 * `pos` and, when it carries text, append it to cel_py_tab.
 *
 * The two 16-bit fields are decoded as little-endian regardless of host
 * byte order. The globals index3, len and pinyin are updated as scratch
 * storage, as before.
 *
 * Returns:
 *   - 2 + 2 + len (the item's size in the file) when len > 0 and the item
 *     was stored; the caller adds this to its running offset;
 *   - the item's index field when len <= 0 (main() relies on this for the
 *     leading count word of the table).
 *
 * Exits with status 1, after a message on stderr, when the seek fails, the
 * file ends inside the item, or cel_py_tab has no free slot left.
 *
 * A length larger than the pinyin buffer is treated as damaged input: only
 * the bytes that fit are read and stored (and the stored length is reduced
 * to match), but the full on-disk size is still returned so the caller
 * advances past the whole item.
 */
int read_py_item(int pos)
{
    const int capacity = (int)(sizeof cel_py_tab / sizeof cel_py_tab[0]);
    const int max_text = (int)(sizeof pinyin < sizeof cel_py_tab[0].pinyin
                               ? sizeof pinyin
                               : sizeof cel_py_tab[0].pinyin);
    unsigned char head[4];
    int stored;

    if (fseek(in_file, pos, SEEK_SET) != 0) {
        /* Diagnostics go to stderr so they never mix into the decoded output. */
        fprintf(stderr, "Seek error\n");
        exit(1);
    }
    if (fread(head, 1, sizeof head, in_file) != sizeof head) {
        fprintf(stderr, "Read error: pinyin table truncated at offset 0x%x\n",
                (unsigned int)pos);
        exit(1);
    }
    index3 = (int16_t)(uint16_t)(head[0] | (head[1] << 8));
    len = (int16_t)(uint16_t)(head[2] | (head[3] << 8));

    if (len <= 0) {
        return index3;
    }
    if (py_cel_idx < 0 || py_cel_idx >= capacity) {
        fprintf(stderr, "Pinyin table overflow: more than %d entries\n",
                capacity);
        exit(1);
    }

    /* Never read more than the buffers can hold. */
    stored = len > max_text ? max_text : len;
    if (fread(pinyin, 1, (size_t)stored, in_file) != (size_t)stored) {
        fprintf(stderr, "Read error: pinyin table truncated at offset 0x%x\n",
                (unsigned int)pos);
        exit(1);
    }
    cel_py_tab[py_cel_idx].index = index3;
    cel_py_tab[py_cel_idx].len = (int16_t)stored;
    put_py_tab((unsigned char *)pinyin, stored);
    py_cel_idx++;
    return 2 + 2 + len;
}
/* Emit the Name/Type/Desc/Smpl header fields before the word list. */
#define OUT_HEADER
/*
 * Optional build flags:
 *   -DDEBUG       trace offsets and table contents into the output
 *   -DERR_OUTPUT  report an out-of-range same-sound count into the output
 */

/* Write a NUL-terminated ASCII string as two-byte characters via print_ascii_in_unicode(). */
static void put_ascii(const char *text)
{
    for (; *text != '\0'; text++) {
        print_ascii_in_unicode((unsigned char)*text);
    }
}

/* Read one 16-bit little-endian integer from in_file; returns 1 on success, 0 on a short read. */
static int read_u16le(uint16_t *value)
{
    unsigned char raw[2];

    if (fread(raw, 1, sizeof raw, in_file) != sizeof raw) {
        return 0;
    }
    *value = (uint16_t)(raw[0] | (raw[1] << 8));
    return 1;
}

/* Copy `remaining` bytes from in_file to stdout unchanged, in bounded chunks; returns 0 on a short read. */
static int copy_word_bytes(size_t remaining)
{
    unsigned char chunk[256];

    while (remaining > 0) {
        const size_t want = remaining < sizeof chunk ? remaining : sizeof chunk;

        if (fread(chunk, 1, want, in_file) != want) {
            return 0;
        }
        print_char2(chunk, (int)want, false);
        remaining -= want;
    }
    return 1;
}

/* Print the pinyin text of table entry (py_index % count_py); requires 0 < count_py <= table size. */
static void print_pinyin_entry(int py_index)
{
    const int slot = py_index % count_py;
    int nbytes = cel_py_tab[slot].len;

    /* Guard against a stored length larger than the entry's buffer. */
    if (nbytes > (int)sizeof cel_py_tab[slot].pinyin) {
        nbytes = (int)sizeof cel_py_tab[slot].pinyin;
    }
    print_char2((unsigned char *)cel_py_tab[slot].pinyin, nbytes, false);
}

#ifdef OUT_HEADER
/* Print `label`, then the `size` raw bytes found at file offset `offset`, then a line break. */
static void print_header_field(const char *label, long offset, size_t size)
{
    unsigned char field[0x800]; /* the largest header field is 0x800 bytes */

    if (size > sizeof field) {
        size = sizeof field;
    }
    put_ascii(label);
    if (fseek(in_file, offset, SEEK_SET) != 0 ||
        fread(field, 1, size, in_file) != size) {
        if (ferror(in_file)) {
            perror("fread");
        } else {
            fprintf(stderr, "Unexpected end of file in .scel header\n");
        }
        exit(1);
    }
    print_char2(field, (int)size, false);
    print_unicode_crlr();
}
#endif

/*
 * Decode the word records from startChinese to the end of the file.
 *
 * Each record is (same_pronounce_num, py_table_len, py_table[],
 * same_pronounce_num x {word_len, word, ext_len, ext}), all integers being
 * 16-bit little-endian. One output line is written per record:
 * "<pinyin> <pinyin> ;word,word".
 *
 * Decoding stops quietly at the first record whose same-sound count is 0 or
 * above 20, stops with an in-band message when py_table_len exceeds the
 * supported size, and stops with a warning on stderr when the file ends in
 * the middle of a record.
 */
static void dump_word_records(void)
{
    long file_size;
    int truncated = 0;

    if (fseek(in_file, 0L, SEEK_END) != 0 ||
        (file_size = ftell(in_file)) < 0 ||
        fseek(in_file, startChinese, SEEK_SET) != 0) {
        perror("fseek");
        exit(1);
    }
#ifdef DEBUG
    {
        char dbg[64];

        snprintf(dbg, sizeof dbg, "\r\nfile_size:%ld\r\n", file_size);
        put_ascii(dbg);
    }
#endif

    while (!truncated && ftell(in_file) < file_size) {
        unsigned char py_raw[150 * 2]; /* up to 150 pinyin indices per record */
        char msg[200];
        uint16_t same_pronounce_num;
        uint16_t py_table_len;
        unsigned int k;

#ifdef DEBUG
        {
            const long record_pos = ftell(in_file);

            snprintf(msg, sizeof msg, "\r\ncur_fptr:%ld(0x%08lx)\r\n",
                     record_pos, (unsigned long)record_pos);
            put_ascii(msg);
        }
#endif
        if (!read_u16le(&same_pronounce_num) || !read_u16le(&py_table_len)) {
            truncated = 1;
            break;
        }
        if (py_table_len > sizeof py_raw) {
            /* "\r\n" is spelled out: "\xd\xaError" would parse as \xd \xaE "rror". */
            snprintf(msg, sizeof msg,
                     "\r\nError, .scel file maybe corrupt: too big size of py_table:%d(0x%08x), at file:0x%lx\r\n",
                     (int)py_table_len, (unsigned int)py_table_len,
                     (unsigned long)ftell(in_file));
            put_ascii(msg);
            break;
        }
        if (fread(py_raw, 1, py_table_len, in_file) != py_table_len) {
            truncated = 1;
            break;
        }
        if (same_pronounce_num == 0 || same_pronounce_num > 20) {
#ifdef ERR_OUTPUT
            snprintf(msg, sizeof msg,
                     "\r\nError, improper SAME_PRONOUNCE_NUM item size:%d(0x%08x), at file:0x%lx\r\n",
                     (int)same_pronounce_num, (unsigned int)same_pronounce_num,
                     (unsigned long)ftell(in_file));
            put_ascii(msg);
#endif
            break;
        }
#ifdef DEBUG
        snprintf(msg, sizeof msg,
                 "same_pronounce_num:%d\r\npy_table_len:%d\r\n",
                 (int)same_pronounce_num, (int)py_table_len);
        put_ascii(msg);
#endif

        /* Pinyin string, e.g. "zuo you wei nan ". */
        for (k = 0; k + 1 < py_table_len; k += 2) {
            print_pinyin_entry(py_raw[k] | (py_raw[k + 1] << 8));
            print_unicode_space();
        }
        print_ascii_in_unicode(';');

        /* The words sharing that pronunciation, comma separated. */
        for (k = 0; k < same_pronounce_num; k++) {
            uint16_t word_len;
            uint16_t ext_len;

            if (!read_u16le(&word_len) || !copy_word_bytes(word_len)) {
                truncated = 1;
                break;
            }
            /* Skip the extension block using its own length field. */
            if (!read_u16le(&ext_len) ||
                fseek(in_file, (long)ext_len, SEEK_CUR) != 0) {
                truncated = 1;
                break;
            }
            if (k + 1 != same_pronounce_num) {
                print_ascii_in_unicode(',');
            }
        }
        print_unicode_crlr();
#ifdef DEBUG
        {
            const long end_pos = ftell(in_file);

            snprintf(msg, sizeof msg, "cur_fptr2:%ld(0x%08lx)\r\n",
                     end_pos, (unsigned long)end_pos);
            put_ascii(msg);
        }
#endif
    }

    if (truncated) {
        fprintf(stderr, "Warning: .scel file ends inside a word record\n");
    }
}

/*
 * Decode a Sogou .scel cell dictionary and write it to stdout as text with
 * two bytes per character, low byte first, preceded by the 0xFF 0xFE byte
 * order mark.
 *
 * argv[1] is the path of the .scel file. Returns 0 on success and 1 on a
 * usage error, an unreadable file, or a header that is not recognised.
 * All diagnostics are written to stderr so they cannot end up inside the
 * redirected output file.
 */
int main(int argc, char **argv)
{
    static const unsigned char header_magic[12] = {
        0x40, 0x15, 0x00, 0x00, 0x44, 0x43, 0x53, 0x01, 0x01, 0x00, 0x00, 0x00
    };
    const int py_capacity = (int)(sizeof cel_py_tab / sizeof cel_py_tab[0]);
    unsigned char header[12];
    unsigned char scel_type;
    int next_pos = 4; /* the table starts with a 4-byte count item */
    int i;

    if (argc < 2) {
        fprintf(stderr, "Usage: %s scel_file\n",
                argc > 0 ? argv[0] : "dis_sogo_cell");
        return 1;
    }
    in_file = fopen(argv[1], "rb");
    if (!in_file) {
        fprintf(stderr, "Can't open input file '%s', %s\n",
                argv[1], strerror(errno));
        return 1;
    }

    /* Byte 4 of the magic word varies by file type; normalise it before comparing. */
    if (fread(header, 1, sizeof header, in_file) != sizeof header) {
        fprintf(stderr, "Not a .scel file, quit!\n");
        fclose(in_file);
        return 1;
    }
    scel_type = header[4];
    header[4] = 0x44;
    if (memcmp(header, header_magic, sizeof header) != 0) {
        fprintf(stderr, "Not a .scel file, quit!\n");
        fclose(in_file);
        return 1;
    }
    switch (scel_type) {
    case 0x44: /* 'D' */
        startChinese = 0x2628;
        break;
    case 0x45: /* 'E' */
        startChinese = 0x26c4;
        break;
    default:
        fprintf(stderr, ".scel file corrupted, quit!\n");
        fclose(in_file);
        return 1;
    }

    /* UTF-16 little-endian byte order mark. */
    putchar(0xff);
    putchar(0xfe);

#ifdef OUT_HEADER
    print_header_field("Name: ", 0x130, 0x338 - 0x130);
    print_header_field("Type: ", 0x338, 0x540 - 0x338);
    print_header_field("Desc: ", 0x540, 0xd40 - 0x540);
    print_header_field("Smpl: ", 0xd40, 0x1540 - 0xd40);
    print_unicode_crlr();
#endif

    /* The first item of the pinyin table carries the entry count. */
    count_py = read_py_item(start_PY);
    if (count_py <= 0 || count_py > py_capacity) {
        /* Also protects the "% count_py" lookups from dividing by zero. */
        fprintf(stderr,
                "Unsupported pinyin table size %d (expected 1..%d), quit!\n",
                count_py, py_capacity);
        fclose(in_file);
        return 1;
    }
    for (i = 0; i < count_py; i++) {
        next_pos += read_py_item(start_PY + next_pos);
    }
#ifdef DEBUG
    for (i = 0; i < count_py; i++) {
        char line[32];

        snprintf(line, sizeof line, "%03d(0x%03x):", i, (unsigned int)i);
        put_ascii(line);
        print_py(i);
        print_unicode_crlr();
    }
#endif

    dump_word_records();

    fclose(in_file);
    return 0;
}