制作特定书单词手册

前段时间,看Dan Simmons的海伯利安系列,但是到Endymion时,海峡两岸均未引进。没办法,就先下个Sample看,看了四章,觉得很好看。后来,又找了个pdf格式的全本,不认识的单词实在太多,就想制份单词表。下面是步骤:

1.先要从文件里提取出单词。这里有一个简单的C程序可以完成这个任务,代码如下:

 
/*
 * countWord.h
*/
                                                 
#ifndef _ZPZ_TRIE_H
#define _ZPZ_TRIE_H
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
                                                 
/* Callback mapping a character to the index of its child slot in a node. */
typedef int (*trie_char_to_index)(char c);
/* Callback mapping a child-slot index back to the character it stands for. */
typedef char (*trie_index_to_char)(int i);
                                                 
/*
 * One node of the trie.
 *
 * count - how many times a word ending exactly at this node was added
 *         (trie_add increments it on the last node of the word).
 * depth - number of characters on the path from the root (the root is 0).
 * list  - array of child pointers, one slot per character index; stays NULL
 *         until trie_add needs to attach a child.
 */
typedef struct _trie_node{
    int count;
    int depth;
    struct _trie_node ** list;
} trie_node;
                                                 
typedef struct _trie_tree{
    intchar_width;
    intmax_depth;
    trie_node * root;
    trie_char_to_index char_to_index;
    trie_index_to_char index_to_char;
} trie_tree;
                                                 
/*
 * Allocates a zero-initialised node and records its depth.
 *
 * Declared static inline because this definition lives in a header: a plain
 * `inline` definition provides no external definition in C99 and can fail to
 * link when the call is not inlined.
 *
 * Returns the new node, or NULL when the allocation fails. The caller owns
 * the node.
 */
static inline trie_node * trie_node_new(int depth){
    trie_node * node = calloc(1, sizeof *node);
    if(NULL != node){
        node->depth = depth;
    }
    return node;
}
                                                 
/*
 * Allocates a child table with `char_width` slots, all cleared to zero.
 *
 * Returns NULL when `char_width` is not positive or the allocation fails.
 * calloc also rejects a slot count whose byte size would overflow.
 */
static inline trie_node ** trie_node_alloc_children(int char_width){
    if(char_width <= 0){
        return NULL;
    }
    return calloc((size_t)char_width, sizeof(trie_node *));
}
                                                 
/*
 * Initialises `tree` as an empty trie.
 *
 * tree          - caller-provided storage; its previous contents are wiped.
 * char_width    - number of child slots per node.
 * char_to_index - character -> slot index callback.
 * index_to_char - slot index -> character callback.
 *
 * Returns `tree` on success, or NULL when `tree` is NULL or the root node
 * could not be allocated (tree->root is then NULL). A failed allocation of
 * the root's child table is not an error here: trie_add allocates child
 * tables on demand.
 *
 * Declared static inline because the definition lives in a header.
 */
static inline trie_tree * trie_init(
    trie_tree * tree,
    int char_width,
    trie_char_to_index char_to_index,
    trie_index_to_char index_to_char
){
    if(NULL == tree){
        return NULL;
    }
    memset(tree, 0, sizeof *tree);
    tree->char_to_index = char_to_index;
    tree->index_to_char = index_to_char;
    tree->char_width = char_width;
    tree->root = trie_node_new(0);
    if(NULL == tree->root){
        return NULL;
    }
    tree->root->list = trie_node_alloc_children(tree->char_width);
    return tree;
}
                                                 
/*
 * Adds one occurrence of the word word[0 .. len-1] to the trie.
 *
 * The word does not have to be NUL-terminated; exactly `len` characters are
 * consumed. Missing nodes and child tables are created along the way.
 *
 * Returns 1 on success. Returns 0 when the arguments are unusable (NULL tree,
 * uninitialised root, NULL word, len <= 0), when the character mapping yields
 * an index outside [0, char_width), or when an allocation fails. On failure
 * nodes created so far stay in the trie, but no count is changed.
 *
 * tree->max_depth is kept at (length of the longest added word - 1).
 */
int trie_add(trie_tree * tree, char* word, int len){
    char * p = word, * end;
    trie_node * trie;
    int index, depth = 0;

    if(NULL == tree || NULL == tree->root || NULL == word || len <= 0){
        return 0;
    }
    end = word + len;
    trie = tree->root;
    while(p < end){
        index = tree->char_to_index(*p);
        /* A misbehaving mapping must not index outside the child table. */
        if(index < 0 || index >= tree->char_width){
            return 0;
        }
        if(NULL == trie->list){
            trie->list = trie_node_alloc_children(tree->char_width);
        }
        if(NULL == trie->list){
            return 0;
        }
        depth++;
        if(NULL == trie->list[index]){
            trie->list[index] = trie_node_new(depth);
        }
        if(NULL == trie->list[index]){
            return 0;
        }
        trie = trie->list[index];
        p++;
    }
    if(depth > tree->max_depth + 1){
        tree->max_depth = depth - 1;
    }
    trie->count++;
    return 1;
}
                                                 
/*
 * Adds a NUL-terminated string: forwards to trie_add with strlen(word) as the
 * length. Note that `word` is expanded twice, so it should be free of side
 * effects.
 */
#define trie_add_ex(tree,word) trie_add((tree), (word), strlen(word))
                                                 
/*
 * Frees `node`, its child table and, recursively, every descendant.
 *
 * node       - subtree root; NULL is accepted and frees nothing.
 * char_width - number of slots in each child table.
 *
 * Returns the number of nodes that were freed. The original fell off the end
 * of a non-void function, so there was no defined return value.
 */
static int trie_node_release(trie_node * node,int char_width){
    int i;
    int released = 0;

    if(NULL == node){
        return 0;
    }
    if(NULL != node->list){
        for(i = 0; i < char_width; i++){
            released += trie_node_release(node->list[i], char_width);
        }
        free(node->list);
    }
    free(node);
    return released + 1;
}
                                                 
/*
 * Frees every node of `tree`. The trie_tree structure itself is not freed,
 * and tree->root is left dangling afterwards.
 *
 * The expansion is a single expression with no trailing semicolon or line
 * continuation, so `trie_release(t);` is safe inside if/else. `tree` is
 * expanded twice.
 */
#define trie_release(tree)  \
    trie_node_release((tree)->root, (tree)->char_width)
                                                 
/*
 * Looks up the node reached by following word[0 .. len-1] from the root.
 *
 * Exactly `len` characters are consumed, matching trie_add, so the word does
 * not have to be NUL-terminated. The original loop ignored `len` and ran
 * until a NUL byte.
 *
 * Returns the node at the end of the path, or NULL when the arguments are
 * unusable, the mapping yields an out-of-range index, or the path does not
 * exist. The returned node may have count == 0, meaning the characters are
 * only a prefix of added words.
 */
trie_node * trie_find(trie_tree * tree,char * word, int len){
    int index;
    char * p = word, * end;
    trie_node * trie;

    if(NULL == tree || NULL == tree->root || NULL == word || len <= 0){
        return NULL;
    }
    end = word + len;
    trie = tree->root;
    while(p < end){
        if(NULL == trie->list){
            return NULL;
        }
        index = tree->char_to_index(*p);
        if(index < 0 || index >= tree->char_width){
            return NULL;
        }
        if(NULL == trie->list[index]){
            return NULL;
        }
        trie = trie->list[index];
        p++;
    }
    return trie;
}
/*
 * Looks up a NUL-terminated string: forwards to trie_find with strlen(word)
 * as the length. `word` is expanded twice.
 */
#define trie_find_ex(tree,word) trie_find((tree),(word),strlen(word))
                                                 
/*
 * Prints the subtree rooted at `node` in slot order, one "<count> <word>"
 * line per node with a positive count.
 *
 * `word` is a scratch buffer holding the characters on the path to `node`.
 * Before descending into a child, its character is written at index
 * node->depth and a terminator at node->depth + 1, so the buffer must be
 * large enough for the deepest path plus the terminator.
 */
static inline void trie_node_dump(
    trie_tree * tree,
    trie_node * node,
    char* word
){
    int slot;

    if(0 < node->count){
        printf("%-8d %s\n", node->count, word);
    }
    if(NULL == node->list){
        return;
    }
    for(slot = 0; slot < tree->char_width; ++slot){
        trie_node * child = node->list[slot];
        if(NULL == child){
            continue;
        }
        word[node->depth] = tree->index_to_char(slot);
        word[node->depth + 1] = '\0';
        trie_node_dump(tree, child, word);
    }
}
                                                 
/*
 * Prints every stored word with its count to stdout.
 *
 * Does nothing when `tree` is NULL, has no root, or the scratch buffer cannot
 * be allocated.
 */
void trie_dump(trie_tree * tree){
    char * word;

    if(NULL == tree || NULL == tree->root){
        return;
    }
    /*
     * trie_add keeps max_depth at (longest word length - 1). The dump writes
     * one character per level plus a terminating NUL, so it needs
     * max_depth + 2 bytes. The former max_depth + 1 overflowed by one byte on
     * the longest word.
     */
    word = calloc((size_t)tree->max_depth + 2, 1);
    if(NULL == word){
        return;
    }
    trie_node_dump(tree, tree->root, word);
    free(word);
}
                                                 
/*
 * Maps an alphanumeric character to its trie slot:
 *   '0'..'9' -> 0..9, 'a'..'z' -> 10..35, 'A'..'Z' -> 36..61.
 * Every other character shares the catch-all slot 62.
 *
 * Relies on letters being contiguous in the execution character set, which
 * holds for ASCII.
 */
static inline int char_to_index(char c){
    if('0' <= c && c <= '9'){
        return c - '0';
    }
    else if('a' <= c && c <= 'z'){
        return 10 + c - 'a';
    }
    else if('A' <= c && c <= 'Z'){
        return 10 + 26 + c - 'A';
    }
    return 10 + 26 + 26;
}
                                                 
/*
 * Maps a trie slot back to its character:
 *   0..9 -> '0'..'9', 10..35 -> 'a'..'z', 36..61 -> 'A'..'Z'.
 * Any other index, including negative ones, yields a space.
 */
static inline char index_to_char(int index){
    const int digits = 10;
    const int letters = 26;

    if(index < 0){
        return ' ';
    }
    if(index < digits){
        return '0' + index;
    }
    index -= digits;
    if(index < letters){
        return 'a' + index;
    }
    index -= letters;
    if(index < letters){
        return 'A' + index;
    }
    return ' ';
}
                                                 
/*
 * Initialises `tree` for the alphanumeric alphabet: 10 digits, 26 lower-case
 * and 26 upper-case letters, plus one catch-all slot for anything else.
 *
 * The return type is spelled out as int (it was implicit, which C99 removed).
 * Returns 1 when the tree is ready for use, 0 when `tree` is NULL or the root
 * node could not be allocated.
 *
 * Declared static inline because the definition lives in a header.
 */
static inline int trie_init_alnum(trie_tree * tree){
    if(NULL == tree){
        return 0;
    }
    trie_init(tree, 26 + 26 + 10 + 1, char_to_index, index_to_char);
    return (NULL != tree->root) ? 1 : 0;
}
                                                 
#endif

然后是一个具体实现代码:


 
/*
 * =====================================================================================
 *
 *       Filename:  countWord.c
 *
 *    Description:
 *
 *        Version:  1.0
 *        Created:  2013年01月04日 07时41分15秒
 *       Revision:  none
 *       Compiler:  gcc
 *
 *         Author:  YOUR NAME (),
 *        Company:
 *
 * =====================================================================================
 */
#include "countWord.h"
                                              
/*
 * Callback invoked once per word found by find_word_in_buffer. `start` points
 * at the first character of the word and `end` one past its last character;
 * the range lies inside the scanned buffer and is not NUL-terminated.
 */
typedef void (*word_handler)(char* start, char* end);
                                              
/*
 * True when `c` is in 'a'..'z', '0'..'9' or 'A'..'Z'. The argument is expanded
 * several times, so it should be free of side effects.
 */
#define is_digit_or_letter(c) (((c) >='a' && (c) <= 'z') || ((c) >= '0' && (c) <= '9') || ((c) >= 'A' && (c) <= 'Z'))
                                              
/*
 * Scans `len` bytes starting at `start` and reports every maximal run of
 * letters/digits to `handler` as the half-open range [first, one-past-last).
 * A run still open when the buffer ends is reported as well.
 */
void find_word_in_buffer(
    char* start,
    int len,
    word_handler handler
){
    char * cursor;
    char * limit = start + len;
    char * word_begin = NULL;   /* NULL while the cursor is outside a word */

    for(cursor = start; cursor < limit; ++cursor){
        if(is_digit_or_letter(*cursor)){
            if(NULL == word_begin){
                word_begin = cursor;
            }
        }
        else if(NULL != word_begin){
            handler(word_begin, cursor);
            word_begin = NULL;
        }
    }
    if(NULL != word_begin && word_begin < cursor){
        handler(word_begin, cursor);
    }
}
                                              
/*
 * Reads the whole of `file` into memory and reports every word in it to
 * `handler`.
 *
 * The buffer is released before returning, so the pointers passed to
 * `handler` are only valid for the duration of each callback.
 *
 * Returns silently when the arguments are NULL, the size cannot be determined,
 * the file is empty, or the size does not fit the int length taken by
 * find_word_in_buffer. An allocation failure is reported on stderr, because
 * stdout carries the word list.
 */
void count_words_of_file(
    FILE* file,
    word_handler handler
){
    long file_size;
    size_t bytes_read;
    char * buffer;

    if(NULL == file || NULL == handler){
        return;
    }
    if(0 != fseek(file, 0, SEEK_END)){
        return;
    }
    file_size = ftell(file);
    rewind(file);
    /* ftell reports -1 on failure; an empty file has nothing to scan. */
    if(file_size <= 0){
        return;
    }
    /* find_word_in_buffer takes an int length: refuse sizes that do not fit. */
    if((long)(int)file_size != file_size){
        return;
    }

    buffer = malloc((size_t)file_size);
    if(NULL == buffer){
        fprintf(stderr, "Alloc failed");
        return;
    }

    /*
     * Read byte-wise and scan only what was actually read: a short read (I/O
     * error, or newline translation in text mode) would otherwise leave
     * uninitialised bytes at the end of the buffer.
     */
    bytes_read = fread(buffer, 1, (size_t)file_size, file);
    find_word_in_buffer(buffer, (int)bytes_read, handler);
    free(buffer);
}
                                              
/* Trie that accumulates the words reported through count_words_handler. */
static trie_tree tree;

/* word_handler callback: adds the word [begin, end) to the file-scope trie. */
void count_words_handler(char* begin, char* end){
    int length = end - begin;

    trie_add(&tree, begin, length);
}
                                              
/*
 * Usage: program <file>
 *
 * Counts the words of <file> and prints one "<count> <word>" line per distinct
 * word to stdout. Diagnostics go to stderr so they cannot end up in a
 * redirected word list.
 *
 * Exit status: 0 on success, 1 when the file argument is missing, 2 when the
 * file cannot be opened, 3 when the trie cannot be initialised.
 */
int main(int argc, char ** argv){
    if(argc < 2){
        fprintf(stderr, "parameter 2 must be the file name.\n");
        return 1;
    }
    FILE* file = fopen(argv[1],"r");
    if(NULL == file){
        fprintf(stderr, "File %s does not exist or cannot be opened.\n", argv[1]);
        return 2;
    }
    trie_init_alnum(&tree);
    /* The root is NULL when its allocation failed during initialisation. */
    if(NULL == tree.root){
        fprintf(stderr, "Alloc failed\n");
        fclose(file);
        return 3;
    }
    count_words_of_file(file, count_words_handler);
    fclose(file);
    trie_dump(&tree);
    trie_release(&tree);
    return 0;
}

然后编译就可以用了。

2.获得书中的单词

./a.out Endmion.txt > wordsOfEndmion.csv

其中a.out是编译的可执行程序,Endmion.txt是小说文件,而wordsOfEndmion.csv是小说中所有出现的单词


3.排个序

 
sort -n wordsOfEndmion.csv > wordsOfEndmion_sort.csv

排序的作用是把那些高频词筛选出来,因为那些高频词都是认识的。我是把频率60以上的基本都删除了。其余的就是一些出现频率小的词,里面有太多不认识的。我们的目标就是这些词。

4.整理词典文件

现在这个词典文件的基本格式是 "1 abort"这样的,为了获得翻译结果,需要把前面的这个频率数字去除。用任何方法均可,我是用的Gvim中的块操作。操作完成后,文件的格式就是每行一个单词了。

5.获得单词释义

在Linux中,有一个命令行词典非常好用,其名字叫 sdcv ,是stardict的一个命令行实现。我们要获得释义,就靠它了。写一个Shell脚本:

 
#!/bin/bash
# name: getMean.sh
# Usage: $0 < wordlist
# Reads one word per line from stdin and appends each word, followed by its
# sdcv dictionary entry, to the file "$0.mean".
while IFS= read -r line
do
        echo "$line" >>"$0.mean"   # write the line that was read
        # stdin is redirected so sdcv cannot consume the rest of the word list
        sdcv -n "$line" </dev/null >>"$0.mean"
done

exit 0


非常简单,就是读取每一行,然后把单词和释义写入到另一个文件中。用法示例:

 
./getMean.sh < wordsOfEndmion_sort.csv

把前一步整理好的文件用 < 符号传送给脚本文件即可。会在脚本目录生成一个名为"getMean.sh.mean"的文件,其格式类似于:


Soya
Found 1 items, similar to Soya.
-->朗道英汉字典5.0
-->soya

【医】 大豆

Aenea
Found 10 items, similar to Aenea.
-->朗道英汉字典5.0
-->aedea

【医】 外生殖器

-->朗道英汉字典5.0
-->Aegean

*[i:'dʒi:әn]
a. 爱琴海的
n. 爱琴海


好,大功告成。现在,把其扩展名修改为txt,然后发送给Amazon,一份专属于Endymion的词典就制作成功了。可以随时在Kindle上观看了。

你可能感兴趣的:(life)