这些代码已经通过了压力测试(stress test),但是因为 set_list 函数还有待改进,所以 insert_word 仍然会占用很长的时间,之后会进一步改进。算法来源于论文《An Efficient Implementation of Trie Structures》,作者貌似是日本人。先贴上代码,供喜欢 double array trie 的人研究。我看了 libdatrie 的源码,它比这个实现复杂,但原理是一样的。近期会给出原文章的翻译。http://blog.csdn.net/zzran/article/details/8462002
// Double-array trie: a BASE/CHECK transition table (BC) plus a TAIL buffer
// that stores the unshared suffix of every key.
//
// Conventions used throughout this file:
//   * Node n keeps BASE in BC[2 * n] and CHECK in BC[2 * n + 1].
//   * The child of node s on byte c lives at t = base(s) + c and is valid
//     only when check(t) == s.  A slot is free when its CHECK is 0.
//   * A leaf has a negative BASE; -base(t) is the offset in TAIL where the
//     rest of the key is stored.
//   * Every stored key is terminated by '#'.
//   * Node 1 is the root.
//
// Every helper writes a trace line to std::cout.
//
// Define DATRIE_NO_MAIN to compile the trie without the demo main(), e.g. to
// link it into another program or a test harness.
#include <cstdio>
#include <cstdlib>
#include <cstring>
#include <iostream>
#include <string>
using namespace std;

#define MIN_CODE 1    // smallest arc code (byte value) considered by set_list
#define MAX_CODE 255  // largest arc code (byte value)
#define BC_INC 10     // growth step of BC, in nodes
#define TAIL_INC 10   // growth step of TAIL, in bytes
#define TEMP_INC 5    // initial size and growth step of TEMP, in bytes
#define CHAR_NUM 26

int *BC;       // interleaved BASE/CHECK pairs
char *TAIL;    // '#'-terminated key suffixes
char *TEMP;    // scratch buffer filled by read_tail()
int BC_POS;    // highest node index written so far
int TAIL_POS;  // offset in TAIL where the next new suffix is written
int BC_MAX;    // capacity of BC, in nodes
int TAIL_MAX;  // capacity of TAIL, in bytes
int TEMP_MAX;  // capacity of TEMP, in bytes

void realloc_bc();
void separate(int s, char *b, int tail_pos);
int change_bc(int current, int s, char *list, char ch);

// Returns BASE of node n, or 0 when n lies outside the written range.
int base(int n) {
    // Negative indices can arise from base(leaf) + code arithmetic; treat
    // them like any other unused slot instead of reading before the array.
    if (n < 0 || n > BC_POS) {
        return 0;
    }
    cout << "read base index=" << n << ":value=" << BC[2 * n] << endl;
    return BC[2 * n];
}

// Returns CHECK of node n, or 0 (free) when n lies outside the written range.
int check(int n) {
    if (n < 0 || n > BC_POS) {
        return 0;
    }
    cout << "read check index=" << n << ":value=" << BC[2 * n + 1] << endl;
    return BC[2 * n + 1];
}

// Stores `node` as BASE of node n, growing BC and raising BC_POS as needed.
void w_base(int n, int node) {
    while (n >= BC_MAX) {
        realloc_bc();
    }
    if (n > BC_POS) {
        BC_POS = n;
    }
    BC[2 * n] = node;
    cout << "write base index=" << n << ":value=" << BC[2 * n] << endl;
}

// Stores `node` as CHECK of node n, growing BC and raising BC_POS as needed.
void w_check(int n, int node) {
    while (n >= BC_MAX) {
        realloc_bc();
    }
    if (n > BC_POS) {
        BC_POS = n;
    }
    BC[2 * n + 1] = node;
    cout << "write check index=" << n << ":value=" << BC[2 * n + 1] << endl;
}

// Allocates a zero-filled character buffer of `init` bytes and records that
// size in *max.  `area_name` is only used in the error message.  Terminates
// the program when the allocation fails.
char *mem_str(const char *area_name, int *max, int init) {
    *max = init;
    char *area = static_cast<char *>(malloc(sizeof(char) * (*max)));
    if (area == NULL) {
        cout << area_name << " malloc error!" << endl;
        exit(-1);
    }
    // memset takes (dest, value, count); the arguments used to be swapped,
    // which left the buffer uninitialised.
    memset(area, '\0', *max);
    return area;
}

// Maps a lower-case letter to a compact arc index ('a' -> 2).
int arc_index(char ch) {
    return ch - 'a' + 2;
}

// Grows BC by BC_INC nodes and zero-fills the new part.  Terminates the
// program when the allocation fails.
void realloc_bc() {
    const int pre_bc = BC_MAX;
    const int new_max = pre_bc + BC_INC;
    // Keep the old pointer until realloc has succeeded.
    int *grown = static_cast<int *>(realloc(BC, sizeof(int) * 2 * new_max));
    if (grown == NULL) {
        cout << "realloc bc error!" << endl;
        exit(-1);
    }
    BC = grown;
    BC_MAX = new_max;
    for (int i = 2 * pre_bc; i < 2 * BC_MAX; i++) {
        BC[i] = 0;
    }
    cout << "realloc bc!" << endl;
}

// Grows `area` by `inc` bytes, zero-fills the new part, updates *max and
// returns the (possibly moved) buffer.  Terminates the program on failure.
char *realloc_str(const char *area_name, char *area, int *max, int inc) {
    const int pre_size = *max;
    *max += inc;
    area = static_cast<char *>(realloc(area, sizeof(char) * (*max)));
    if (area == NULL) {
        cout << area_name << " realloc error!" << endl;
        exit(-1);
    }
    for (int i = pre_size; i < *max; i++) {
        area[i] = '\0';
    }
    cout << area_name << " realloc ok!" << endl;
    return area;
}

// Copies the suffix stored at TAIL[p], including its terminating '#', into
// TEMP as a NUL-terminated string.  TEMP is grown on demand, so the global
// TEMP pointer may change.
void read_tail(int p) {
    int i = 0;
    while (p < TAIL_MAX && TAIL[p] != '#') {
        // Keep room for this character plus the trailing '#' and NUL.
        while (i + 2 >= TEMP_MAX) {
            TEMP = realloc_str("TEMP", TEMP, &TEMP_MAX, TEMP_INC);
        }
        TEMP[i++] = TAIL[p++];
    }
    TEMP[i++] = '#';
    TEMP[i] = '\0';
    cout << "read tail!" << endl;
}

// Writes the NUL-terminated string `temp` (without its NUL) at TAIL[p],
// growing TAIL as needed, and advances TAIL_POS when the write reaches or
// passes the current end of TAIL.
void write_tail(char *temp, int p) {
    const int length = static_cast<int>(strlen(temp));
    while (p + length >= TAIL_MAX - 1) {
        TAIL = realloc_str("TAIL", TAIL, &TAIL_MAX, TAIL_INC);
    }
    memcpy(TAIL + p, temp, length);
    if (p + length + 1 > TAIL_POS) {
        TAIL_POS = p + length;
    }
    cout << "write tail!" << endl;
}

// Returns the smallest BASE value >= 1 such that, for every byte c in the
// NUL-terminated `list`, the slot BASE + c is free (CHECK == 0).  Returns 1
// for an empty list.
int x_check(char *list) {
    int base_pos = 1;
    int i = 0;
    cout << "x_check start:" << endl;
    while (list[i] != '\0') {
        if (check(base_pos + static_cast<unsigned char>(list[i])) != 0) {
            // Slot taken: try the next base and re-test every code.
            base_pos++;
            i = 0;
        } else {
            i++;
        }
    }
    cout << "x_check end!" << endl;
    return base_pos;
}

// Returns a newly malloc'ed, NUL-terminated list of the arc codes
// (MIN_CODE..MAX_CODE) that lead from node s to an existing child.  The
// caller owns the buffer and must free() it.
char *set_list(int s) {
    // Up to MAX_CODE codes plus the terminating NUL.
    char *list = static_cast<char *>(malloc(MAX_CODE + 1 + 1));
    if (list == NULL) {
        cout << "set_list malloc error!" << endl;
        exit(-1);
    }
    const int s_base = base(s);  // loop-invariant
    int j = 0;
    // MAX_CODE itself is a valid arc code and must be included.
    for (int i = MIN_CODE; i <= MAX_CODE; i++) {
        if (check(s_base + i) == s) {
            list[j++] = static_cast<char>(i);
        }
    }
    list[j] = '\0';
    cout << "set_list:" << list << endl;
    return list;
}

// Creates the leaf child of node s for the arc *b and stores the remainder
// of b (everything after its first character) in TAIL at tail_pos.
void separate(int s, char *b, int tail_pos) {
    const int t = base(s) + static_cast<unsigned char>(*b);
    b++;
    w_check(t, s);
    w_base(t, -tail_pos);
    write_tail(b, tail_pos);
}

// Adds the arc *b below node s and stores the rest of b in TAIL.  When the
// target slot is already owned by another node, the children of whichever
// of the two competing parents has fewer arcs are relocated first.
void bc_insert(int s, char *b) {
    cout << "bc_insert start:" << endl;
    const int t = base(s) + static_cast<unsigned char>(*b);
    cout << "t=" << t << " check(t)=" << check(t) << endl;
    if (check(t) != 0) {
        char *list_s = set_list(s);
        char *list_t = set_list(check(t));
        if (strlen(list_s) + 1 < strlen(list_t)) {
            cout << "list_s=" << list_s << endl;
            s = change_bc(s, s, list_s, *b);
        } else {
            cout << "list_t=" << list_t << endl;
            s = change_bc(s, check(t), list_t, '\0');
        }
        // set_list hands over ownership of its buffers.
        free(list_s);
        free(list_t);
    }
    separate(s, b, TAIL_POS);
    cout << "bc_insert end." << endl;
}

// Moves every child of node s listed in `list` to a new BASE that also has
// a free slot for `ch` (when ch != '\0'), re-parents the grandchildren and
// clears the old slots.
//
// current: node the caller is working on; if it is one of the moved
//          children, its new index is returned, otherwise `current` itself.
// list:    NUL-terminated arc codes of s's children (at most MAX_CODE).
int change_bc(int current, int s, char *list, char ch) {
    char a_list[MAX_CODE + 2];
    const int old_base = base(s);

    strcpy(a_list, list);
    if (ch != '\0') {
        const size_t used = strlen(a_list);
        a_list[used] = ch;
        a_list[used + 1] = '\0';
    }
    w_base(s, x_check(a_list));
    const int new_base = base(s);

    // A plain while-loop: with an empty list there is nothing to move, and
    // touching slot base + 0 would corrupt the table.
    for (; *list != '\0'; ++list) {
        const int code = static_cast<unsigned char>(*list);
        const int old_node = old_base + code;
        const int new_node = new_base + code;
        const int old_node_base = base(old_node);
        cout << "old_node=" << old_node << ",new_node=" << new_node << endl;
        w_base(new_node, old_node_base);
        w_check(new_node, s);
        if (old_node_base > 0) {
            // Children of old_node can only sit at old_node_base + 1 ..
            // old_node_base + MAX_CODE and never beyond BC_POS.
            for (int k = old_node_base + 1;
                 k - old_node_base <= MAX_CODE && k <= BC_POS; ++k) {
                if (check(k) == old_node) {
                    w_check(k, new_node);
                }
            }
        }
        if (current != s && old_node == current) {
            current = new_node;
        }
        w_base(old_node, 0);
        w_check(old_node, 0);
    }
    return current;
}

// Splits leaf s, whose stored suffix is `a`, so that the new suffix `b` can
// be stored next to it: the common prefix of a and b becomes a chain of
// single-child nodes, followed by two leaves for the differing remainders.
// Both strings are expected to end in '#'.  If no differing character exists
// before either string ends, the trie is left unchanged.
void tail_insert(int s, char *a, char *b) {
    char list[3];
    int length = 0;
    const int old_tail_pos = -base(s);
    cout << "tail_insert:" << "s=" << s << "a=" << a << " b=" << b << endl;

    while (a[length] != '\0' && a[length] == b[length]) {
        length++;
    }
    if (a[length] == '\0' || b[length] == '\0') {
        // Cannot branch on the string terminator.
        return;
    }

    for (int i = 0; i < length; i++) {
        const unsigned char ch = static_cast<unsigned char>(a[i]);
        list[0] = static_cast<char>(ch);
        list[1] = '\0';
        w_base(s, x_check(list));
        const int t = base(s) + ch;
        w_check(t, s);
        s = t;
    }

    list[0] = a[length];
    list[1] = b[length];
    list[2] = '\0';
    w_base(s, x_check(list));
    separate(s, a + length, old_tail_pos);  // reuse the old TAIL position
    separate(s, b + length, TAIL_POS);
}

// Looks up p_word, which must already end in '#'.
// Returns the index of the leaf node that represents the word, or -1 when
// the word is not stored.
int search_word(char *p_word) {
    int h = -1;
    int s = 1;
    int t = 0;
    bool at_leaf = false;
    cout << "begin-search word: " << p_word << endl;
    do {
        ++h;
        t = base(s) + static_cast<unsigned char>(p_word[h]);
        if (check(t) != s) {
            cout << "end-search word:" << p_word << endl;
            return -1;
        }
        if (base(t) < 0) {
            at_leaf = true;
            break;
        }
        s = t;
    } while (p_word[h] != '\0');

    if (!at_leaf) {
        // Ran out of input without reaching a leaf: there is no tail to read.
        cout << "end-search word: " << p_word << endl;
        return -1;
    }
    if (p_word[h] != '#') {
        read_tail(-base(t));
    }
    if (p_word[h] == '#' || strcmp(TEMP, p_word + h + 1) == 0) {
        cout << "end-search word: " << p_word << endl;
        return t;
    }
    cout << "end-search word: " << p_word << endl;
    return -1;
}

// Removes p_word (which must already end in '#') by clearing its leaf node.
// Returns 1 when the word was found and removed, 0 otherwise.
int delete_word(char *p_word) {
    const int t = search_word(p_word);
    if (t == -1) {  // comparison, not assignment
        return 0;
    }
    w_base(t, 0);
    w_check(t, 0);
    return 1;
}

// Appends '#' to p_word IN PLACE and inserts the result into the trie.
// The caller's buffer must have room for one extra character.
// Always returns 1.
int insert_word(char *p_word) {
    int h = -1;
    int s = 1;
    int t = 0;
    bool at_leaf = false;
    cout << "begin-insert word :" << p_word << endl;
    strcat(p_word, "#");
    do {
        ++h;
        t = base(s) + static_cast<unsigned char>(p_word[h]);
        if (check(t) != s) {
            cout << "s=" << s << ",t=" << t << ",check(t)=" << check(t) << endl;
            bc_insert(s, p_word + h);
            cout << "end-insert word:" << p_word << endl;
            return 1;
        }
        if (base(t) < 0) {
            at_leaf = true;
            break;
        }
        s = t;
    } while (p_word[h] != '\0');

    if (!at_leaf) {
        // No leaf reached, hence no tail to compare or split.
        return 1;
    }
    if (p_word[h] != '#') {
        read_tail(-base(t));
    }
    if (p_word[h] == '#' || strcmp(TEMP, p_word + h + 1) == 0) {
        return 1;  // already stored
    }
    tail_insert(t, TEMP, p_word + h + 1);
    cout << "end-insert word :" << p_word << endl;
    return 1;
}

// (Re)creates an empty trie: releases any previous buffers, allocates BC,
// TAIL and TEMP, and sets up the root (node 1, BASE 1).
void initialize() {
    free(BC);
    free(TAIL);
    free(TEMP);
    BC = NULL;
    TAIL = NULL;
    TEMP = NULL;

    BC_MAX = BC_INC;
    BC_POS = 1;
    TAIL_POS = 1;
    BC = static_cast<int *>(malloc(sizeof(int) * 2 * BC_MAX));
    if (BC == NULL) {
        cout << "BC malloc error!" << endl;
        exit(-1);
    }
    for (int i = 0; i < 2 * BC_MAX; i++) {
        BC[i] = 0;
    }
    w_base(1, 1);
    BC_POS = 1;
    TAIL = mem_str("TAIL", &TAIL_MAX, TAIL_INC);
    TAIL[0] = '#';
    TEMP = mem_str("TEMP", &TEMP_MAX, TEMP_INC);
}

#ifndef DATRIE_NO_MAIN
// Demo: inserts every whitespace-separated word of key_words.txt, then
// looks up "Beijing".
int main() {
    char word[30] = {'\0'};
    initialize();
    FILE *key_file = fopen("key_words.txt", "r");
    if (key_file == NULL) {
        cout << "open key file error!" << endl;
        return 1;
    }
    // At most 28 characters per word: insert_word appends '#', and the NUL
    // terminator needs the last byte of the 30-byte buffer.
    while (fscanf(key_file, "%28s", word) == 1) {
        insert_word(word);
        cout << endl;
    }
    fclose(key_file);
    strcpy(word, "Beijing#");
    if (search_word(word) > 0) {
        printf("find word!\n");
    } else {
        printf("not find word!\n");
    }
    return 0;
}
#endif  // DATRIE_NO_MAIN