本文使用压缩trie树实现字符串检索的功能。首先将字符串通过编码转化为二进制串,随后将二进制串插入到trie树中,在插入过程中同时实现压缩的功能。
字符编码采用Huffman,但最终测试发现不采用Huffman的方法不仅省下了编码时间,同时trie树的插入时间也有所减少。
1 /** 2 程序主函数与编码 3 */ 4 #include <stdio.h> 5 #include <stdlib.h> 6 #include <string.h> 7 #include "huffman.h" 8 #include "compress_trie.h" 9 //#include <time.h> 10 11 #define NUM_OF_HUFFMAN 81 12 #define LENGTH_OF_LINE 10000 13 #define RESULT_OF_HUFFMAN "result_of_HUFFMAN.dat" 14 //#define EMAIL "strpool.dat" 15 //#define CHECKED_EMAIL "checkedemail.dat" 16 #define RESULT "result.dat" 17 18 void str_to_bin(char buf[],char binary[],huffman_node hufm[]); 19 20 21 int main(int argc, char *argv[]) 22 { 23 //time_t time_start,time_end; 24 //time_start = time(NULL); 25 26 char* EMAIL = argv[1]; 27 char* CHECKED_EMAIL = argv[2]; 28 29 huffman_node hufm[NUM_OF_HUFFMAN]; 30 hufm_init(hufm,NUM_OF_HUFFMAN); 31 char buf[LENGTH_OF_LINE]; 32 char binary[LENGTH_OF_LINE]; 33 34 FILE* fin_of_huffman; 35 fin_of_huffman = fopen(RESULT_OF_HUFFMAN,"r"); 36 if(fin_of_huffman == NULL) 37 { 38 hufm_init(hufm,NUM_OF_HUFFMAN); 39 int i; 40 for(i=0;i<(NUM_OF_HUFFMAN+1)/2;i++) 41 { 42 hufm[i].num_of_ch = NUM_OF_HUFFMAN - i; 43 } 44 huffman_coding(hufm,NUM_OF_HUFFMAN); 45 } 46 else 47 { 48 char temp_char; 49 int i; 50 for(i=0;i<(NUM_OF_HUFFMAN+1)/2;i++) 51 { 52 fgets(buf,sizeof(buf),fin_of_huffman); 53 sscanf(buf,"%c %d %s",&temp_char,&hufm[i].num_of_ch,hufm[i].code); 54 } 55 } 56 fclose(fin_of_huffman); 57 58 printf("building trie..."); 59 FILE* fin_of_email; 60 fin_of_email = fopen(EMAIL,"r"); 61 trie_node *root; 62 root = (trie_node*)malloc(sizeof(trie_node)); 63 trie_node_init(&root); 64 65 while(fgets(buf,sizeof(buf),fin_of_email)!=NULL) 66 { 67 str_to_bin(buf,binary,hufm); 68 trie_insert(&root,binary); 69 } 70 fclose(fin_of_email); 71 printf("\r"); 72 printf("build trie success.\n"); 73 74 FILE *fin_of_checked,*fout_of_result; 75 fin_of_checked = fopen(CHECKED_EMAIL,"r"); 76 fout_of_result = fopen(RESULT,"w"); 77 int num_yes = 0; 78 int num_no = 0; 79 while(fgets(buf,sizeof(buf),fin_of_checked)!=NULL) 80 { 81 str_to_bin(buf,binary,hufm); 82 if(trie_search(root,binary)) 83 { 84 fprintf(fout_of_result,"YES\n"); 85 num_yes++; 86 } 87 else 88 { 89 fprintf(fout_of_result,"NO\n"); 90 num_no++; 91 } 92 } 93 fprintf(fout_of_result,"num of YES is:%d\n",num_yes); 94 fprintf(fout_of_result,"num of NO is:%d\n",num_no); 95 printf("search success!\n"); 96 fclose(fin_of_checked); 97 fclose(fout_of_result); 98 //time_end = time(NULL); 99 //printf("用时:%.0lfs\n", difftime(time_end, time_start)); 100 return 0; 101 } 102 103 104 void str_to_bin(char buf[],char binary[],huffman_node hufm[]) 105 { 106 int i; 107 binary[0] = '\0'; 108 for(i=strlen(buf)-1;i>=0;i--) 109 { 110 if(buf[i]>='a' && buf[i]<='z') 111 { 112 strcat(binary,hufm[buf[i]-'a'].code); 113 } 114 else if(buf[i]>='A' && buf[i]<='Z') 115 { 116 strcat(binary,hufm[buf[i]-'A'].code); 117 } 118 else if(buf[i]>='0' && buf[i]<='9') 119 { 120 strcat(binary,hufm[26+buf[i]-'0'].code); 121 } 122 else if(buf[i]=='_') 123 { 124 strcat(binary,hufm[36].code); 125 } 126 else if(buf[i]=='-') 127 { 128 strcat(binary,hufm[37].code); 129 } 130 else if(buf[i]=='.') 131 { 132 strcat(binary,hufm[38].code); 133 } 134 else if(buf[i]=='@') 135 { 136 strcat(binary,hufm[39].code); 137 } 138 else 139 { 140 strcat(binary,hufm[40].code); 141 } 142 } 143 }
1 /** 2 完成trie树的插入,查找。 3 */ 4 5 typedef struct TRIE_NODE 6 { 7 char is_str; 8 unsigned short num_of_bit; 9 unsigned char* compress_of_bit; 10 struct TRIE_NODE *point_of_zero,*point_of_one; 11 }trie_node; 12 13 //long int temp_of_new = 0; 14 15 16 void trie_node_init(trie_node **root); 17 int trie_insert(trie_node **root,char* bit_of_insert); 18 int trie_search(trie_node *root,char* bit_of_insert); 19 void trie_delete(trie_node *root); 20 void compress(trie_node *root,char* bit_of_insert); 21 int compare_of_bit(trie_node *root,char* bit_of_insert); 22 void pop_bit(trie_node *root,char* bit_of_pop,int len_of_pop); 23 24 25 26 27 void trie_node_init(trie_node **root) 28 { 29 (*root)->is_str = (char)0; 30 (*root)->num_of_bit = 0; 31 (*root)->compress_of_bit = NULL; 32 (*root)->point_of_zero = NULL; 33 (*root)->point_of_one = NULL; 34 } 35 36 void compress(trie_node *root,char* bit_of_insert) 37 { 38 int i,j,len_of_insert; 39 len_of_insert = strlen(bit_of_insert); 40 root->num_of_bit = len_of_insert; 41 if(root->num_of_bit<=32) 42 { 43 int temp; 44 for(i=len_of_insert-1,j=0;i>=0;i--,j++) 45 { 46 if(bit_of_insert[i] == '0') 47 { 48 clearbit(temp,j); 49 } 50 else 51 { 52 setbit(temp,j); 53 } 54 } 55 root->compress_of_bit = (unsigned char*)temp; 56 } 57 else 58 { 59 root->compress_of_bit = (unsigned char*)malloc((len_of_insert%8)?(len_of_insert/8+1):(len_of_insert/8)); 60 for(i=len_of_insert-1,j=0;i>=0;i--,j++) 61 { 62 if(bit_of_insert[i] == '0') 63 { 64 clearbit(root->compress_of_bit[j/8],j%8); 65 } 66 else 67 { 68 setbit(root->compress_of_bit[j/8],j%8); 69 } 70 } 71 } 72 } 73 74 75 int trie_insert(trie_node **root,char* bit_of_insert) 76 { 77 int ret; 78 char bit_of_pop[10000]; 79 if(root == NULL) 80 { 81 ret = 0; 82 } 83 else 84 { 85 if((*root)->num_of_bit == 0) 86 { 87 if(!(*bit_of_insert)) 88 { 89 (*root)->is_str = (char)1; 90 ret = 1; 91 } 92 else 93 { 94 if((*root)->is_str == 0 95 && (*root)->point_of_zero == NULL 96 && (*root)->point_of_one == NULL) 97 { 98 compress((*root),bit_of_insert); 99 (*root)->is_str = (char)1; 100 ret = 1; 101 } 102 else 103 { 104 if(*bit_of_insert == '0') 105 { 106 if((*root)->point_of_zero == NULL) 107 { 108 (*root)->point_of_zero = (trie_node*)malloc(sizeof(trie_node)); 109 trie_node_init(&(*root)->point_of_zero); 110 //temp_of_new++; 111 } 112 ret = trie_insert(&(*root)->point_of_zero,bit_of_insert+1); 113 } 114 else 115 { 116 if((*root)->point_of_one == NULL) 117 { 118 (*root)->point_of_one = (trie_node*)malloc(sizeof(trie_node)); 119 trie_node_init(&(*root)->point_of_one); 120 //temp_of_new++; 121 } 122 ret = trie_insert(&(*root)->point_of_one,bit_of_insert+1); 123 } 124 } 125 } 126 } 127 else 128 { 129 int ans_of_compare = compare_of_bit((*root),bit_of_insert); 130 if(ans_of_compare == 0) 131 { 132 trie_node *father = (trie_node*)malloc(sizeof(trie_node)); 133 trie_node_init(&father); 134 //temp_of_new++; 135 pop_bit((*root),bit_of_pop,1); 136 if(bit_of_pop[0] == '0') 137 { 138 father->point_of_zero = (*root); 139 } 140 else 141 { 142 father->point_of_one = (*root); 143 } 144 if(!(*bit_of_insert)) 145 { 146 father->is_str = (char)1; 147 ret = 1; 148 } 149 else 150 { 151 if(*bit_of_insert == '0') 152 { 153 father->point_of_zero = (trie_node*)malloc(sizeof(trie_node)); 154 trie_node_init(&father->point_of_zero); 155 //temp_of_new++; 156 ret = trie_insert(&father->point_of_zero,bit_of_insert+1); 157 } 158 else 159 { 160 father->point_of_one = (trie_node*)malloc(sizeof(trie_node)); 161 trie_node_init(&father->point_of_one); 162 //temp_of_new++; 163 ret = trie_insert(&father->point_of_one,bit_of_insert+1); 164 } 165 } 166 (*root) = father; 167 } 168 else 169 { 170 if(ans_of_compare == (int)(*root)->num_of_bit 171 && ans_of_compare == strlen(bit_of_insert)) 172 { 173 (*root)->is_str = (char)1; 174 ret = 1; 175 } 176 else if(ans_of_compare == (int)(*root)->num_of_bit) 177 { 178 bit_of_insert += ans_of_compare; 179 if(*bit_of_insert == '0') 180 { 181 if((*root)->point_of_zero == NULL) 182 { 183 (*root)->point_of_zero = (trie_node*)malloc(sizeof(trie_node)); 184 trie_node_init(&(*root)->point_of_zero); 185 //temp_of_new++; 186 } 187 ret = trie_insert(&(*root)->point_of_zero,bit_of_insert+1); 188 } 189 else 190 { 191 if((*root)->point_of_one == NULL) 192 { 193 (*root)->point_of_one = (trie_node*)malloc(sizeof(trie_node)); 194 trie_node_init(&(*root)->point_of_one); 195 //temp_of_new++; 196 } 197 ret = trie_insert(&(*root)->point_of_one,bit_of_insert+1); 198 } 199 } 200 else if(ans_of_compare == strlen(bit_of_insert)) 201 { 202 trie_node *father = (trie_node*)malloc(sizeof(trie_node)); 203 trie_node_init(&father); 204 //temp_of_new++; 205 pop_bit((*root),bit_of_pop,ans_of_compare); 206 compress(father,bit_of_pop); 207 father->is_str = (char)1; 208 pop_bit((*root),bit_of_pop,1); 209 if(bit_of_pop[0] == '0') 210 { 211 father->point_of_zero = (*root); 212 } 213 else 214 { 215 father->point_of_one = (*root); 216 } 217 (*root) = father; 218 } 219 else 220 { 221 trie_node *father = (trie_node*)malloc(sizeof(trie_node)); 222 trie_node_init(&father); 223 //temp_of_new++; 224 pop_bit((*root),bit_of_pop,ans_of_compare); 225 compress(father,bit_of_pop); 226 pop_bit((*root),bit_of_pop,1); 227 bit_of_insert += ans_of_compare+1; 228 229 if(bit_of_pop[0] == '0') 230 { 231 father->point_of_zero = (*root); 232 father->point_of_one = (trie_node*)malloc(sizeof(trie_node)); 233 trie_node_init(&father->point_of_one); 234 //temp_of_new++; 235 ret = trie_insert(&father->point_of_one,bit_of_insert); 236 } 237 else 238 { 239 father->point_of_one = (*root); 240 father->point_of_zero = (trie_node*)malloc(sizeof(trie_node)); 241 trie_node_init(&father->point_of_zero); 242 //temp_of_new++; 243 ret = trie_insert(&father->point_of_zero,bit_of_insert); 244 } 245 (*root) = father; 246 } 247 } 248 } 249 } 250 return ret; 251 } 252 253 254 int trie_search(trie_node *root,char *bit_of_search) 255 { 256 trie_node *p = root; 257 while(p!=NULL && *bit_of_search) 258 { 259 if(p->num_of_bit!=0) 260 { 261 if((int)p->num_of_bit == compare_of_bit(p,bit_of_search)) 262 { 263 bit_of_search += (int)p->num_of_bit; 264 } 265 else 266 { 267 p=NULL; 268 break; 269 } 270 } 271 if(!(*bit_of_search)) 272 { 273 break; 274 } 275 if(bit_of_search[0]=='0') 276 { 277 p = p->point_of_zero; 278 bit_of_search++; 279 } 280 else if(bit_of_search[0]=='1') 281 { 282 p = p->point_of_one; 283 bit_of_search++; 284 } 285 if(!(*bit_of_search) && p!=NULL && p->num_of_bit!=0) 286 { 287 p=NULL; 288 break; 289 } 290 } 291 if(p!=NULL) 292 { 293 return p->is_str; 294 } 295 else 296 { 297 return 0; 298 } 299 } 300 301 302 void trie_delete(trie_node *root) 303 { 304 if(root == NULL) 305 return; 306 trie_delete(root->point_of_zero); 307 trie_delete(root->point_of_one); 308 free(root); 309 } 310 311 312 int compare_of_bit(trie_node *root,char* bit_of_insert) 313 { 314 int len_of_insert = strlen(bit_of_insert); 315 int i,j,tempbit; 316 if(root->num_of_bit<=32) 317 { 318 for(i=0,j=root->num_of_bit-1;i<len_of_insert && i<root->num_of_bit;i++,j--) 319 { 320 tempbit = getbit((int)root->compress_of_bit,j); 321 if(bit_of_insert[i]-'0' != tempbit) 322 { 323 break; 324 } 325 } 326 } 327 else 328 { 329 for(i=0,j=root->num_of_bit-1;i<len_of_insert && i<root->num_of_bit;i++,j--) 330 { 331 tempbit = getbit(root->compress_of_bit[j/8],j%8); 332 if(bit_of_insert[i]-'0' != tempbit) 333 { 334 break; 335 } 336 } 337 } 338 return i; 339 } 340 341 void pop_bit(trie_node *root,char* bit_of_pop,int len_of_pop) 342 { 343 int i,j; 344 short num_of_bit = root->num_of_bit - (short)len_of_pop; 345 if(root->num_of_bit<=32) 346 { 347 for(i=0,j=root->num_of_bit-1;i<len_of_pop;i++,j--) 348 { 349 bit_of_pop[i] = getbit((int)root->compress_of_bit,j) +'0'; 350 } 351 bit_of_pop[i] = '\0'; 352 } 353 else 354 { 355 for(i=0,j=root->num_of_bit-1;i<len_of_pop;i++,j--) 356 { 357 bit_of_pop[i] = getbit(root->compress_of_bit[j/8],j%8) +'0'; 358 } 359 bit_of_pop[i] = '\0'; 360 361 if(num_of_bit == 0) 362 { 363 free(root->compress_of_bit); 364 } 365 else if(num_of_bit<=32) 366 { 367 int temp; 368 for(j=num_of_bit-1;j>=0;j--) 369 { 370 if(getbit(root->compress_of_bit[j/8],j%8) == 0) 371 { 372 clearbit(temp,j); 373 } 374 else 375 { 376 setbit(temp,j); 377 } 378 } 379 free(root->compress_of_bit); 380 root->compress_of_bit = (unsigned char*)temp; 381 } 382 else 383 { 384 unsigned char *p; 385 short num_of_byte = (num_of_bit%8)?(num_of_bit/8+1):(num_of_bit/8); 386 if(((root->num_of_bit%8)?(root->num_of_bit/8+1):(root->num_of_bit/8)) != num_of_byte) 387 { 388 p = (unsigned char*)malloc(num_of_byte); 389 short i; 390 for(i=0;i<num_of_byte;i++) 391 { 392 p[i] = root->compress_of_bit[i]; 393 } 394 free(root->compress_of_bit); 395 root->compress_of_bit = p; 396 } 397 } 398 } 399 root->num_of_bit = num_of_bit; 400 }