大规模字符串检索-压缩trie树

本文使用压缩trie树实现字符串检索的功能。首先将字符串通过编码转化为二进制串,随后将二进制串插入到trie树中,在插入过程中同时实现压缩的功能。

字符编码采用Huffman,但最终测试发现不采用Huffman的方法不仅省下了编码时间,同时trie树的插入时间也有所减少。

  1 /**

  2     程序主函数与编码

  3 */

  4 #include <stdio.h>

  5 #include <stdlib.h>

  6 #include <string.h>

  7 #include "huffman.h"

  8 #include "compress_trie.h"

  9 //#include <time.h>

 10 

 11 #define NUM_OF_HUFFMAN 81

 12 #define LENGTH_OF_LINE 10000

 13 #define RESULT_OF_HUFFMAN "result_of_HUFFMAN.dat"

 14 //#define EMAIL "strpool.dat"

 15 //#define CHECKED_EMAIL "checkedemail.dat"

 16 #define RESULT "result.dat"

 17 

 18 void str_to_bin(char buf[],char binary[],huffman_node hufm[]);

 19 

 20 

 21 int main(int argc, char *argv[])

 22 {

 23     //time_t time_start,time_end;

 24     //time_start = time(NULL);

 25 

 26     char* EMAIL = argv[1];

 27     char* CHECKED_EMAIL = argv[2];

 28 

 29     huffman_node hufm[NUM_OF_HUFFMAN];

 30     hufm_init(hufm,NUM_OF_HUFFMAN);

 31     char buf[LENGTH_OF_LINE];

 32     char binary[LENGTH_OF_LINE];

 33 

 34     FILE* fin_of_huffman;

 35     fin_of_huffman = fopen(RESULT_OF_HUFFMAN,"r");

 36     if(fin_of_huffman == NULL)

 37     {

 38         hufm_init(hufm,NUM_OF_HUFFMAN);

 39         int i;

 40         for(i=0;i<(NUM_OF_HUFFMAN+1)/2;i++)

 41         {

 42             hufm[i].num_of_ch = NUM_OF_HUFFMAN - i;

 43         }

 44         huffman_coding(hufm,NUM_OF_HUFFMAN);

 45     }

 46     else

 47     {

 48         char temp_char;

 49         int i;

 50         for(i=0;i<(NUM_OF_HUFFMAN+1)/2;i++)

 51         {

 52             fgets(buf,sizeof(buf),fin_of_huffman);

 53             sscanf(buf,"%c %d %s",&temp_char,&hufm[i].num_of_ch,hufm[i].code);

 54         }

 55     }

 56     fclose(fin_of_huffman);

 57 

 58     printf("building trie...");

 59     FILE* fin_of_email;

 60     fin_of_email = fopen(EMAIL,"r");

 61     trie_node *root;

 62     root = (trie_node*)malloc(sizeof(trie_node));

 63     trie_node_init(&root);

 64 

 65     while(fgets(buf,sizeof(buf),fin_of_email)!=NULL)

 66     {

 67         str_to_bin(buf,binary,hufm);

 68         trie_insert(&root,binary);

 69     }

 70     fclose(fin_of_email);

 71     printf("\r");

 72     printf("build trie success.\n");

 73 

 74     FILE *fin_of_checked,*fout_of_result;

 75     fin_of_checked = fopen(CHECKED_EMAIL,"r");

 76     fout_of_result = fopen(RESULT,"w");

 77     int num_yes = 0;

 78     int num_no = 0;

 79     while(fgets(buf,sizeof(buf),fin_of_checked)!=NULL)

 80     {

 81         str_to_bin(buf,binary,hufm);

 82         if(trie_search(root,binary))

 83         {

 84             fprintf(fout_of_result,"YES\n");

 85             num_yes++;

 86         }

 87         else

 88         {

 89             fprintf(fout_of_result,"NO\n");

 90             num_no++;

 91         }

 92     }

 93     fprintf(fout_of_result,"num of YES is:%d\n",num_yes);

 94     fprintf(fout_of_result,"num of NO is:%d\n",num_no);

 95     printf("search success!\n");

 96     fclose(fin_of_checked);

 97     fclose(fout_of_result);

 98     //time_end = time(NULL);

 99     //printf("用时:%.0lfs\n", difftime(time_end, time_start));

100     return 0;

101 }

102 

103 

104 void str_to_bin(char buf[],char binary[],huffman_node hufm[])

105 {

106     int i;

107     binary[0] = '\0';

108     for(i=strlen(buf)-1;i>=0;i--)

109     {

110         if(buf[i]>='a' && buf[i]<='z')

111         {

112             strcat(binary,hufm[buf[i]-'a'].code);

113         }

114         else if(buf[i]>='A' && buf[i]<='Z')

115         {

116             strcat(binary,hufm[buf[i]-'A'].code);

117         }

118         else if(buf[i]>='0' && buf[i]<='9')

119         {

120             strcat(binary,hufm[26+buf[i]-'0'].code);

121         }

122         else if(buf[i]=='_')

123         {

124             strcat(binary,hufm[36].code);

125         }

126         else if(buf[i]=='-')

127         {

128             strcat(binary,hufm[37].code);

129         }

130         else if(buf[i]=='.')

131         {

132             strcat(binary,hufm[38].code);

133         }

134         else if(buf[i]=='@')

135         {

136             strcat(binary,hufm[39].code);

137         }

138         else

139         {

140             strcat(binary,hufm[40].code);

141         }

142     }

143 }
  1 /**

  2     完成trie树的插入,查找。

  3 */

  4 

  5 typedef struct TRIE_NODE

  6 {

  7     char is_str;

  8     unsigned short num_of_bit;

  9     unsigned char* compress_of_bit;

 10     struct TRIE_NODE *point_of_zero,*point_of_one;

 11 }trie_node;

 12 

 13 //long int temp_of_new = 0;

 14 

 15 

 16 void trie_node_init(trie_node **root);

 17 int trie_insert(trie_node **root,char* bit_of_insert);

 18 int trie_search(trie_node *root,char* bit_of_insert);

 19 void trie_delete(trie_node *root);

 20 void compress(trie_node *root,char* bit_of_insert);

 21 int compare_of_bit(trie_node *root,char* bit_of_insert);

 22 void pop_bit(trie_node *root,char* bit_of_pop,int len_of_pop);

 23 

 24 

 25 

 26 

 27 void trie_node_init(trie_node **root)

 28 {

 29     (*root)->is_str = (char)0;

 30     (*root)->num_of_bit = 0;

 31     (*root)->compress_of_bit = NULL;

 32     (*root)->point_of_zero = NULL;

 33     (*root)->point_of_one = NULL;

 34 }

 35 

 36 void compress(trie_node *root,char* bit_of_insert)

 37 {

 38     int i,j,len_of_insert;

 39     len_of_insert = strlen(bit_of_insert);

 40     root->num_of_bit = len_of_insert;

 41     if(root->num_of_bit<=32)

 42     {

 43         int temp;

 44         for(i=len_of_insert-1,j=0;i>=0;i--,j++)

 45         {

 46             if(bit_of_insert[i] == '0')

 47             {

 48                 clearbit(temp,j);

 49             }

 50             else

 51             {

 52                 setbit(temp,j);

 53             }

 54         }

 55         root->compress_of_bit = (unsigned char*)temp;

 56     }

 57     else

 58     {

 59         root->compress_of_bit = (unsigned char*)malloc((len_of_insert%8)?(len_of_insert/8+1):(len_of_insert/8));

 60         for(i=len_of_insert-1,j=0;i>=0;i--,j++)

 61         {

 62             if(bit_of_insert[i] == '0')

 63             {

 64                 clearbit(root->compress_of_bit[j/8],j%8);

 65             }

 66             else

 67             {

 68                 setbit(root->compress_of_bit[j/8],j%8);

 69             }

 70         }

 71     }

 72 }

 73 

 74 

 75 int trie_insert(trie_node **root,char* bit_of_insert)

 76 {

 77     int ret;

 78     char bit_of_pop[10000];

 79     if(root == NULL)

 80     {

 81         ret = 0;

 82     }

 83     else

 84     {

 85         if((*root)->num_of_bit == 0)

 86         {

 87             if(!(*bit_of_insert))

 88             {

 89                 (*root)->is_str = (char)1;

 90                 ret = 1;

 91             }

 92             else

 93             {

 94                 if((*root)->is_str == 0

 95                    && (*root)->point_of_zero == NULL

 96                    && (*root)->point_of_one == NULL)

 97                 {

 98                     compress((*root),bit_of_insert);

 99                     (*root)->is_str = (char)1;

100                     ret = 1;

101                 }

102                 else

103                 {

104                     if(*bit_of_insert == '0')

105                     {

106                         if((*root)->point_of_zero == NULL)

107                         {

108                             (*root)->point_of_zero = (trie_node*)malloc(sizeof(trie_node));

109                             trie_node_init(&(*root)->point_of_zero);

110                             //temp_of_new++;

111                         }

112                         ret = trie_insert(&(*root)->point_of_zero,bit_of_insert+1);

113                     }

114                     else

115                     {

116                         if((*root)->point_of_one == NULL)

117                         {

118                             (*root)->point_of_one = (trie_node*)malloc(sizeof(trie_node));

119                             trie_node_init(&(*root)->point_of_one);

120                             //temp_of_new++;

121                         }

122                         ret = trie_insert(&(*root)->point_of_one,bit_of_insert+1);

123                     }

124                 }

125             }

126         }

127         else

128         {

129             int ans_of_compare = compare_of_bit((*root),bit_of_insert);

130             if(ans_of_compare == 0)

131             {

132                 trie_node *father = (trie_node*)malloc(sizeof(trie_node));

133                 trie_node_init(&father);

134                 //temp_of_new++;

135                 pop_bit((*root),bit_of_pop,1);

136                 if(bit_of_pop[0] == '0')

137                 {

138                     father->point_of_zero = (*root);

139                 }

140                 else

141                 {

142                     father->point_of_one = (*root);

143                 }

144                 if(!(*bit_of_insert))

145                 {

146                     father->is_str = (char)1;

147                     ret = 1;

148                 }

149                 else

150                 {

151                     if(*bit_of_insert == '0')

152                     {

153                         father->point_of_zero = (trie_node*)malloc(sizeof(trie_node));

154                         trie_node_init(&father->point_of_zero);

155                         //temp_of_new++;

156                         ret = trie_insert(&father->point_of_zero,bit_of_insert+1);

157                     }

158                     else

159                     {

160                         father->point_of_one = (trie_node*)malloc(sizeof(trie_node));

161                         trie_node_init(&father->point_of_one);

162                         //temp_of_new++;

163                         ret = trie_insert(&father->point_of_one,bit_of_insert+1);

164                     }

165                 }

166                 (*root) = father;

167             }

168             else

169             {

170                 if(ans_of_compare == (int)(*root)->num_of_bit

171                    && ans_of_compare == strlen(bit_of_insert))

172                 {

173                     (*root)->is_str = (char)1;

174                     ret = 1;

175                 }

176                 else if(ans_of_compare == (int)(*root)->num_of_bit)

177                 {

178                     bit_of_insert += ans_of_compare;

179                     if(*bit_of_insert == '0')

180                     {

181                         if((*root)->point_of_zero == NULL)

182                         {

183                             (*root)->point_of_zero = (trie_node*)malloc(sizeof(trie_node));

184                             trie_node_init(&(*root)->point_of_zero);

185                             //temp_of_new++;

186                         }

187                         ret = trie_insert(&(*root)->point_of_zero,bit_of_insert+1);

188                     }

189                     else

190                     {

191                         if((*root)->point_of_one == NULL)

192                         {

193                             (*root)->point_of_one = (trie_node*)malloc(sizeof(trie_node));

194                             trie_node_init(&(*root)->point_of_one);

195                             //temp_of_new++;

196                         }

197                         ret = trie_insert(&(*root)->point_of_one,bit_of_insert+1);

198                     }

199                 }

200                 else if(ans_of_compare == strlen(bit_of_insert))

201                 {

202                     trie_node *father = (trie_node*)malloc(sizeof(trie_node));

203                     trie_node_init(&father);

204                     //temp_of_new++;

205                     pop_bit((*root),bit_of_pop,ans_of_compare);

206                     compress(father,bit_of_pop);

207                     father->is_str = (char)1;

208                     pop_bit((*root),bit_of_pop,1);

209                     if(bit_of_pop[0] == '0')

210                     {

211                         father->point_of_zero = (*root);

212                     }

213                     else

214                     {

215                         father->point_of_one = (*root);

216                     }

217                     (*root) = father;

218                 }

219                 else

220                 {

221                     trie_node *father = (trie_node*)malloc(sizeof(trie_node));

222                     trie_node_init(&father);

223                     //temp_of_new++;

224                     pop_bit((*root),bit_of_pop,ans_of_compare);

225                     compress(father,bit_of_pop);

226                     pop_bit((*root),bit_of_pop,1);

227                     bit_of_insert += ans_of_compare+1;

228 

229                     if(bit_of_pop[0] == '0')

230                     {

231                         father->point_of_zero = (*root);

232                         father->point_of_one = (trie_node*)malloc(sizeof(trie_node));

233                         trie_node_init(&father->point_of_one);

234                         //temp_of_new++;

235                         ret = trie_insert(&father->point_of_one,bit_of_insert);

236                     }

237                     else

238                     {

239                         father->point_of_one = (*root);

240                         father->point_of_zero = (trie_node*)malloc(sizeof(trie_node));

241                         trie_node_init(&father->point_of_zero);

242                         //temp_of_new++;

243                         ret = trie_insert(&father->point_of_zero,bit_of_insert);

244                     }

245                     (*root) = father;

246                 }

247             }

248         }

249     }

250     return ret;

251 }

252 

253 

254 int trie_search(trie_node *root,char *bit_of_search)

255 {

256     trie_node *p = root;

257     while(p!=NULL && *bit_of_search)

258     {

259         if(p->num_of_bit!=0)

260         {

261             if((int)p->num_of_bit == compare_of_bit(p,bit_of_search))

262             {

263                 bit_of_search += (int)p->num_of_bit;

264             }

265             else

266             {

267                 p=NULL;

268                 break;

269             }

270         }

271         if(!(*bit_of_search))

272         {

273             break;

274         }

275         if(bit_of_search[0]=='0')

276         {

277             p = p->point_of_zero;

278             bit_of_search++;

279         }

280         else if(bit_of_search[0]=='1')

281         {

282             p = p->point_of_one;

283             bit_of_search++;

284         }

285         if(!(*bit_of_search) && p!=NULL && p->num_of_bit!=0)

286         {

287             p=NULL;

288             break;

289         }

290     }

291     if(p!=NULL)

292     {

293         return p->is_str;

294     }

295     else

296     {

297         return 0;

298     }

299 }

300 

301 

302 void trie_delete(trie_node *root)

303 {

304     if(root == NULL)

305         return;

306     trie_delete(root->point_of_zero);

307     trie_delete(root->point_of_one);

308     free(root);

309 }

310 

311 

312 int compare_of_bit(trie_node *root,char* bit_of_insert)

313 {

314     int len_of_insert = strlen(bit_of_insert);

315     int i,j,tempbit;

316     if(root->num_of_bit<=32)

317     {

318         for(i=0,j=root->num_of_bit-1;i<len_of_insert && i<root->num_of_bit;i++,j--)

319         {

320             tempbit = getbit((int)root->compress_of_bit,j);

321             if(bit_of_insert[i]-'0' != tempbit)

322             {

323                 break;

324             }

325         }

326     }

327     else

328     {

329         for(i=0,j=root->num_of_bit-1;i<len_of_insert && i<root->num_of_bit;i++,j--)

330         {

331             tempbit = getbit(root->compress_of_bit[j/8],j%8);

332             if(bit_of_insert[i]-'0' != tempbit)

333             {

334                 break;

335             }

336         }

337     }

338     return i;

339 }

340 

341 void pop_bit(trie_node *root,char* bit_of_pop,int len_of_pop)

342 {

343     int i,j;

344     short num_of_bit = root->num_of_bit - (short)len_of_pop;

345     if(root->num_of_bit<=32)

346     {

347         for(i=0,j=root->num_of_bit-1;i<len_of_pop;i++,j--)

348         {

349             bit_of_pop[i] = getbit((int)root->compress_of_bit,j) +'0';

350         }

351         bit_of_pop[i] = '\0';

352     }

353     else

354     {

355         for(i=0,j=root->num_of_bit-1;i<len_of_pop;i++,j--)

356         {

357             bit_of_pop[i] = getbit(root->compress_of_bit[j/8],j%8) +'0';

358         }

359         bit_of_pop[i] = '\0';

360 

361         if(num_of_bit == 0)

362         {

363             free(root->compress_of_bit);

364         }

365         else if(num_of_bit<=32)

366         {

367             int temp;

368             for(j=num_of_bit-1;j>=0;j--)

369             {

370                 if(getbit(root->compress_of_bit[j/8],j%8) == 0)

371                 {

372                     clearbit(temp,j);

373                 }

374                 else

375                 {

376                     setbit(temp,j);

377                 }

378             }

379             free(root->compress_of_bit);

380             root->compress_of_bit = (unsigned char*)temp;

381         }

382         else

383         {

384             unsigned char *p;

385             short num_of_byte = (num_of_bit%8)?(num_of_bit/8+1):(num_of_bit/8);

386             if(((root->num_of_bit%8)?(root->num_of_bit/8+1):(root->num_of_bit/8)) != num_of_byte)

387             {

388                 p = (unsigned char*)malloc(num_of_byte);

389                 short i;

390                 for(i=0;i<num_of_byte;i++)

391                 {

392                     p[i] = root->compress_of_bit[i];

393                 }

394                 free(root->compress_of_bit);

395                 root->compress_of_bit = p;

396             }

397         }

398     }

399     root->num_of_bit = num_of_bit;

400 }

 

你可能感兴趣的:(trie)