在前面的内容中,经过对词条的分析、排序、压缩等处理后,词条ENTRY目前按照每个metaID对应一个LOCATION的结构进行存放。从本节开始,逐步阐述索引文件的写入过程。
2.6 索引文件写入过程
索引文件的写入主要分为两个过程:写入头部(header)的过程和写入词条信息的过程。
2.6.1 write_header头部写入过程
先通过DB_InitWriteHeader_Native确定header内容开始的文件偏移地址:新建索引时,将当前文件偏移地址存放到DB->offsets[HEADERPOS]中;若DB->offsets[HEADERPOS]已不为0(更新模式),则把文件指针定位到该偏移地址处,以便覆盖原有的header。
注:
在查询的时候,先读取header、word信息等内容的文件偏移地址,然后再根据这些偏移地址读取相应的内容。
/*
 * Position the native index file for writing the header section.
 *
 * db is the opaque handle, treated as a struct Handle_DBNative.
 *
 * When offsets[HEADERPOS] is still 0 the index is being created, so the
 * current file position is recorded as the start of the header (the
 * original author notes that this coincides with the end of the file).
 * Otherwise the index is being updated, and the file pointer is moved
 * back to the recorded header start so the header gets overwritten.
 *
 * Always returns 0; the results of sw_fseek/sw_ftell are not checked.
 */
int DB_InitWriteHeader_Native(void *db)
{
    struct Handle_DBNative *handle = db;

    if (handle->offsets[HEADERPOS] == 0) {
        /* Creating the index: the header starts at the current position. */
        handle->offsets[HEADERPOS] = sw_ftell(handle->fp);
        return 0;
    }

    /* Update mode: rewind to the existing header so it is overwritten. */
    sw_fseek(handle->fp, handle->offsets[HEADERPOS], SEEK_SET);
    return 0;
}
然后依次写入INDEXHEADER_ID、INDEXVERSION_ID、NAMEHEADER_ID等各项信息,每一项写入的基本方式为:
ID、该ID对应字符串的长度、字符串本身。全部写完后,最后写入一个0作为header的结束标记。
2.6.2 write_index过程
基本框架为:
/*
 * Write the word section of the index file.
 *
 * sw     - indexing state; supplies the sorted entry array, the entry
 *          hash table, the verbose flag and the memory zones freed here.
 * indexf - index file being written; supplies the DB handle and the
 *          stopword hash table from its header.
 *
 * Three passes are made between DB_InitWriteWords and DB_EndWriteWords:
 *   1. every entry in ep->elist that is not a stopword is written with
 *      write_word(); stopwords get wordID = -1 as a marker;
 *   2. every hashed entry with wordID > 0 is passed to DB_WriteWordHash();
 *   3. every hashed entry with wordID > 0 has its word data built and
 *      written (build_worddata / write_worddata).
 * Finally the entry zone, the location zone and the elist array are freed.
 *
 * Returns immediately, writing nothing, when there is no entry array.
 */
void write_index(SWISH * sw, IndexFILE * indexf)
{
    int         i;
    ENTRYARRAY *ep;
    ENTRY      *epi;
    int         totalwords;

/* Not referenced in this function; kept in case later code in the file uses it. */
#define DELTA 10

    if ( !(ep = sw->Index->entryArray ))
        return;                 /* nothing to do */

    totalwords = ep->numWords;

    /* Per the original note, this records the file offset where the word
     * section starts (DB->offsets[WORDPOS] = sw_ftell(DB->fp)). */
    DB_InitWriteWords(sw, indexf->DB);

    /* Pass 1: write the words themselves. */
    for (i = 0; i < totalwords; i++)
    {
        /* Fetch one entry */
        epi = ep->elist[i];

        /* why check for stopwords here?  removestopwords could have remove them */
        if ( !is_word_in_hash_table( indexf->header.hashstoplist, epi->word ) )
        {
            /* Not a stopword: write the word (length, then string) to the
             * index file.  The wordID > 0 tests below presumably rely on
             * write_word assigning epi->u1.wordID -- confirm in write_word. */
            write_word(sw, epi, indexf);
        }
        else
        {
            epi->u1.wordID = (sw_off_t)-1;   /* flag as a stop word */
        }
    }

    /* Pass 2: walk every hash bucket and write each word's hash entry. */
    for (i = 0; i < VERYBIGHASHSIZE; i++)
    {
        for (epi = sw->Index->hashentries[i]; epi; epi = epi->next)
        {
            /* If it is not a stopword write it */
            if (epi->u1.wordID > (sw_off_t)0)
                DB_WriteWordHash(sw, epi->word, epi->u1.wordID, indexf->DB);
        }
    }

    if (sw->verbose)
    {
        /* "\r" returns to the start of the line so the status line is
         * overwritten; the source had the escapes mangled to "/r" and "/n". */
        printf("\r Writing word hash: Complete\n" );
        printf(" Writing word data: ...");
        fflush(stdout);
    }

    /* Pass 3: write the per-word data. */
    for (i = 0; i < VERYBIGHASHSIZE; i++)
    {
        for (epi = sw->Index->hashentries[i]; epi; epi = epi->next)
        {
            /* If we are in economic mode -e we must sort locations by metaID, filenum */
            if (sw->Index->swap_locdata)
                sortSwapLocData(epi);

            if (epi->u1.wordID > (sw_off_t)0)   /* Not a stopword */
            {
                /* Write the word's data, i.e. term frequency, position
                 * information, etc. */
                build_worddata(sw, epi);
                write_worddata(sw, epi, indexf);
            }
        }
    }

    if (sw->verbose)
        printf("\r Writing word data: Complete\n" );

    DB_EndWriteWords(sw, indexf->DB);

    /* free all ENTRY structs at once */
    Mem_ZoneFree(&sw->Index->entryZone);

    /* free all location compressed data */
    Mem_ZoneFree(&sw->Index->totalLocZone);

    efree(ep->elist);
}