对从网络上抓取到的网页进行处理:建立网络库,分词,去重,if-tdf计算权重,归一化,然后根据查询词将文本相似度从高到低的依次返回给客户
第一阶段:python网络爬虫抓取网页,并存盘
第二阶段:对磁盘上的网页文件建立网页库,将全部网页写入网页库,并建立相应网页的偏移量索引文件(1 23 100)-->(dofid, offset, size),以便读取网页内容
注意的地方:1.写成格式:<doc>
<docid>1</docid> //对网页进行编号
<url>http://....</url>
<title>...</title>
<content>....
</content>
</doc>
2.处理每行的'\r\n',调用tellp() 也可以用FILE*的ftell()函数,打印当前文件偏移量,
此部分代码:
pagelib.h
1 #ifndef _PAGELIB_H_ 2 #define _PAGELIB_H_ 3 #include <iostream> 4 #include <string> 5 #include <vector> 6 #include <fstream> 7 #include <unistd.h> 8 #include <dirent.h> 9 #include <sys/types.h> 10 #include <sys/stat.h> 11 #include <stdio.h> 12 #include <stdlib.h> 13 #include <string.h> 14 #include <time.h> 15 #include <pwd.h> 16 #include <grp.h> 17 18 class pagelib 19 { 20 public: 21 pagelib(const std::string &dir, 22 const std::string &libname) 23 :dir_(dir), 24 libname_(libname) 25 {} 26 void readdirname() 27 { 28 readfile(dir_); 29 } 30 void store_to_lib(const std::string&); 31 private: 32 void readfile(const std::string &); 33 void store_to_vector(const std::string&); 34 std::string dir_; 35 std::string libname_; 36 static int i; 37 std::vector<std::string> vec; 38 }; 39 40 #endif /*PAGELIB_H*/
pagelib.cpp
1 #include "pagelib.h" 2 using namespace std; 3 int pagelib::i = 1; 4 5 void pagelib::readfile(const string &s) 6 { 7 DIR *dir; 8 struct dirent * mydir; 9 struct stat mystat; 10 char str[256]; 11 dir=opendir(s.c_str()); 12 if(!dir) 13 { 14 cout << s << endl; 15 exit(-1); 16 } 17 18 while((mydir=readdir(dir))!=NULL) 19 { 20 sprintf(str, "%s/%s", s.c_str(), mydir->d_name); 21 stat(str, &mystat); 22 if(!strcmp(mydir->d_name, ".") || !strcmp(mydir->d_name, "..")) 23 continue; 24 if( mystat.st_mode & 0040000 ) 25 { 26 string s1 = str; 27 readfile(s1); 28 } 29 else 30 { 31 cout << str << endl; 32 store_to_vector(str); 33 } 34 } 35 } 36 37 void pagelib::store_to_vector(const string &str) 38 { 39 string content, line; 40 ifstream is(str.c_str()); 41 42 string title; 43 getline(is, title); 44 { 45 for (size_t i = 0; i < title.size(); i++) { 46 if(title[i] == '\r') 47 title[i] = ' '; 48 } 49 } 50 51 while(getline(is, line) > 0) 52 { 53 for (size_t i = 0; i < line.size(); i++) { 54 if(line[i] == '\r') 55 line[i] = '\n'; 56 } 57 content += line; 58 } 59 char s1[5]; 60 sprintf(s1, "%d", i++); 61 string doc = string("<doc>\n <docid>") + s1 + "</docid>\n <url>" + string(str) 62 + "</url>\n <title>" + title + "</title>\n <content>\n" 63 + content + " </content>\n</doc>\n\n"; 64 vec.push_back(doc); 65 is.close(); 66 } 67 68 void pagelib::store_to_lib(const string& index) 69 { 70 ofstream os_lib(libname_.c_str()); 71 ofstream os_index(index.c_str()); 72 vector<string>::iterator it = vec.begin(); 73 int j = 1; 74 os_lib << "<pagelib>\n" << endl; 75 for(; it != vec.end(); it++) 76 { 77 os_index << j++ << " " << os_lib.tellp() ; 78 os_lib << *it; 79 os_index << " " << (*it).size() << endl; 80 } 81 os_lib << "</pagelib>\n" << endl; 82 os_index.close(); 83 os_lib.close(); 84 }
第3阶段:对网页库中的网页进行分词,去重,关键词计算权重,归一化等操作
注意的问题:切词使用的是结巴切词:[email protected]:aa838260772/jieba.git
tf-idf计算权重,一个单词的权重跟在本文档中出现的次数成正比,跟出现此单词的文档数成反比
关键词的选取:权重,去停用词,使用priority_queue输出权重最大10个单词
文档关键词的归一化:余弦相似度,方便后面计算文本相似度。
建立倒排索引:
重新构建新的网络库和偏移索引:注意docid的变化,使用string.replace()函数
split.h
1 #ifndef _SPLIT_H_ 2 #define _SPLIT_H_ 3 #include <utility> 4 #include "MixSegment.hpp" 5 #include <map> 6 #include <string> 7 #include <fstream> 8 #include <queue> 9 #include <sstream> 10 #include <set> 11 #include <algorithm> 12 13 typedef struct qelem 14 { 15 std::string word; 16 int frequence; 17 bool operator<(const qelem &other) const 18 { 19 return frequence < other.frequence; 20 } 21 }qelem; 22 23 24 class splitpagelib 25 { 26 public: 27 splitpagelib(const std::string &libname, 28 const std::string &indexname, 29 const std::string &stopfilename, 30 const std::string &splitfilename 31 ) 32 :libname_(libname), 33 indexname_(indexname), 34 stopfilename_(stopfilename), 35 splitfilename_(splitfilename) 36 {} 37 void startsplit(const std::string& dic_path, 38 const std::string& model_path); 39 void deletesame(); 40 void newindexlib(const std::string&, 41 const std::string&); 42 void make_reverse_index(const std::string& , 43 const std::string&); 44 private: 45 void compute_power(); 46 bool ifsame(const std::map<std::string, int> &, 47 const std::map<std::string, int> &); 48 void topten(int, 49 std::ofstream &, 50 const std::map<std::string, int> &); 51 void weightogether(); 52 53 std::string libname_;//从lib文件读内容 54 std::string indexname_;//从index读每篇文档相应的offset,读出相应文章 55 std::string stopfilename_;//停用词文件 56 std::string splitfilename_;//分词后的文件 57 58 std::set<std::string> set_stop;//读入停用词 59 std::map<int, std::string> map_doc;//根据索引把每篇doc读入map 60 std::map<int, std::map<std::string, int> > map_lib; 61 std::map<std::string, int> words;//将去去重后的所有单词及频数写入map 62 std::map<int, std::map<std::string, double> >doc_word_wei; 63 std::map<std::string, std::map<int, double> >rev_index;//倒排索引 64 std::map<std::string, std::map<int, int> > map_word_fre; 65 }; 66 67 #endif /*SPLIT_H*/
split.cpp
1 #include "split.h" 2 #include <stdio.h> 3 using namespace std; 4 using namespace CppJieba; 5 6 void splitpagelib::startsplit(const string &dic_path, 7 const string &model_path) 8 { 9 MixSegment segment_(dic_path, model_path); 10 //读入停用词到set_stop中去 11 ifstream is_stop(stopfilename_.c_str()); 12 string stopword; 13 while(getline(is_stop, stopword) > 0) 14 { 15 set_stop.insert(stopword); 16 } 17 //通过index来将lib中文档读出并分词,统计出现最高的10个 放入map中 18 vector<string> words;//用来存放切割后的单词 19 ifstream is_lib(libname_.c_str()); 20 ifstream is_index(indexname_.c_str()); 21 ofstream os_split(splitfilename_.c_str()); 22 string lib_line, index_line; 23 cout << "begin to cut word :" << indexname_ 24 <<" " << libname_ << endl; 25 while(getline(is_index, index_line) > 0) 26 { 27 map<string, int> map_; 28 int docid, offset, size; 29 istringstream ss(index_line); 30 ss >> docid >> offset >> size; 31 is_lib.seekg(offset); 32 string every_doc = ""; 33 while(is_lib.tellg() < (offset + size)) 34 { 35 getline(is_lib, lib_line); 36 every_doc += lib_line + "\n"; 37 for(size_t i = 0; i < lib_line.size(); i++) 38 { 39 if(!(lib_line[i] & (1 << 7))) 40 lib_line[i] = ' '; 41 } 42 43 segment_.cut(lib_line.c_str(), words); 44 for(size_t i = 0; i < words.size(); i++) 45 { 46 set<string>::iterator it = set_stop.find(words[i]); 47 if(it == set_stop.end() && words[i] != " ")//不去掉的话把空格当单词 48 ++map_[words[i]]; 49 } 50 words.clear(); 51 } 52 os_split << docid << endl; 53 topten(docid, os_split, map_); 54 os_split << endl; 55 map_doc[docid] = every_doc;//将每篇文章读入map_doc 56 } 57 os_split.close(); 58 is_lib.close(); 59 is_index.close(); 60 cout << "cut word over :write to" << splitfilename_ << endl; 61 } 62 63 void splitpagelib::topten(int docid, 64 ofstream &os, 65 const map<string, int> &map_) 66 { 67 priority_queue<qelem> prique_; 68 qelem ql; 69 for(map<string, int>::const_iterator it = map_.begin(); 70 it != map_.end(); 71 ++it) 72 { 73 ql.word = it->first; 74 ql.frequence = it->second; 75 prique_.push(ql); 76 } 77 78 int i = 0; 79 while(i < 10 && !prique_.empty()) 80 { 81 ql = prique_.top() ; 82 map_lib[docid][ql.word] = ql.frequence; 83 prique_.pop(); 84 i++; 85 os << ql.word << " " << ql.frequence << " "; 86 } 87 } 88 89 void splitpagelib::deletesame() 90 { //去重 91 cout << "begin to delete the same" << endl; 92 int *arr = new int [map_lib.size() + 1]; 93 for (size_t i = 1; i < map_lib.size() + 1; i++) 94 { 95 arr[i] = 1; 96 } 97 size_t ix1, ix2; 98 for(ix1 = 1; ix1 < map_lib.size() + 1; ++ix1) 99 { 100 if(arr[ix1] == 0) 101 continue; 102 for (ix2 = ix1 +1; ix2 < map_lib.size() + 1; ix2++) 103 { 104 if(arr[ix2] == 0) 105 continue; 106 if(ifsame(map_lib[ix1], map_lib[ix2])) 107 { 108 arr[ix2] = 0; 109 map_lib.erase(ix2); 110 } 111 } 112 } 113 delete [] arr; 114 cout << "delsanme over" << endl; 115 } 116 117 bool splitpagelib::ifsame(const map<string, int>& map1, 118 const map<string, int>& map2) 119 { 120 int i = 0; 121 for(map<string, int>::const_iterator it = map2.begin(); it != map2.end(); ++it) 122 { 123 if(map1.count(it->first) > 0) 124 i++; 125 } 126 if(i > 6) 127 return true; 128 else 129 return false; 130 } 131 132 void splitpagelib::newindexlib(const string& index, 133 const string& lib) 134 { 135 ofstream os_index(index.c_str()); 136 ofstream os_lib(lib.c_str()); 137 138 map<int, map<string, double> >::iterator it; 139 size_t i = 1; 140 cout << "begin write to index lib" << index 141 << " " << lib << endl; 142 for(it = doc_word_wei.begin(); it != doc_word_wei.end(); it++) 143 { 144 os_index << i << " " << os_lib.tellp() << " " ; 145 146 int begin = map_doc[it->first].find("<docid>"); 147 int end = map_doc[it->first].find("</docid>"); 148 char s[10] = {0}; 149 sprintf(s, "%d", i); 150 map_doc[it->first].replace(begin + 7, end - begin - 7, s); 151 os_lib << map_doc[it->first] << endl; 152 153 os_index << map_doc[it->first].size() << endl;//写index 154 i++; 155 } 156 cout << "write over" << endl; 157 os_index.close(); 158 os_lib.close(); 159 } 160 161 void splitpagelib::compute_power() 162 { 163 map<int, map<string, int> >::iterator it1; 164 map<string, int>::iterator it2 ; 165 for(it1 = map_lib.begin(); it1 != map_lib.end(); it1++) 166 { 167 for(it2 = it1->second.begin(); it2 != it1->second.end(); ++it2) 168 { 169 map_word_fre[it2->first][it1->first] = it2->second; 170 } 171 } 172 173 for(it1 = map_lib.begin(); it1 != map_lib.end(); it1++) 174 { 175 for(it2 = it1->second.begin(); it2 != it1->second.end(); ++it2) 176 { 177 double d = (it2->second) * log( (double)(map_word_fre.size()) / (double)(map_word_fre[it2->first].size() +1) ); 178 doc_word_wei[it1->first][it2->first] = d; 179 } 180 } 181 } 182 183 void splitpagelib::make_reverse_index(const string &index, 184 const string &power) 185 { 186 //计算每篇中的权重 187 compute_power(); 188 //对每篇文档的词作归一化 189 weightogether(); 190 //建立倒排索引 191 int j = 1; 192 for(map<int, map<string, double> >::iterator it = doc_word_wei.begin(); 193 it != doc_word_wei.end(); 194 it++) 195 { 196 for(map<string, double>::iterator it1 = it->second.begin(); 197 it1 != it->second.end(); 198 ++it1) 199 { 200 rev_index[it1->first][j] = it1->second; 201 } 202 j++; 203 } 204 cout << "write reverse index" << index <<endl; 205 ofstream os_index(index.c_str()); 206 for(map<string, map<int, double> >::iterator it = rev_index.begin(); 207 it != rev_index.end(); 208 ++it) 209 { 210 os_index << it->first << endl; 211 for(map<int, double> ::iterator it1 = it->second.begin(); 212 it1 != it->second.end(); 213 ++it1) 214 { 215 os_index << it1->first << " " << it1->second << " "; 216 } 217 os_index << endl; 218 } 219 os_index.close(); 220 cout << "write reverse index over" << endl; 221 222 cout << "write doc_word_wei begin" << power << endl; 223 ofstream os_power(power.c_str()); 224 map<int, map<string, double> >::iterator it = doc_word_wei.begin(); 225 map<string, double>::iterator it1; 226 int i = 1; 227 for(; it != doc_word_wei.end(); ++it) 228 { 229 os_power << i << endl; 230 for(it1 = it->second.begin(); it1 != it->second.end(); ++it1) 231 { 232 os_power << it1->first << " " << it1->second << " " ; 233 } 234 os_power << endl; 235 i++; 236 } 237 os_power.close(); 238 cout << "write power end" << endl; 239 } 240 241 void splitpagelib::weightogether()//归一化 242 { 243 map<int, map<string, double> >::iterator it; 244 map<string, double>::iterator it1; 245 double s; 246 for(it = doc_word_wei.begin(); it != doc_word_wei.end(); it++) 247 { 248 s = 0; 249 for(it1 = it->second.begin(); it1 != it->second.end(); it1++) 250 { 251 s += it1->second * it1->second; 252 } 253 254 for(it1 = it->second.begin(); it1 != it->second.end(); it1++) 255 { 256 it1->second /= sqrt(s); 257 } 258 } 259 }
第4阶段:对查询语句进行分词,根据倒排索引找出文档的交集,将查询句的单词进行归一化,与查询结果计算文档相似度,从大到小返回个客户
注意的地方:根据索引按行读文档时在读出的结果后面加上'\n',
query.h
1 #ifndef _QUERY_H_ 2 #define _QUERY_H_ 3 4 #include <map> 5 #include "MixSegment.hpp" 6 #include <string> 7 #include <fstream> 8 #include <sstream> 9 #include <queue> 10 #include <algorithm> 11 12 typedef struct simi 13 { 14 int docid; 15 double simidegre; 16 bool operator<( const simi &right)const 17 { 18 return simidegre < right.simidegre; 19 } 20 }simi; 21 22 class query 23 { 24 public: 25 query(){} 26 void read_to_map(const std::string&, 27 const std::string&, 28 const std::string&, 29 const std::string&); 30 std::string search(const std::string&); 31 private: 32 static void similar(std::map<std::string, double>&, 33 std::map<std::string, double>&, 34 int, 35 std::priority_queue<simi>&); 36 37 std::string offsetfile_; 38 std::string reverseindexfile_; 39 std::string docfile_; 40 std::string doc_weifile_; 41 42 std::map<std::string, std::map<int, double> > map_reindex;//存储倒排索引 43 std::map<int, std::map<std::string, double> > map_wei; 44 std::map<int, std::string> map_doc; 45 }; 46 47 #endif /*QUERY_H*/
query.cpp
1 #include "query.h" 2 3 using namespace std; 4 using namespace CppJieba; 5 6 void query::read_to_map( 7 const string &offsetfile, 8 const string &reverseindexfile, 9 const string &docfile, 10 const string &doc_weifile) 11 { 12 offsetfile_ = offsetfile; 13 reverseindexfile_ = reverseindexfile; 14 doc_weifile_ = doc_weifile; 15 docfile_ = docfile; 16 17 //读倒排索引文件 18 ifstream is_revindex(reverseindexfile_.c_str()); 19 string line; 20 while(getline(is_revindex, line) > 0) 21 { 22 string word; 23 word = line; 24 getline(is_revindex, line) ; 25 istringstream sstream(line); 26 int docid; 27 double weight; 28 while(sstream >> docid >> weight) 29 { 30 map_reindex[word][docid] = weight; 31 } 32 } 33 is_revindex.close(); 34 35 //读偏移索引文件和doc文件,将每篇文档读入map 36 ifstream is_offset(offsetfile_.c_str()); 37 ifstream is_doc(docfile_.c_str()); 38 string line_off; 39 while(getline(is_offset, line_off) > 0) 40 { 41 istringstream sstream(line_off); 42 int docid; 43 int offset; 44 int size; 45 sstream >> docid >> offset >> size; 46 is_doc.seekg(offset); 47 string every_doc = ""; 48 while(is_doc.tellg() < (offset + size)) 49 { 50 string line; 51 getline(is_doc, line); 52 every_doc += line + "\n"; 53 } 54 map_doc[docid] = every_doc; 55 } 56 is_offset.close(); 57 is_doc.close(); 58 //读每篇doc归一化后的top10词文件 59 ifstream is_map_wei(doc_weifile_.c_str()); 60 string line_wei; 61 while(getline(is_map_wei, line_wei) > 0) 62 { 63 int docid; 64 istringstream ss(line_wei); 65 ss >> docid; 66 getline(is_map_wei, line_wei); 67 istringstream sstream(line_wei); 68 string word; 69 double weight; 70 while(sstream >> word >> weight) 71 { 72 map_wei[docid][word] = weight; 73 } 74 } 75 is_map_wei.close(); 76 } 77 78 string query::search(const std::string &word) 79 { 80 priority_queue<simi> result_doc; 81 //切词,算出单词权重存储到一个map,相当于一片doc 82 MixSegment segment("../dict/jieba.dict.utf8", 83 "../dict/hmm_model.utf8"); 84 vector<string> words; 85 map<string, double> map_que; 86 segment.cut(word, words); 87 double d = 0; 88 for (size_t i = 0; i < words.size(); i++) 89 { 90 double s = 1 * log( (double)map_reindex.size() / (double)(map_reindex[words[i]].size()+1) ); 91 cout << words[i] << s << endl; 92 map_que[ words[i] ] = s; 93 d += s*s; 94 } 95 for(size_t i = 0; i < words.size(); i++) 96 { 97 map_que[ words[i] ] /= sqrt(d); 98 } 99 //找出单词交集的docid 100 map<int, double> com(map_reindex[ words[0] ]); 101 102 for (size_t i = 1; i < words.size(); i++) 103 { 104 if(map_reindex[ words[i] ].size() == 0) 105 continue; 106 for(map<int, double>::iterator it = com.begin(); it != com.end(); ++it) 107 { 108 if(map_reindex[ words[i] ].count(it->first) == 0) 109 com.erase(it->first); 110 } 111 } 112 cout << com.size() << endl; 113 for(map<int, double>::iterator it = com.begin(); it != com.end(); ++it) 114 { 115 similar(map_que, map_wei[it->first], it->first, result_doc); 116 } 117 118 simi si; 119 string result = ""; 120 while(!result_doc.empty()) 121 { 122 si = result_doc.top(); 123 char s[20] = {0}; 124 sprintf(s, "%d %f\n", si.docid, si.simidegre); 125 result += s + map_doc[si.docid]; 126 result_doc.pop(); 127 } 128 return result; 129 } 130 131 void query::similar(map<string, double> &map_doc1, 132 map<string, double> &map_doc2, 133 int docid, 134 priority_queue<simi> &result_doc) 135 { 136 double s = 0; 137 for(map<string, double>::iterator it = map_doc1.begin(); 138 it != map_doc1.end(); 139 ++it) 140 { 141 if(map_doc2.count(it->first) > 0) 142 { 143 s += it->second * map_doc2[it->first]; 144 } 145 } 146 simi si; 147 si.docid = docid; 148 si.simidegre = s; 149 result_doc.push(si); 150 }
第5阶段:可将返回结果做成xml格式返回给客户,直接以网页格式显示,
一点点想法:可以结合文本纠错,使得查询更加智能一点