搜索引擎源码及流程



对从网络上抓取到的网页进行处理:建立网页库,分词,去重,tf-idf计算权重,归一化,然后根据查询词将网页按文本相似度从高到低依次返回给客户

第一阶段:python网络爬虫抓取网页,并存盘

第二阶段:对磁盘上的网页文件建立网页库,将全部网页写入网页库,并建立相应网页的偏移量索引文件(1   23   100)-->(docid,   offset,   size),以便读取网页内容

             注意的地方:1.写成格式:<doc>

                                                    <docid>1</docid> //对网页进行编号

                                                          <url>http://....</url> 

                                                          <title>...</title>

                                                  <content>....

                                                 </content>                     

                                           </doc>

                              2.处理每行的'\r\n',调用tellp()  也可以用FILE*的ftell()函数,打印当前文件偏移量,

 此部分代码:

pagelib.h

复制代码
 1 #ifndef _PAGELIB_H_
 2 #define _PAGELIB_H_
 3 #include <iostream>
 4 #include <string>
 5 #include <vector>
 6 #include <fstream>
 7 #include <unistd.h>
 8 #include <dirent.h>
 9 #include <sys/types.h>
10 #include <sys/stat.h>
11 #include <stdio.h>
12 #include <stdlib.h>
13 #include <string.h>
14 #include <time.h>
15 #include <pwd.h>
16 #include <grp.h>
17 
/**
 * Collects page files found under a directory and writes them out as a
 * page library plus an offset index.
 */
class pagelib
{
public:
    /**
     * @param dir      directory handed to readfile() by readdirname()
     * @param libname  path of the library file written by store_to_lib()
     */
    pagelib(const std::string &dir, const std::string &libname)
        : dir_(dir), libname_(libname)
    {
    }

    /** Starts the scan by passing dir_ to readfile(). */
    void readdirname() { readfile(dir_); }

    /** Writes the collected records; the argument is the index file path. */
    void store_to_lib(const std::string &);

private:
    void readfile(const std::string &);
    void store_to_vector(const std::string &);

    std::string dir_;              // directory to scan
    std::string libname_;          // output library file
    static int i;                  // running counter shared by all instances
    std::vector<std::string> vec;  // collected records
};
39 
40 #endif /*PAGELIB_H*/ 
复制代码
View Code

pagelib.cpp

复制代码
 1 #include "pagelib.h"
 2 using namespace std;
 3 int pagelib::i = 1;
 4 
 5 void pagelib::readfile(const string &s)
 6 {
 7     DIR *dir;
 8     struct dirent * mydir;
 9     struct stat mystat;
10     char str[256];
11     dir=opendir(s.c_str());
12     if(!dir)
13     {
14         cout << s << endl;
15         exit(-1);
16     }
17 
18     while((mydir=readdir(dir))!=NULL)
19     {
20         sprintf(str, "%s/%s", s.c_str(), mydir->d_name);
21         stat(str, &mystat);
22         if(!strcmp(mydir->d_name, ".") || !strcmp(mydir->d_name, ".."))
23             continue;
24         if( mystat.st_mode & 0040000 ) 
25         {
26             string s1 = str;
27             readfile(s1);
28         }
29         else
30         {
31             cout << str << endl;
32             store_to_vector(str);
33         }
34     }
35 }
36 
37 void pagelib::store_to_vector(const string &str)
38 {
39     string content, line;
40     ifstream is(str.c_str());
41 
42     string title;
43     getline(is, title);
44     {
45         for (size_t i = 0; i < title.size(); i++) {
46             if(title[i] == '\r')
47                 title[i] = ' ';
48         }
49     }
50 
51     while(getline(is, line) > 0)
52     {
53         for (size_t i = 0; i < line.size(); i++) {
54             if(line[i] == '\r')
55                 line[i] = '\n';
56         }
57         content += line;
58     }
59     char s1[5];
60     sprintf(s1, "%d", i++);
61     string doc = string("<doc>\n  <docid>") + s1 + "</docid>\n   <url>" + string(str) 
62         + "</url>\n   <title>" + title + "</title>\n     <content>\n" 
63         + content + "     </content>\n</doc>\n\n";
64     vec.push_back(doc);
65     is.close();
66 }
67 
68 void pagelib::store_to_lib(const string& index)
69 {
70     ofstream os_lib(libname_.c_str());
71     ofstream os_index(index.c_str());
72     vector<string>::iterator it = vec.begin();
73     int j = 1;
74     os_lib << "<pagelib>\n" << endl;
75     for(; it != vec.end(); it++)
76     {
77         os_index << j++ << " " << os_lib.tellp() ;
78         os_lib << *it;
79         os_index << " " << (*it).size() << endl;
80     }
81     os_lib << "</pagelib>\n" << endl;
82     os_index.close();
83     os_lib.close();
84 }
复制代码
View Code


 

第3阶段:对网页库中的网页进行分词,去重,关键词计算权重,归一化等操作

         注意的问题:切词使用的是结巴切词:[email protected]:aa838260772/jieba.git 

           tf-idf计算权重,一个单词的权重跟在本文档中出现的次数成正比,跟出现此单词的文档数成反比

           关键词的选取:权重,去停用词,使用priority_queue输出权重最大10个单词

          文档关键词的归一化:余弦相似度,方便后面计算文本相似度。

          建立倒排索引:

          重新构建新的网页库和偏移索引:注意docid的变化,使用string.replace()函数

split.h

复制代码
 1 #ifndef _SPLIT_H_
 2 #define _SPLIT_H_
 3 #include <utility>
 4 #include "MixSegment.hpp"
 5 #include <map>
 6 #include <string>
 7 #include <fstream>
 8 #include <queue>
 9 #include <sstream>
10 #include <set>
11 #include <algorithm>
12 
/**
 * A word together with its occurrence count. Entries compare by count
 * only, so a std::priority_queue<qelem> hands out the highest count first.
 */
typedef struct qelem
{
    std::string word;
    int frequence;

    /** True when this entry's count is lower than `other`'s. */
    bool operator<(const qelem &other) const
    {
        return other.frequence > frequence;
    }
} qelem;
22 
23 
/**
 * Post-processes a page library: word segmentation, near-duplicate
 * removal, term weighting, and writing of the inverted index and the
 * renumbered library/index files.
 */
class splitpagelib
{
public:
    /**
     * @param libname        page library file to read documents from
     * @param indexname      offset index file ("docid offset size" lines)
     * @param stopfilename   stop-word file, one word per line
     * @param splitfilename  output file for the per-document top words
     */
    splitpagelib(const std::string &libname,
                 const std::string &indexname,
                 const std::string &stopfilename,
                 const std::string &splitfilename)
        : libname_(libname),
          indexname_(indexname),
          stopfilename_(stopfilename),
          splitfilename_(splitfilename)
    {
    }

    /** Segments every document; arguments are the dictionary and model paths. */
    void startsplit(const std::string &dic_path,
                    const std::string &model_path);

    /** Drops documents whose top words overlap too much with an earlier one. */
    void deletesame();

    /** Writes the renumbered index (first argument) and library (second). */
    void newindexlib(const std::string &,
                     const std::string &);

    /** Writes the inverted index (first argument) and the weight file (second). */
    void make_reverse_index(const std::string &,
                            const std::string &);

private:
    void compute_power();
    bool ifsame(const std::map<std::string, int> &,
                const std::map<std::string, int> &);
    void topten(int,
                std::ofstream &,
                const std::map<std::string, int> &);
    void weightogether();

    std::string libname_;        // library file the documents are read from
    std::string indexname_;      // index file giving each document's offset
    std::string stopfilename_;   // stop-word file
    std::string splitfilename_;  // file receiving the segmentation result

    std::set<std::string> set_stop;                               // loaded stop words
    std::map<int, std::string> map_doc;                           // docid -> document text
    std::map<int, std::map<std::string, int> > map_lib;           // docid -> top word -> count
    std::map<std::string, int> words;                             // all words with counts after dedup (per the original comment)
    std::map<int, std::map<std::string, double> > doc_word_wei;   // docid -> word -> weight
    std::map<std::string, std::map<int, double> > rev_index;      // inverted index: word -> docid -> weight
    std::map<std::string, std::map<int, int> > map_word_fre;      // word -> docid -> count
};
66 
67 #endif /*SPLIT_H*/ 
复制代码
View Code

split.cpp

复制代码
  1 #include "split.h"
  2 #include <stdio.h>
  3 using namespace std;
  4 using namespace CppJieba;
  5 
  6 void splitpagelib::startsplit(const string &dic_path, 
  7         const string &model_path)    
  8 {
  9     MixSegment  segment_(dic_path, model_path);
 10     //读入停用词到set_stop中去
 11     ifstream is_stop(stopfilename_.c_str());
 12     string stopword;
 13     while(getline(is_stop, stopword) > 0)
 14     {
 15         set_stop.insert(stopword);
 16     }
 17     //通过index来将lib中文档读出并分词,统计出现最高的10个 放入map中
 18     vector<string> words;//用来存放切割后的单词
 19     ifstream is_lib(libname_.c_str());
 20     ifstream is_index(indexname_.c_str());
 21     ofstream os_split(splitfilename_.c_str());
 22     string lib_line, index_line;
 23     cout << "begin to cut word :" << indexname_ 
 24          <<" " << libname_ << endl;
 25     while(getline(is_index, index_line) > 0)
 26     {
 27         map<string, int> map_;
 28         int docid, offset, size;
 29         istringstream ss(index_line);
 30         ss >> docid >> offset >> size;
 31         is_lib.seekg(offset);
 32         string every_doc = "";
 33         while(is_lib.tellg() < (offset + size))
 34         {
 35             getline(is_lib, lib_line);
 36             every_doc += lib_line + "\n";
 37             for(size_t i = 0; i < lib_line.size(); i++)
 38             {
 39                 if(!(lib_line[i] & (1 << 7)))
 40                     lib_line[i] = ' ';
 41             }
 42 
 43             segment_.cut(lib_line.c_str(), words);
 44             for(size_t i = 0; i < words.size(); i++)
 45             {
 46                 set<string>::iterator it = set_stop.find(words[i]);
 47                 if(it == set_stop.end() && words[i] != " ")//不去掉的话把空格当单词
 48                     ++map_[words[i]];
 49             }
 50             words.clear();
 51         }
 52         os_split << docid << endl;
 53         topten(docid, os_split, map_);
 54         os_split << endl;
 55         map_doc[docid] = every_doc;//将每篇文章读入map_doc
 56     }
 57     os_split.close();
 58     is_lib.close();
 59     is_index.close();
 60     cout << "cut word over :write to"  << splitfilename_ << endl;
 61 }
 62 
 63 void splitpagelib::topten(int docid,
 64         ofstream &os,
 65         const map<string, int> &map_)
 66 {
 67     priority_queue<qelem> prique_;
 68     qelem ql;
 69     for(map<string, int>::const_iterator it = map_.begin(); 
 70             it != map_.end(); 
 71             ++it)
 72     {
 73         ql.word = it->first;
 74         ql.frequence = it->second;
 75         prique_.push(ql);
 76     }
 77 
 78     int i = 0;
 79     while(i < 10 && !prique_.empty())
 80     {
 81         ql = prique_.top() ;
 82         map_lib[docid][ql.word] = ql.frequence;
 83         prique_.pop();
 84         i++;
 85         os << ql.word << " " << ql.frequence << " ";
 86     }
 87 }
 88 
 89 void splitpagelib::deletesame()
 90 {    //去重
 91     cout << "begin to delete the same" << endl;
 92     int *arr = new int [map_lib.size() + 1];
 93     for (size_t i = 1; i < map_lib.size() + 1; i++) 
 94     {
 95         arr[i] = 1;
 96     }
 97     size_t ix1, ix2;
 98     for(ix1 = 1; ix1 < map_lib.size() + 1; ++ix1)
 99     {
100         if(arr[ix1] == 0)
101             continue; 
102         for (ix2 = ix1 +1; ix2 < map_lib.size() + 1; ix2++) 
103         {
104             if(arr[ix2] == 0)
105                 continue;
106             if(ifsame(map_lib[ix1], map_lib[ix2]))
107             {
108                 arr[ix2] = 0;
109                 map_lib.erase(ix2);
110             }
111         }        
112     }
113     delete [] arr;
114     cout << "delsanme over" << endl;
115 }
116 
117 bool splitpagelib::ifsame(const map<string, int>& map1, 
118         const map<string, int>& map2)
119 {
120     int i = 0;
121     for(map<string, int>::const_iterator it = map2.begin(); it != map2.end(); ++it)
122     {
123         if(map1.count(it->first) > 0)
124             i++;
125     }
126     if(i > 6)
127         return true;
128     else
129         return false;
130 }
131 
132 void splitpagelib::newindexlib(const string& index,
133                         const string& lib)
134 {
135     ofstream os_index(index.c_str());
136     ofstream os_lib(lib.c_str());
137 
138     map<int, map<string, double> >::iterator it;
139     size_t i = 1;
140     cout << "begin write to index lib" << index 
141          << " " << lib <<  endl;
142     for(it = doc_word_wei.begin(); it != doc_word_wei.end(); it++)
143     {
144         os_index << i << " " << os_lib.tellp() << " " ;
145         
146         int begin = map_doc[it->first].find("<docid>");
147         int end = map_doc[it->first].find("</docid>");
148         char s[10] = {0};
149         sprintf(s, "%d", i);
150         map_doc[it->first].replace(begin + 7, end - begin - 7, s);
151         os_lib << map_doc[it->first] << endl;
152 
153         os_index << map_doc[it->first].size() << endl;//写index    
154         i++;
155     }
156     cout << "write over" << endl;
157     os_index.close();
158     os_lib.close();
159 }
160 
161 void splitpagelib::compute_power()
162 {
163     map<int, map<string, int> >::iterator it1;    
164     map<string, int>::iterator it2 ;
165     for(it1 = map_lib.begin(); it1 != map_lib.end(); it1++)
166     {
167         for(it2 = it1->second.begin(); it2 != it1->second.end(); ++it2)
168         {
169             map_word_fre[it2->first][it1->first] = it2->second;
170         }
171     }
172 
173     for(it1 = map_lib.begin(); it1 != map_lib.end(); it1++)
174     {
175         for(it2 = it1->second.begin(); it2 != it1->second.end(); ++it2)
176         {
177             double d = (it2->second) * log( (double)(map_word_fre.size()) / (double)(map_word_fre[it2->first].size() +1) );
178             doc_word_wei[it1->first][it2->first] = d;
179         }
180     }
181 }
182 
183 void splitpagelib::make_reverse_index(const string &index,
184                                         const string &power)
185 {
186     //计算每篇中的权重
187     compute_power();
188     //对每篇文档的词作归一化
189    weightogether();
190    //建立倒排索引
191    int j = 1;
192    for(map<int, map<string, double> >::iterator it = doc_word_wei.begin();
193            it != doc_word_wei.end();
194            it++)
195    {
196        for(map<string, double>::iterator it1 = it->second.begin(); 
197                it1 != it->second.end(); 
198                ++it1)
199        {
200            rev_index[it1->first][j] = it1->second;  
201        }
202        j++;
203    }
204     cout << "write reverse index" << index <<endl;
205     ofstream os_index(index.c_str());
206     for(map<string, map<int, double> >::iterator it = rev_index.begin(); 
207             it != rev_index.end(); 
208             ++it)
209     {
210         os_index << it->first << endl;
211         for(map<int, double> ::iterator it1 = it->second.begin(); 
212                 it1 != it->second.end(); 
213                 ++it1)
214         {
215             os_index << it1->first << " " << it1->second << " ";
216         }
217         os_index << endl;
218     }
219     os_index.close();
220     cout << "write reverse index over" << endl;
221 
222     cout << "write doc_word_wei begin" << power << endl;
223     ofstream os_power(power.c_str());
224     map<int, map<string, double> >::iterator it = doc_word_wei.begin();
225     map<string, double>::iterator it1;
226     int i = 1;
227     for(; it != doc_word_wei.end(); ++it)
228     {
229         os_power << i << endl;
230         for(it1 = it->second.begin(); it1 != it->second.end(); ++it1)
231         {
232             os_power << it1->first << " " << it1->second << " " ;
233         }
234         os_power << endl;
235         i++; 
236     }
237     os_power.close();
238     cout << "write power end" << endl;
239 }
240 
241 void splitpagelib::weightogether()//归一化
242 {
243     map<int, map<string, double> >::iterator it;
244     map<string, double>::iterator it1;
245     double s;
246     for(it = doc_word_wei.begin(); it != doc_word_wei.end(); it++)
247     {
248         s = 0;
249         for(it1 = it->second.begin(); it1 != it->second.end(); it1++)
250         {
251             s += it1->second * it1->second;
252         }
253 
254         for(it1 = it->second.begin(); it1 != it->second.end(); it1++)
255         {
256             it1->second /= sqrt(s);
257         }
258       }
259 }
复制代码
View Code

第4阶段:对查询语句进行分词,根据倒排索引找出文档的交集,将查询句的单词进行归一化,与查询结果计算文档相似度,从大到小返回给客户
        注意的地方:根据索引按行读文档时在读出的结果后面加上'\n',

query.h

 

复制代码
 1 #ifndef _QUERY_H_
 2 #define _QUERY_H_
 3 
 4 #include <map>
 5 #include "MixSegment.hpp"
 6 #include <string>
 7 #include <fstream>
 8 #include <sstream>
 9 #include <queue>
10 #include <algorithm>
11 
/**
 * A document id with its similarity score. Entries compare by score
 * only, so a std::priority_queue<simi> hands out the best match first.
 */
typedef struct simi
{
    int docid;
    double simidegre;

    /** True when this entry's score is lower than `right`'s. */
    bool operator<(const simi &right) const
    {
        return right.simidegre > simidegre;
    }
} simi;
21 
22 class query
23 {
24     public:
25         query(){}
26      void read_to_map(const std::string&,
27            const std::string&,
28            const std::string&,
29            const std::string&);
30      std::string search(const std::string&);
31     private:
32      static void similar(std::map<std::string, double>&,
33                          std::map<std::string, double>&,
34                          int,
35                          std::priority_queue<simi>&);
36      
37       std::string offsetfile_;
38       std::string reverseindexfile_;
39       std::string docfile_;
40       std::string doc_weifile_;
41      
42       std::map<std::string, std::map<int, double> > map_reindex;//存储倒排索引
43       std::map<int, std::map<std::string, double> > map_wei;
44       std::map<int, std::string> map_doc;
45 };
46 
47 #endif /*QUERY_H*/ 
复制代码
View Code

 

query.cpp

复制代码
  1 #include "query.h"
  2 
  3 using namespace std;
  4 using namespace CppJieba;
  5 
  6 void query::read_to_map(
  7          const string &offsetfile,
  8         const string &reverseindexfile,
  9         const string &docfile,
 10         const string &doc_weifile)
 11 {
 12   offsetfile_ = offsetfile;
 13   reverseindexfile_ = reverseindexfile;
 14   doc_weifile_ = doc_weifile;
 15   docfile_ = docfile;
 16 
 17     //读倒排索引文件
 18     ifstream is_revindex(reverseindexfile_.c_str());
 19     string line;
 20     while(getline(is_revindex, line) > 0)
 21     {
 22         string word;
 23         word = line;
 24         getline(is_revindex, line) ;
 25         istringstream sstream(line);
 26         int docid;
 27         double weight;
 28         while(sstream >> docid >> weight)
 29         {
 30             map_reindex[word][docid] = weight;
 31         }
 32     }
 33     is_revindex.close();
 34 
 35     //读偏移索引文件和doc文件,将每篇文档读入map
 36     ifstream is_offset(offsetfile_.c_str());
 37     ifstream is_doc(docfile_.c_str());
 38     string line_off;
 39     while(getline(is_offset, line_off) > 0)
 40     {
 41         istringstream sstream(line_off);
 42         int docid;
 43         int offset;
 44         int size;
 45         sstream >> docid >> offset >> size;
 46         is_doc.seekg(offset);
 47         string every_doc = "";
 48         while(is_doc.tellg() < (offset + size))
 49         {
 50             string line;
 51             getline(is_doc, line);
 52             every_doc += line + "\n";
 53         }
 54         map_doc[docid] = every_doc;
 55     }
 56     is_offset.close();
 57     is_doc.close();
 58     //读每篇doc归一化后的top10词文件
 59     ifstream is_map_wei(doc_weifile_.c_str());
 60     string line_wei;
 61     while(getline(is_map_wei, line_wei) > 0)
 62     {
 63         int docid;
 64         istringstream ss(line_wei);
 65         ss >> docid;
 66         getline(is_map_wei, line_wei);
 67         istringstream sstream(line_wei);
 68         string word;
 69         double weight;
 70         while(sstream >>  word >> weight)
 71         {
 72             map_wei[docid][word] = weight;
 73         }
 74     }
 75     is_map_wei.close();
 76 }
 77 
 78 string query::search(const std::string &word)
 79 {
 80     priority_queue<simi> result_doc;
 81     //切词,算出单词权重存储到一个map,相当于一片doc
 82     MixSegment segment("../dict/jieba.dict.utf8",
 83             "../dict/hmm_model.utf8");
 84     vector<string> words;
 85     map<string, double> map_que;
 86     segment.cut(word, words);
 87     double d = 0;
 88     for (size_t i = 0; i < words.size(); i++) 
 89     {
 90         double s = 1 * log( (double)map_reindex.size() / (double)(map_reindex[words[i]].size()+1) );
 91         cout << words[i] << s << endl;
 92         map_que[ words[i] ] = s;
 93         d += s*s;
 94     }
 95     for(size_t i = 0; i < words.size(); i++)
 96     {
 97         map_que[ words[i] ] /= sqrt(d);
 98     }
 99     //找出单词交集的docid 
100     map<int, double> com(map_reindex[ words[0] ]);
101 
102     for (size_t i = 1; i < words.size(); i++) 
103     {
104         if(map_reindex[ words[i] ].size() == 0)
105             continue;
106         for(map<int, double>::iterator it = com.begin(); it != com.end(); ++it)
107         {
108             if(map_reindex[ words[i] ].count(it->first) == 0)
109                 com.erase(it->first);
110         }
111     }
112     cout << com.size() << endl;
113     for(map<int, double>::iterator it = com.begin(); it != com.end(); ++it)
114     {
115         similar(map_que, map_wei[it->first], it->first, result_doc);
116     }
117 
118     simi si;
119     string result = "";
120     while(!result_doc.empty())
121     {
122         si = result_doc.top();
123         char s[20] = {0};
124         sprintf(s, "%d %f\n", si.docid, si.simidegre);
125         result += s + map_doc[si.docid];
126         result_doc.pop();
127     }
128     return result;
129 }
130 
131 void query::similar(map<string, double> &map_doc1,
132                     map<string, double> &map_doc2,
133                     int docid,
134                     priority_queue<simi> &result_doc)
135 {
136     double s = 0;
137     for(map<string, double>::iterator it = map_doc1.begin();
138                                     it != map_doc1.end();
139                                         ++it)
140     {
141         if(map_doc2.count(it->first) > 0)
142         {
143             s += it->second * map_doc2[it->first]; 
144         }
145     }
146     simi si;
147     si.docid = docid;
148     si.simidegre = s;
149     result_doc.push(si);
150 }
复制代码
View Code

 第5阶段:可将返回结果做成xml格式返回给客户,直接以网页格式显示,

        一点点想法:可以结合文本纠错,使得查询更加智能一点

你可能感兴趣的:(搜索引擎源码及流程)