因为之前做了一个网站,完成了简单的站内搜索的功能,但是效率太低,每次都要从数据库中提取数据出来。于是花了一点时间改善了这个站内搜索的功能,通过倒排索引和正排索引来实现搜索功能。
注意:本项目用到的库都需要自行安装(cppjieba,jsoncpp,brpc,gflags,protobuf,leveldb),后三个是brpc要求安装的
思路:
一.首先实现一个制作索引的线下程序
正排索引文件
倒排索引文件
下面是制作索引的程序代码
word_segmentation.hpp(分词头文件)
#pragma once
#include "/home/pigff/third-part-lib/cppjieba/include/cppjieba/Jieba.hpp"
#include
#include
#include
using std::cout;
using std::endl;
using std::string;
using std::vector;
// Dictionary for max-probability segmentation (MPSegment: Max Probability).
const char * const DICT_PATH = "/home/pigff/third-part-lib/cppjieba/dict/jieba.dict.utf8";
// Model file for HMM segmentation (HMMSegment: Hidden Markov Model).
const char * const HMM_PATH = "/home/pigff/third-part-lib/cppjieba/dict/hmm_model.utf8";
// User-defined dictionary (custom words merged into the main dictionary).
const char * const USER_DICT_PATH = "/home/pigff/third-part-lib/cppjieba/dict/user.dict.utf8";
// Inverse-document-frequency table shipped with cppjieba.
const char* const IDF_PATH = "/home/pigff/third-part-lib/cppjieba/dict/idf.utf8";
// Stop-word list.
const char* const STOP_WORD_PATH = "/home/pigff/third-part-lib/cppjieba/dict/stop_words.utf8";
// Tokenizer wrapping the cppjieba segmentation library.
class WordSegmentation
{
public:
// Loads every dictionary once; Jieba construction is expensive,
// so one instance should be reused rather than built per call.
WordSegmentation()
:_jieba(DICT_PATH, HMM_PATH, USER_DICT_PATH, IDF_PATH, STOP_WORD_PATH)
{}
// Returns the full-mode segmentation of `str` (every possible word,
// maximizing recall for index building).  Takes const& to avoid a copy.
vector<string> operator()(const string& str){
    vector<string> words;
    _jieba.CutAll(str, words);  // FullSegment: overlapping cuts allowed
    return words;
}
private:
cppjieba::Jieba _jieba;
};
build_index.cc(制作索引程序)
#include "word_segmentation.hpp"
#include
#include
#include
#include
#include
#include
#include
#include
// Column-parallel storage for the news rows fetched from MySQL:
// element i of each vector belongs to the same article.
struct All_News_Info{
    std::vector<std::string> ids;
    std::vector<std::string> titles;
    std::vector<std::string> authors;
    std::vector<std::string> contents;
    std::vector<std::string> dates;
};
//从数据库拿取数据
//数据来源是新闻的标题和内容
// Pulls every news row (id, title, author, content, date) out of the
// `news` database into `infos`; the parallel vectors share one index
// per row.  On any MySQL error the function logs and returns early.
void SearchData(All_News_Info& infos){
    MYSQL* conn = mysql_init(NULL);
    if(conn == NULL){
        // mysql_init fails only on OOM; nothing to clean up yet.
        // (Original dereferenced conn before this check.)
        cout << "Error: mysql_init failed" << endl;
        return;
    }
    // Use utf8 so Chinese text round-trips correctly.
    mysql_set_character_set(conn, "utf8");
    if(mysql_real_connect(conn, "localhost", "root", "1", "news", 0, NULL, 0) == NULL){
        cout << "Error " << mysql_errno(conn) << ": " << mysql_error(conn);
        mysql_close(conn);
        return;
    }
    if(mysql_query(conn, "select id,title,author,content,createdate from news") != 0){
        cout << "Error " << mysql_errno(conn) << ": " << mysql_error(conn);
        mysql_close(conn);
        return;
    }
    MYSQL_RES* result = mysql_store_result(conn);
    MYSQL_ROW row;
    while((row = mysql_fetch_row(result))){
        // NOTE(review): assumes no column is NULL — verify the schema,
        // otherwise row[i] could be a null pointer here.
        infos.ids.push_back(row[0]);
        infos.titles.push_back(row[1]);
        infos.authors.push_back(row[2]);
        infos.contents.push_back(row[3]);
        infos.dates.push_back(row[4]);
    }
    // Release the result set and the connection (the original leaked both).
    mysql_free_result(result);
    mysql_close(conn);
}
//存储索引(正排索引和倒排索引)
//正排索引(文档id->文档的全部信息,用来查看文档中关键词的出现次数)
//根据关键词在不同文档中的出现次数,排序搜索结果,次数越高的关联越大)
//倒排索引(关键词->文档id,用来查看关键词在哪些文档中出现过)
void Save_index(const All_News_Info& infos){
WordSegmentation wordSeg;
vector results;
std::unordered_map> inverted_index;
std::unordered_map> forward_index;
for(size_t i = 0;i < infos.titles.size();++i){
//把有关标题,作者和内容的分词结果放到倒排索引中
results = wordSeg(infos.titles[i]);
for(auto it:results)
inverted_index[it].insert(infos.ids[i]);
results = wordSeg(infos.contents[i]);
for(auto it:results)
inverted_index[it].insert(infos.ids[i]);
results = wordSeg(infos.authors[i]);
for(auto it:results)
inverted_index[it].insert(infos.ids[i]);
//把对应id下的所有信息放到正排索引中
forward_index[infos.ids[i]]["title"] = infos.titles[i];
forward_index[infos.ids[i]]["author"] = infos.authors[i];
forward_index[infos.ids[i]]["content"] = infos.contents[i];
forward_index[infos.ids[i]]["date"] = infos.dates[i];
}
//将json数据保存到文件中
Json::Value root1,root2;
for(auto it:inverted_index){
string str = "";
for(auto it2 :it.second)
str += it2 + " ";
root1[it.first] = str;
}
for(auto it:forward_index){
Json::Value partner;
for(auto it2:it.second)
partner[it2.first] = it2.second;
root2[it.first] = partner;
}
Json::StyledWriter sw;
std::ofstream os1,os2;
os1.open("inverted_index.json");
os2.open("forward_index.json");
os1 << sw.write(root1);
os2 << sw.write(root2);
os1.close();
os2.close();
}
int main(){
All_News_Info infos;
SearchData(infos);
Save_index(infos);
return 0;
}
二.封装对索引文件的操作接口(为之后的索引服务器提供)
下面是设计接口的程序代码
search_engine.h
#pragma once
#include
#include
#include
#include
#include
#include
#include
#include "word_segmentation.hpp"
using std::string;
// Minimal record handed back to search clients: only the fields needed
// to render one result row (content is intentionally omitted).
struct News_Info{
    std::string title;
    std::string author;
    std::string date;
    // Takes by value and moves, so both lvalue and rvalue callers are cheap.
    News_Info(std::string title_, std::string author_, std::string date_)
        : title(std::move(title_)),
          author(std::move(author_)),
          date(std::move(date_))
    {}
};
void Load( std::unordered_map>& inverted_index,
std::unordered_map>& forward_index);
bool Find(std::unordered_map> inverted_index,
string keyword,std::unordered_map& ids);
vector > Sort(std::unordered_map ids);
vector ReturnInfo(const vector >& sort_ids,
std::unordered_map> forward_index);
search_engine.cc
#include "search_engine.h"
//将索引文件加载到哈希表
void Load( std::unordered_map>& inverted_index,
std::unordered_map>& forward_index){
Json::Reader reader;
Json::Value value;
std::ifstream is1,is2;
is1.open("/home/pigff/project/search_engine/inverted_index.json");
is2.open("/home/pigff/project/search_engine/forward_index.json");
reader.parse(is1,value);
vector names = value.getMemberNames();
for(auto it:names){
vector v;
string tmp = value[it].asString();
boost::algorithm::split(v,tmp,boost::algorithm::is_space());
for(auto it2:v)
inverted_index[it].insert(it2);
}
value.clear();
reader.parse(is2,value);
names.clear();
names = value.getMemberNames();
for(auto it:names){
vector names_names = value[it].getMemberNames();
for(auto it2:names_names)
forward_index[it].insert(make_pair(it2,value[it][it2].asString()));
}
}
//对一个搜索的关键词进行分词
bool Find(std::unordered_map> inverted_index,string keyword,std::unordered_map& ids){
//对给进来的关键字进行分词
WordSegmentation wordSeg;
vector results = wordSeg(keyword);
for(auto it:results){
if(inverted_index[it].empty())
continue;
else{
for(auto it2: inverted_index[it]){
//下面的判断是因为对文件的分割结果有可能有一个是空
if(!it2.empty())
ids[it2]++;
}
}
}
if(ids.empty())
return false;
return true;
}
//对搜索出的新闻结果按照匹配度排序
//本质上就是对一个map进行按value的排序
//因为sort只可以对顺序容器进行排序
//所以我们要先把数据存到一个vector中
//采用的方式是对sort算法采用一个仿函数Compare
// Orders (id, hit-count) pairs so the most relevant document — the one
// with the highest hit count — comes first.  The surrounding comments
// state that a higher count means a stronger match, but the original
// `<` sorted ascending, putting the best result last.
class Compare{
public:
    bool operator()(const std::pair<std::string, int>& x,
                    const std::pair<std::string, int>& y) {
        return x.second > y.second;
    }
};
// Converts the id -> hit-count map into a vector and sorts it so the
// highest hit count (most relevant document) comes first.  std::sort
// needs random access, hence the copy into a vector; the original
// sorted ascending, which put the best match last.
std::vector<std::pair<std::string, int>> Sort(std::unordered_map<std::string, int> ids){
    std::vector<std::pair<std::string, int>> ranked(ids.begin(), ids.end());
    std::sort(ranked.begin(), ranked.end(),
              [](const std::pair<std::string, int>& a,
                 const std::pair<std::string, int>& b){
                  return a.second > b.second;
              });
    return ranked;
}
//根据最终排序好的id取出对应的结构体数组
vector ReturnInfo(const vector >& sort_ids,
std::unordered_map> forward_index){
vector ret;
for(auto i:sort_ids){
News_Info news_info(forward_index[i.first]["title"],
forward_index[i.first]["author"],
forward_index[i.first]["date"]);
ret.push_back(news_info);
}
return ret;
}
三.制作搜索引擎服务器以及搜索客户端接口
下面是具体程序代码
SG.proto
syntax = "proto2";
// Tell protoc to also generate the C++ Service base class
// (needed to implement the server side with brpc).
option cc_generic_services = true;
package SG; // package name; becomes the C++ namespace of generated code
// A search query: the raw keyword string typed by the user.
message Request {
optional string keyword = 1;
};
// A search reply: zero or more matching news records, already
// ordered by relevance on the server side.
message Response {
repeated Info info = 1;
};
// One news record (article content is intentionally not returned).
message Info{
optional string title = 1;
optional string author = 2;
optional string date = 3;
};
// The RPC surface: a single unary Search call.
service Service {
rpc Search(Request) returns (Response);
};
server.cc
#include
#include
#include
#include
#include "/home/pigff/project/search_engine/search_engine.h"
#include "SG.pb.h"
// Command-line flags for the server; override with --flag=value.
// When true, echo the request attachment back in the response.
DEFINE_bool(attachment, true, "Echo attachment as well");
// TCP port the search server listens on.
DEFINE_int32(port, 9999, "TCP Port of this server");
// -1 disables the idle-connection reaper.
DEFINE_int32(idle_timeout_s, -1, "Connection will be closed if there is no "
"read/write operations during the last `idle_timeout_s'");
DEFINE_int32(logoff_ms, 2000, "Maximum duration of server's LOGOFF state "
"(waiting for client to close connection before server stops)");
namespace Search{
// Implementation of the SG::Service base class generated from SG.proto.
// Holds both indexes in memory so every request is served without disk
// or database access.
class SearchService : public SG::Service{
public:
    SearchService(){
        // Load both index files once at startup.
        Load(inverted_index, forward_index);
    }
    void Search(google::protobuf::RpcController* cntl_base,
                const SG::Request* req,
                SG::Response* resp,
                google::protobuf::Closure* done) override {
        // Guarantees done->Run() is invoked on every return path.
        brpc::ClosureGuard done_guard(done);
        brpc::Controller* cntl = static_cast<brpc::Controller*>(cntl_base);
        // Trace how clients interact with the server.
        LOG(INFO) << "Received request[log_id=" << cntl->log_id()
                  << "] from " << cntl->remote_side()
                  << " to " << cntl->local_side()
                  << ": " << req->keyword()
                  << " (attached=" << cntl->request_attachment() << ")";
        // Look up each token of the query in the inverted index.
        std::unordered_map<std::string, int> ids;
        // No match: return with all response fields unset (empty reply).
        if(Find(inverted_index, req->keyword(), ids) == false)
            return;
        // Rank by hit count, then resolve ids to full records.
        std::vector<std::pair<std::string, int>> sort_ids = Sort(ids);
        std::vector<News_Info> infos = ReturnInfo(sort_ids, forward_index);
        // Fill the response in ranking order (the original ran two loops:
        // one adding empty slots, one filling them).
        for(const auto& item : infos){
            SG::Info* info = resp->add_info();
            info->set_title(item.title);
            info->set_author(item.author);
            info->set_date(item.date);
        }
        if(FLAGS_attachment){
            // The attachment travels raw on the wire, bypassing protobuf
            // serialization.
            cntl->response_attachment().append(cntl->request_attachment());
        }
    }
private:
    std::unordered_map<std::string, std::set<std::string>> inverted_index;
    std::unordered_map<std::string, std::unordered_map<std::string, std::string>> forward_index;
};
}//end namespace
int main(int argc,char* argv[]){
daemon(1,1);
//解析GFLAGS
gflags::ParseCommandLineFlags(&argc,&argv,true);
//服务器对象
brpc::Server server;
//proto服务的实例
Search::SearchService search_service;
//将服务添加到服务器中
//第二个参数是因为服务放在堆栈上,我们不希望服务器删除它
//如果想要删除可以用brpc::SERVER_OWNS_SERVICE
if(server.AddService(&search_service,brpc::SERVER_DOESNT_OWN_SERVICE) != 0){
LOG(ERROR) << "Fail to start SearchServer";
return -1;
}
// Start the server.
brpc::ServerOptions option;
option.idle_timeout_sec = FLAGS_idle_timeout_s;
if (server.Start(FLAGS_port, &option) != 0) {
LOG(ERROR) << "Fail to start EchoServer";
return -1;
}
//直到按下Ctrl-c,才停止服务器
server.RunUntilAskedToQuit();
return 0;
}
client.h
#pragma once
#include
#include
#include
#include
#include
#include
#include "/home/pigff/project/search_server/SG.pb.h"
#include "/home/pigff/project/search_engine/search_engine.h"
using std::string;
// Flags are DEFINEd in client.cc; declared here so header users can
// read them.  See client.cc for each flag's default and meaning.
DECLARE_string(protocol);
DECLARE_string(search_attachment);
DECLARE_string(connection_type);
DECLARE_string(search_server);
DECLARE_string(load_balancer);
DECLARE_int32(timeout_ms);
DECLARE_int32(max_retry);
DECLARE_int32(interval_ms);
DECLARE_string(http_content_type);
// Thin search client: owns one configured brpc channel and issues
// synchronous Search RPCs against the search server.
class Client{
public:
    Client();
    // Sends `keyword` to the server; returns the ranked results,
    // empty when the RPC fails.
    vector<News_Info> Return(string keyword);
private:
    // RPC channel to the search server.
    brpc::Channel channel;
    // Channel configuration (protocol, timeouts, retries).
    // (The original comment here — "proto service instance" — was a
    // copy-paste mistake.)
    brpc::ChannelOptions options;
};
client.cc
#include "client.h"
// Command-line flags for the client; override with --flag=value.
DEFINE_string(protocol, "baidu_std", "Protocol type. Defined in src/brpc/options.proto");
DEFINE_string(search_attachment, "foo", "Carry this along with requests");
DEFINE_string(connection_type, "", "Connection type. Available values: single, pooled, short");
DEFINE_string(search_server, "0.0.0.0:9999", "IP Address of server");
DEFINE_string(load_balancer, "", "The algorithm for load balancing");
DEFINE_int32(timeout_ms, 3000, "RPC timeout in milliseconds");
DEFINE_int32(max_retry, 3, "Max retries(not including the first RPC)");
DEFINE_int32(interval_ms, 5000, "Milliseconds between consecutive requests");
DEFINE_string(http_content_type, "application/json", "Content type of http request");
// Configures and initializes the channel from gflags.  One Client
// (and its channel) is meant to be reused for every request.
Client::Client(){
    options.protocol = FLAGS_protocol;
    options.connection_type = FLAGS_connection_type;
    options.timeout_ms = FLAGS_timeout_ms;
    options.max_retry = FLAGS_max_retry;
    // Init failure previously went unnoticed; without this check every
    // later RPC would fail with a confusing error.
    if(channel.Init(FLAGS_search_server.c_str(),
                    FLAGS_load_balancer.c_str(), &options) != 0){
        LOG(ERROR) << "Fail to initialize channel";
    }
}
vector Client::Return(string keyword){
// Normally, you should not call a Channel directly, but instead construct
// a stub Service wrapping it. stub can be shared by all threads as well.
SG::Service_Stub stub(&channel);
SG::Request req;
SG::Response resp;
brpc::Controller cntl;
req.set_keyword(keyword);
if (FLAGS_protocol != "http" && FLAGS_protocol != "h2c") {
// Set attachment which is wired to network directly instead of
// being serialized into protobuf messages.
cntl.request_attachment().append(FLAGS_search_attachment);
} else {
cntl.http_request().set_content_type(FLAGS_http_content_type);
}
// Because `done'(last parameter) is NULL, this function waits until
// the response comes back or error occurs(including timedout).
stub.Search(&cntl, &req, &resp, NULL);
vector v;
if (!cntl.Failed()) {
//写回数据给客户端
for(auto i: resp.info()){
News_Info info(i.title(),i.author(),i.date());
v.push_back(info);
}
}else
LOG(WARNING) << cntl.ErrorText();
return v;
}