百度,搜狐,360等搜索引擎;
boost的官网是没有站内搜索的。
爬虫程序我就不做了,受国家的法律法规的限制,我就通过正规的下载途径来做。
目标文档进行分词(目的:方便建立倒排索引和查找):
关键字(具有唯一性) | 文档,weight(权重) |
雷军 | 文档1,文档2 |
买 | 文档1 |
四斤 | 文档1 |
小米 | 文档1,文档2 |
四斤小米 | 文档1 |
发布 | 文档2 |
小米手机 | 文档2 |
模拟一次查找的过程:
#include <iostream>
#include <string>
#include <vector>
#include <fstream>
#include <boost/filesystem.hpp>
#include "util.hpp"
const std::string src_path = "data/input/"; // directory holding all the raw downloaded html files
const std::string output = "data/raw_html/raw.txt"; // parser output: every cleaned html doc, one per line
// One parsed HTML document produced by the parser stage.
// (Was `typedef struct DocInfo54 {...} DocInfo_t;` — a C-ism with a garbled
// tag name; a plain C++ struct keeps the `DocInfo_t` name callers use.)
struct DocInfo_t
{
    std::string title;   // document title (text between <title> tags)
    std::string content; // document body with all HTML tags stripped
    std::string url;     // url of this document on the official boost site
};
bool EnumFile(const std::string &src_path,std::vector *file_list);
bool ParseHtml(std::vector& file_list,std::vector *results);
bool SaveHtml(std::vector& results,const std::string &output);
// Extract the text between <title> and </title> from one html document.
// Returns false when either tag is missing or malformed.
// (The tag literals had been stripped to "" by an html-unaware copy; restored.)
static bool ParseTitle(const std::string &result, std::string *title)
{
    const std::string open_tag = "<title>";
    const std::string close_tag = "</title>";
    size_t begin = result.find(open_tag);
    if (begin == std::string::npos)
    {
        return false;
    }
    // search from the opening tag so a stray earlier "</title>" can't match
    size_t end = result.find(close_tag, begin);
    if (end == std::string::npos)
    {
        return false;
    }
    begin += open_tag.size();
    if (begin > end)
    {
        return false;
    }
    *title = result.substr(begin, end - begin);
    return true;
}
// Strip HTML tags from one document with a tiny two-state machine:
// LABEL = inside a tag (skip chars), CONTENT = outside (collect chars).
// Newlines become spaces so the whole doc stays on one output line.
// (Fixed the "LABLE" misspelling; behavior unchanged.)
static bool ParseContent(const std::string &file, std::string *content)
{
    enum status
    {
        LABEL,
        CONTENT
    };
    // html starts with a tag, so begin in the LABEL state
    enum status s = LABEL;
    for (auto e : file)
    {
        switch (s)
        {
        case LABEL:
            if (e == '>') // '>' closes the current tag
                s = CONTENT;
            break;
        case CONTENT:
            if (e == '<') // '<' opens a new tag
                s = LABEL;
            else
            {
                if (e == '\n') e = ' '; // '\n' is reserved as the doc separator later
                *content += e;
            }
            break;
        default:
            break;
        }
    }
    return true;
}
// Map a local file path under src_path to its page on the Boost 1.79 doc site
// by swapping the local prefix for the official url prefix.
static bool ParseUrl(const std::string &file, std::string *url)
{
    static const std::string kUrlHead = "https://www.boost.org/doc/libs/1_79_0/doc/html/";
    *url = kUrlHead + file.substr(src_path.size());
    return true;
}
int main()
{
//第一步拿到所有文件名
std::vector files_list;
if(!EnumFile(src_path,&files_list))
{
std::cerr<<"enum file name error"< results;
if(!ParseHtml(files_list,&results))
{
std::cerr<<"parse is error"< *file_list) //拿到所有html文件名
{
namespace fs = boost::filesystem;
fs::path root_path(src_path); //创建一个路径名对象
if(!fs::exists(root_path)) //根据路径创建的对象不存在
{
std::cerr<path().extension() != ".html")
{
continue;
}
//测试
//std::cout<<"debug"<path().string()<push_back(it->path().string());
}
return true;
}
void ShowInfo(const DocInfo_t &doc)
{
std::cout<& file_list,std::vector *results)//拿到所有html的标题,内容,url
{
for(const auto file : file_list)
{
//1读取文件
std::string result;
if(!ns_util::FileUtil::ReadFile(file,&result))
{
//文件读取失败
continue;
}
DocInfo_t doc;
//2提取标签
if(!ParseTitle(result,&doc.title))
{
continue;;
}
//3提取内容
if(!ParseContent(result,&doc.content))
{
continue;
}
//4提取url
if(!ParseUrl(file,&doc.url))
{
continue;
}
//将结果出入到vector,这里有拷贝问题,以后在优化
results->push_back(std::move(doc)); //采用右值,资源转移
//for debug
//ShowInfo(doc);
//break;
}
return true;
}
bool SaveHtml(std::vector& results,const std::string &output)
{
#define SEP '\3'
std::ofstream of(output,std::ios::out | std::ios::binary);
if(!of.is_open())
{
std::cerr<<"open"<
在进行遍历的时候,只要碰到了 `>`,就意味着当前的标签处理完毕;只要碰到了 `<`,就意味着新的标签开始了。
构建URL
将解析内容写入文件中
#pragma once
#include
#include
#include
#include
#include
#include "util.hpp"
#include
#include"log.hpp"
namespace ns_index
{
struct DocInfo
{
std::string title; //文档标题
std::string content; //文档对应的去标签之后的内容
std::string url; //官网的url
uint64_t doc_id; //文档的id
};
struct InvertedElem
{
uint64_t doc_id;
std::string word;
int weigth;
};
//倒排拉链
typedef std::vector InvertedList;
class Index
{
private:
std::vector forward_index; //正排索引
std::unordered_map inverted_index; //倒排索引
static Index *Instance;
static std::mutex mtx;
private:
Index() = default;
Index(const Index &) = delete;
Index &operator=(const Index &) = delete;
public:
~Index() = default;
static Index *GetInstance()
{
if (nullptr == Instance)
{
mtx.lock();
if (nullptr == Instance)
{
Instance = new Index();
}
mtx.unlock();
}
return Instance;
}
//根据doc_id找到文档内容
DocInfo *GetForWardIndex(uint64_t doc_id)
{
if (doc_id >= forward_index.size())
{
std::cerr << "doc_id is error" << std::endl;
return nullptr;
}
return &forward_index[doc_id];
}
//根据关键字string,获得倒排拉链
InvertedList *GetInvertedList(const std::string &word)
{
auto it = inverted_index.find(word);
if (it == inverted_index.end())
{
std::cerr << word << "have no InvertedList" << std::endl;
return nullptr;
}
return &(it->second);
}
//根据去标签,格式化之后的文档,构建正排索引和倒排索引
// data/raw_html/raw.txt
bool BuildIndex(const std::string &input)
{
std::ifstream in(input, std::ios::in | std::ios::binary);
if (!in.is_open())
{
std::cerr << "sorry" << input << "open sorry" << std::endl;
return false;
}
std::string line;
int count = 0;
while (std::getline(in, line))
{
DocInfo *doc = BuildForwardIndex(line); //构建正排
if (doc == nullptr)
{
std::cerr << "build" << line << std::endl; // for debug
continue;
}
BuildInvertedIndex(*doc);
count++;
if(count % 50 == 0) //std::cout<<"当前已经建立的索引文档:"< results;
const std::string seq = "\3";
ns_util::StringUtil::Split(line, &results, seq);
if (results.size() != 3)
{
return nullptr;
}
// 2将字符串进行填充到DocInfo
DocInfo doc;
doc.title = results[0];
doc.content = results[1];
doc.url = results[2];
doc.doc_id = forward_index.size();
// 3插入到正排索引的vector中
forward_index.push_back(std::move(doc));
return &forward_index.back();
}
bool BuildInvertedIndex(const DocInfo &doc)
{
// word 倒排拉链
struct word_cnt
{
/* data */
int title_cnt;
int content_cnt;
word_cnt() : title_cnt(0), content_cnt(0) {}
};
std::unordered_map word_map;
//对标题进行分词
std::vector title_words;
ns_util::JiebaUtil::CurString(doc.title, &title_words);
for (auto s : title_words)
{
boost::to_lower(s); //转化成小写
word_map[s].title_cnt++;
}
//对文档内容进行分词
std::vector contnet_word;
ns_util::JiebaUtil::CurString(doc.content, &contnet_word);
for (auto s : contnet_word)
{
boost::to_lower(s); //转化成小写
word_map[s].content_cnt++;
}
#define X 10
#define Y 1
for (auto &word_pair : word_map)
{
InvertedElem item;
item.doc_id = doc.doc_id;
item.word = word_pair.first;
//相关性
item.weigth = X * word_pair.second.title_cnt + Y * word_pair.second.content_cnt;
InvertedList &inverted_list = inverted_index[word_pair.first];
inverted_list.push_back(std::move(item));
}
return true;
}
};
Index* Index::Instance = nullptr;
std::mutex Index::mtx;
}
#pragma once
#include "index.hpp"
#include
#include
// struct Com
// {
// bool operator>(const InvertedElem& e1,const InvertedElem& e2)
// {
// return e1.weigth > e2.weigth;
// }
// }
// One aggregated search hit: a document together with every query word that
// matched it and the summed relevance weight.
// (Restored the stripped template argument on `words`.)
struct InvertedElemPrint
{
    uint64_t doc_id;                // id of the matched document
    int weight;                     // sum of the per-word weights for this doc
    std::vector<std::string> words; // all query words that hit this doc
    InvertedElemPrint() : doc_id(0), weight(0) {}
};
namespace ns_searcher
{
class Searcher
{
private:
ns_index::Index *index;
public:
Searcher() = default;
~Searcher() = default;
void InitSearcher(const std::string &input)
{
// 1.获取或者创建index对象
index = ns_index::Index::GetInstance(); //获得单例
//std::cout << "获取单例成功" << std::endl;
LOG(NORMAL, "获取index单例成功...");
// 2.根据index对象建立索引
index->BuildIndex(input);
// std::cout << "建立正排和倒排索引成功...." << std::endl;
LOG(NORMAL, "建立正排和倒排索引成功...");
}
std::string GetDesc(const std::string &html_src, const std::string &word)
{
const int prev_step = 50;
const int next_step = 100;
//找到首次出现的位置
// std::size_t pos = html_src.find(word); //错误原文档没有忽略大小写
auto it = std::search(html_src.begin(), html_src.end(), word.begin(), word.end(),
[](int a, int b)
{ return std::tolower(a) == std::tolower(b); });
int pos = std::distance(html_src.begin(), it);
if (pos == std::string::npos)
{
return "None1"; //不存在这种情况
}
// 2获取start end
int start = 0;
int end = html_src.size() - 1;
if (pos > start + prev_step)
start = pos - prev_step;
if (pos < end - next_step)
end = pos + next_step;
if (start >= end)
return "None2";
return html_src.substr(start, end - start) + "...";
}
// query:搜索关键字
// josn_string:返回给用户的搜索结果
void Search(const std::string &query, std::string *json_string)
{
// 1.[分词]:对我们的query进行按照searcher的要求进行分词
std::vector words;
ns_util::JiebaUtil::CurString(query, &words);
// 2.[触发]:就是根据分词的各个“词”,进行Index查找
// ns_index::InvertedList inverted_list_all;
std::vector inverted_list_all;
std::unordered_map tokens_map;
for (auto &e : words)
{
boost::to_lower(e);
ns_index::InvertedList *inverted_list = index->GetInvertedList(e);
if (inverted_list == nullptr)
continue;
//不完美的地方,可能有重复的文档
// inverted_list_all.insert(inverted_list_all.end(),inverted_list->begin(),inverted_list->end());
for (const auto &elem : *inverted_list)
{
auto &item = tokens_map[elem.doc_id]; //[]:如果存在直接获取,如果不存在新建
// item一定是doc_id相同的print节点
item.doc_id = elem.doc_id;
item.weight += elem.weigth;
item.words.push_back(elem.word);
}
for (const auto &elem : *inverted_list)
{
auto &item = tokens_map[elem.doc_id]; //[]:如果存在直接获取,如果不存在新建
// item一定是doc_id相同的print节点
item.doc_id = elem.doc_id;
item.weight += elem.weigth;
item.words.push_back(elem.word);
}
for (const auto &item : tokens_map)
{
inverted_list_all.push_back(std::move(item.second));
}
}
// 3.[合并排序]:汇总查找结果,按照相关性(weight)降序排序
// std::sort(inve rted_list_all.begin(), inverted_list_all.end(),\
// []( const ns_index::InvertedElem e1, const ns_index::InvertedElem e2){
// return e1.weigth > e2.weigth;
// });
// std::sort(inverted_list_all.begin(),inverted_list_all.end(),Com());
std::sort(inverted_list_all.begin(), inverted_list_all.end(),
[](const InvertedElemPrint &e1, const InvertedElemPrint &e2)
{
return e1.weight > e2.weight;
});
// 4.[构建]:根据查找出来的结果,构建jsonc串 -----jsoncpp
Json::Value root;
for (auto &item : inverted_list_all)
{
ns_index::DocInfo *doc = index->GetForWardIndex(item.doc_id);
if (doc == nullptr)
continue;
Json::Value elem;
elem["title"] = doc->title;
elem["desc"] = GetDesc(doc->content, item.words[0]); // content是文档的去标签的结果,但是不是我们想要的,我们要的是一部分 TODO
elem["url"] = doc->url;
// for deubg, for delete
elem["id"] = (int)item.doc_id;
elem["weight"] = item.weight; // int->string
root.append(elem);
}
Json::StyledWriter writer;
*json_string = writer.write(root);
}
};
}
搜索:雷军小米 -> 雷军、小米->查倒排->两个倒排拉链(文档1,文档2,文档1、文档2)
安装 jsoncpp
关于调试
#include "cpp-httplib/httplib.h"
#include "searcher.hpp"
const std::string root_path = "./wwwroot";
const std::string input ="data/raw_html/raw.txt";
int main()
{
ns_searcher::Searcher searcher;
searcher.InitSearcher(input);
httplib::Server svr;
svr.set_base_dir(root_path.c_str());
svr.Get("/s", [&searcher](const httplib::Request &req, httplib::Response &rsp){
if(!req.has_param("word"))
{
rsp.set_content("必须要有搜索关键字!","text/plain: chatset=utf-8");
return;
}
std::string word = req.get_param_value("word");
// std::cout<<"用户正在搜索:"<
boost 搜索引擎
makefile
# Build the three executables of the boost search engine project.
Parser=parser
DUG=debug
HTTP_SEARCHER=http_searcher
cc=g++
.PHONY:all
all:$(Parser) $(DUG) $(HTTP_SEARCHER)
# offline parser: walks data/input with boost::filesystem
# (recipe lines below restored to begin with a literal tab, as make requires)
$(Parser):parser.cc
	$(cc) -o $@ $^ -lboost_system -lboost_filesystem -std=c++11
# command-line debug tool for the searcher (jsoncpp output)
$(DUG):debug.cc
	$(cc) -o $@ $^ -ljsoncpp -std=c++11
# http front end: cpp-httplib needs pthread
$(HTTP_SEARCHER):http_searcher.cc
	$(cc) -o $@ $^ -ljsoncpp -lpthread -std=c++11
.PHONY:clean
clean:
	rm -rf $(Parser) $(DUG) $(HTTP_SEARCHER)
#pragma once
#include
#include
#include
// Log severity levels (plain macros so LOG() can stringize the name).
#define NORMAL 1
#define WARNING 2
#define DEBUG 3
#define FATAL 4
// LOG(NORMAL, "msg") records the level name, message and call site.
#define LOG(LEVEL, MESSAGE) log(#LEVEL, MESSAGE, __FILE__, __LINE__)
// Print one log line: [level][unix-time][message][file : line].
// BUG FIX: `inline` added — a non-inline definition in a header included by
// several translation units is an ODR violation (multiple-definition link
// error). Parameters now pass by const reference to avoid three string copies.
inline void log(const std::string &level, const std::string &message, const std::string &file, int line)
{
    std::cout << "[" << level << "]" << "[" << time(nullptr) << "]"
              << "[" << message << "]" << "[" << file << " : " << line << "]" << std::endl;
}