说明:请在 VS2015 环境的 Release 模式下编译并运行本程序(Debug 模式下运行速度太慢)。
TextSimilarity.h
#pragma once
#include <string>
#include <unordered_map>
#include <unordered_set>
#include <utility>
#include <vector>

#include "cppjieba/Jieba.hpp"
// Estimates how similar two text files are by segmenting them with cppjieba,
// counting word frequencies, and comparing the resulting frequency vectors
// with cosine similarity.
class TextSimilarity
{
public:
    // word -> number of occurrences; a hash map is used for fast lookup.
    typedef std::unordered_map<std::string, int> wordFreq;
    // Set of words (used for the stop-word table and the selected keywords).
    typedef std::unordered_set<std::string> wordSet;

    // dict: directory that holds the jieba dictionary files.
    TextSimilarity(std::string dict);

    // Reads stopWordFile line by line and stores every line in the
    // stop-word set.
    void getStopWordTable(const char* stopWordFile);

    // Segments the given file and returns the frequency of every word that
    // is not a stop word. Returns an empty map if the file cannot be opened.
    wordFreq getWordFreq(const char* file);

    // Conversions between UTF-8 and the system ANSI code page (CP_ACP,
    // which is GBK on Simplified-Chinese Windows).
    std::string UTF8ToGBK(std::string str);
    std::string GBKToUTF8(std::string str);

    // Returns the (word, count) entries of wf sorted by descending count.
    std::vector<std::pair<std::string, int>> sortByValueReverse(wordFreq& wf);

    // Inserts the words of the first _maxWordNumber entries of wfvec into wset.
    void selectAimWords(std::vector<std::pair<std::string, int>>& wfvec, wordSet& wset);

    // Builds the frequency vector of wf over the words in wset
    // (0 for words that do not occur in wf).
    std::vector<double> getOneHot(wordSet& wset, wordFreq& wf);

    // Cosine similarity of two equally sized frequency vectors.
    double cosine(std::vector<double> oneHot1, std::vector<double> oneHot2);

private:
    // Dictionary directory and the file paths derived from it.
    // NOTE: _jieba is constructed from these paths, so they must stay
    // declared before it.
    std::string DICT;
    std::string DICT_PATH;
    std::string HMM_PATH;
    std::string USER_DICT_PATH;
    std::string IDF_PATH;
    std::string STOP_WORD_PATH;
    cppjieba::Jieba _jieba;   // word segmenter
    wordSet _stopWordSet;     // words ignored when counting frequencies
    int _maxWordNumber;       // how many top words selectAimWords takes per document
};
TextSimilarity.cpp
#define _CRT_SECURE_NO_WARNINGS 1
#include "TextSimilarity.h"
#include
#include
#include "cppjieba/Jieba.hpp"
#include
#include
using namespace std;
//利用jieba分词实现文本相似度功能
//1.对两个文档进行分词
//2.统计文档中的词频
//3.将两个文本中的所有有效词全部编码,排序,取出前n个关键词
//4.按照码值构建词频向量如:
//文档1中的词频:[0:1,1:1,2:0,3:1]
//文档2中的词频:[0:2,1:1,2:1,3:1]
//文档1词频向量:[1,1,0,1]
//文档2词频向量:[2, 1, 1, 1]
//通过计算向量相似度(欧几里得距离,余弦相似度等等)来评估他们的相似度
//算法缺点:没有很好的解决同义词和多义词问题
// Derives every dictionary file path from the directory `dict`, builds the
// jieba segmenter from those paths, limits the keyword selection to 10 words
// and finally loads the stop-word table.
TextSimilarity::TextSimilarity(string dict)
    : DICT(dict),
      DICT_PATH(dict + "/Jieba.dict.utf8"),
      HMM_PATH(dict + "/hmm_model.utf8"),
      USER_DICT_PATH(dict + "/user.dict.utf8"),
      IDF_PATH(dict + "/idf.utf8"),
      STOP_WORD_PATH(dict + "/stop_words.utf8"),
      _jieba(DICT_PATH, HMM_PATH, USER_DICT_PATH, IDF_PATH, STOP_WORD_PATH),
      _maxWordNumber(10)
{
    // The stop-word file is the same one that was handed to jieba above.
    const char* stopWordFile = STOP_WORD_PATH.c_str();
    getStopWordTable(stopWordFile);
}
// Counts how often each non-stop word occurs in the file `filename`.
//
// Every line is converted with GBKToUTF8, segmented with jieba, and each
// resulting word that is not in the stop-word set is counted.
// Returns an empty map (after printing a message) if the file cannot be opened.
TextSimilarity::wordFreq TextSimilarity::getWordFreq(const char* filename)
{
    ifstream fin(filename);
    if (!fin.is_open())
    {
        cout << "open file:" << filename << "failed" << endl;
        return wordFreq();
    }

    wordFreq wf;
    string line;
    // Test the result of getline itself: looping on !eof() runs one extra
    // pass after the last line and never terminates if the stream goes bad.
    while (getline(fin, line))
    {
        // jieba works on UTF-8, the input file is read in the ANSI code page.
        line = GBKToUTF8(line);

        vector<string> words;
        // Segment the current line. NOTE(review): the third argument
        // presumably enables the HMM model - confirm against cppjieba.
        _jieba.Cut(line, words, true);

        for (const auto& word : words)
        {
            // Skip stop words.
            if (_stopWordSet.count(word) > 0)
                continue;
            // operator[] value-initialises a missing entry to 0.
            ++wf[word];
        }
    }
    return wf;
}
// Loads the stop-word table: every line of `stopWordFile` is inserted into
// _stopWordSet as-is (no encoding conversion is applied).
// Prints a message and leaves the set untouched if the file cannot be opened.
void TextSimilarity::getStopWordTable(const char* stopWordFile)
{
    ifstream fin(stopWordFile);
    if (!fin.is_open())
    {
        cout << "open file:" << stopWordFile << "failed" << endl;
        return;
    }

    string line;
    // Loop on getline's result; looping on !eof() inserted a spurious empty
    // string after the last line and could spin forever on a read error.
    while (getline(fin, line))
    {
        _stopWordSet.insert(line);
    }
    // fin is closed by its destructor.
}
// Ordering predicate for sorting (word, count) pairs by descending count.
// Takes the pairs by const reference so that no strings are copied per
// comparison. Returns true when lp must be placed before rp.
bool cmpReverse(const std::pair<std::string, int>& lp, const std::pair<std::string, int>& rp)
{
    return lp.second > rp.second;
}
// Returns the entries of `wf` as a vector sorted by descending frequency.
// The map itself cannot be sorted, so its (word, count) entries are copied
// into a vector first. Entries with equal counts end up in unspecified
// relative order (std::sort is not stable).
vector<pair<string, int>> TextSimilarity::sortByValueReverse(TextSimilarity::wordFreq& wf)
{
    vector<pair<string, int>> wfvector(wf.begin(), wf.end());
    sort(wfvector.begin(), wfvector.end(), cmpReverse);
    return wfvector;
}
void TextSimilarity::selectAimWords(std::vector>& wfvec, wordSet& wset)
{
int len = wfvec.size();
int sz = len > _maxWordNumber ? _maxWordNumber : len;
for (int i = 0; i < sz; i++)
{
wset.insert(wfvec[i].first);//获得两个文档所给出的词频的并集
}
}
// Builds the word-frequency vector of `wf` over the words in `wset`: one
// element per word, in the iteration order of `wset`, holding the word's
// count in `wf` or 0 if the word does not occur there.
// Vectors built from the same, unmodified `wset` are index-compatible.
vector<double> TextSimilarity::getOneHot(TextSimilarity::wordSet& wset, TextSimilarity::wordFreq& wf)
{
    vector<double> oneHot;
    oneHot.reserve(wset.size());
    for (const auto& word : wset)
    {
        // Single lookup; find() also avoids inserting missing words into wf.
        const auto it = wf.find(word);
        oneHot.push_back(it != wf.end() ? it->second : 0);
    }
    return oneHot;
}
// Cosine similarity of two frequency vectors:
//     dot(a, b) / (|a| * |b|)
// Both vectors must have the same length (checked with assert).
// Returns 0 when either vector has zero length/norm, instead of the NaN
// that a 0/0 division would produce.
double TextSimilarity::cosine(std::vector<double> oneHot1, std::vector<double> oneHot2)
{
    assert(oneHot1.size() == oneHot2.size());

    double products = 0;   // dot product
    double modular1 = 0;   // squared norm of oneHot1
    double modular2 = 0;   // squared norm of oneHot2
    for (size_t i = 0; i < oneHot1.size(); i++)
    {
        products += oneHot1[i] * oneHot2[i];
        modular1 += oneHot1[i] * oneHot1[i];
        modular2 += oneHot2[i] * oneHot2[i];
    }

    const double denominator = sqrt(modular1) * sqrt(modular2);
    if (denominator == 0)
        return 0;   // an all-zero vector shares nothing with the other one
    return products / denominator;
}
// Converts `str` from the system ANSI code page (CP_ACP - GBK on
// Simplified-Chinese Windows) to UTF-8, going through UTF-16.
// Returns an empty string if any conversion step fails.
string TextSimilarity::GBKToUTF8(const string str)
{
    // Step 1: ANSI -> UTF-16.
    // Passing -1 as the input length converts up to and including the
    // terminating NUL; passing a 0-sized output buffer only queries the
    // required size in wide characters (terminator included).
    const int wideLen = MultiByteToWideChar(CP_ACP, 0, str.c_str(), -1, nullptr, 0);
    if (wideLen <= 0)
        return string();
    wstring wide(wideLen, L'\0');
    if (MultiByteToWideChar(CP_ACP, 0, str.c_str(), -1, &wide[0], wideLen) <= 0)
        return string();

    // Step 2: UTF-16 -> UTF-8, same query-then-convert pattern.
    // The last two arguments (default char / used-default flag) must be
    // NULL for CP_UTF8.
    const int utf8Len = WideCharToMultiByte(CP_UTF8, 0, wide.c_str(), -1, nullptr, 0, nullptr, nullptr);
    if (utf8Len <= 0)
        return string();
    string utf8(utf8Len, '\0');
    if (WideCharToMultiByte(CP_UTF8, 0, wide.c_str(), -1, &utf8[0], utf8Len, nullptr, nullptr) <= 0)
        return string();

    // Drop the terminating NUL that the API wrote into the buffer.
    utf8.resize(utf8Len - 1);
    return utf8;
}
// Converts `str` from UTF-8 to the system ANSI code page (CP_ACP - GBK on
// Simplified-Chinese Windows), going through UTF-16.
// Returns an empty string if any conversion step fails.
string TextSimilarity::UTF8ToGBK(const string str)
{
    // Step 1: UTF-8 -> UTF-16. A 0-sized output buffer only queries the
    // required size in wide characters (terminating NUL included, because
    // the input length is given as -1).
    const int wideLen = MultiByteToWideChar(CP_UTF8, 0, str.c_str(), -1, nullptr, 0);
    if (wideLen <= 0)
        return string();
    wstring wide(wideLen, L'\0');
    if (MultiByteToWideChar(CP_UTF8, 0, str.c_str(), -1, &wide[0], wideLen) <= 0)
        return string();

    // Step 2: UTF-16 -> ANSI, same query-then-convert pattern.
    const int gbkLen = WideCharToMultiByte(CP_ACP, 0, wide.c_str(), -1, nullptr, 0, nullptr, nullptr);
    if (gbkLen <= 0)
        return string();
    string gbk(gbkLen, '\0');
    if (WideCharToMultiByte(CP_ACP, 0, wide.c_str(), -1, &gbk[0], gbkLen, nullptr, nullptr) <= 0)
        return string();

    // Drop the terminating NUL that the API wrote into the buffer.
    gbk.resize(gbkLen - 1);
    return gbk;
}
test.cpp
#define _CRT_SECURE_NO_WARNINGS 1
#include "TextSimilarity.h"
#include
using namespace std;
void testTextSimilarity()
{
TextSimilarity ts("dict");
TextSimilarity::wordFreq wf = ts.getWordFreq("test.txt");
TextSimilarity::wordFreq wf2 = ts.getWordFreq("test2.txt");
vector> wfvec = ts.sortByValueReverse(wf);
vector> wfvec2 = ts.sortByValueReverse(wf2);
cout << "wfvec:" << endl;
for (int i = 0; i < 10; i++)
{
//将字符编码格式从UTF8转到GBK进行打印
cout << ts.UTF8ToGBK(wfvec[i].first) << ":" << wfvec[i].second << " ";
}
cout << endl;
cout << "wfvec2:" << endl;
for (int i = 0; i < 10; i++)
{
cout << ts.UTF8ToGBK(wfvec2[i].first) << ":" << wfvec2[i].second << " ";
}
cout << endl;
TextSimilarity::wordSet wset;
ts.selectAimWords(wfvec, wset);
ts.selectAimWords(wfvec2, wset);
cout << "wset" << endl;
for (const auto& e : wset)
{
cout << ts.UTF8ToGBK(e) << " ";
}
cout << endl;
vector oneHot = ts.getOneHot(wset, wf);
vector oneHot2 = ts.getOneHot(wset, wf2);
cout << "oneHot:" << endl;
for (const auto& v : oneHot)
{
cout << v << " ";
}
cout << endl;
cout << "oneHot2:" << endl;
for (const auto& v : oneHot2)
{
cout << v << " ";
}
cout << endl;
double db = 0;
db = ts.cosine(oneHot, oneHot2);
cout << "文档相似度为:"<
