瞎搞一次C++推荐引擎实践之基于物品的协同过滤

首先,需要用到 MovieLens 数据集(网址:https://grouplens.org/datasets/movielens/),我选的是 ml-latest-small 数据。由于对 vector 的使用还不是很熟练,总是会莫名其妙出现下标越界提示,程序最终没有调试成功(完整数据计算一次大概需要近 3 个小时,orz,求大佬帮优化)。

下面是代码:

#include <algorithm>
#include <cmath>
#include <cstdlib>
#include <fstream>
#include <iostream>
#include <iterator>
#include <sstream>
#include <string>
#include <unordered_map>
#include <vector>
using namespace std;

// One record of movies.csv after parsing.
struct movie
{
	int id = 0;                     // movie id (first CSV column)
	std::string name;               // movie title (second CSV column)
	std::vector<std::string> tags;  // genre tags, i.e. the third CSV column split on '|'
	std::vector<int> sames;         // ids of movies found to be similar to this one

};

// Per-user data built from ratings.csv plus the recommendation results.
struct users
{
	int id = 0;                      // user id
	std::vector<int> movies;         // ids of the movies this user has rated
	std::vector<int> predictmovies;  // ids recommended to this user via movie similarity
	double prediction = 0;           // precision of the recommendations for this user
	double recall = 0;               // recall of the recommendations for this user
};

// Splits `s` into the fields separated by `token` (used to parse CSV lines).
// Returns the fields in order. An empty input yields an empty vector, and a
// trailing separator does not produce a trailing empty field (std::getline
// semantics); empty fields in the middle are kept.
std::vector<std::string> split(const std::string& s, char token)
{
	std::istringstream iss(s);
	std::string word;
	std::vector<std::string> vs;
	while (std::getline(iss, word, token)) {
		vs.push_back(word);
	}
	return vs;
}

// Cosine similarity of two tag lists treated as sets:
//     |v1 ∩ v2| / sqrt(|v1| * |v2|)
// Returns a value in [0, 1]; returns 0.0 when either list is empty (the plain
// formula would evaluate 0/0 and yield NaN).
// The parameters are taken by value because they are sorted and de-duplicated
// locally; the caller's vectors are never modified.
double yuxuan(std::vector<std::string> v1, std::vector<std::string> v2)
{
	// std::set_intersection requires sorted ranges; removing duplicates makes
	// the sizes below true set cardinalities.
	std::sort(v1.begin(), v1.end());
	v1.erase(std::unique(v1.begin(), v1.end()), v1.end());
	std::sort(v2.begin(), v2.end());
	v2.erase(std::unique(v2.begin(), v2.end()), v2.end());

	if (v1.empty() || v2.empty())
	{
		return 0.0;
	}

	std::vector<std::string> common;
	std::set_intersection(v1.begin(), v1.end(), v2.begin(), v2.end(), std::back_inserter(common));

	const double shared = static_cast<double>(common.size());
	const double size1 = static_cast<double>(v1.size());
	const double size2 = static_cast<double>(v2.size());
	return shared / std::sqrt(size1 * size2);
}

// Precision of a recommendation list.
//   v1: recommended movie ids
//   v2: movie ids the user actually has
// Returns |v1 ∩ v2| / |v1| with both lists treated as sets, or 0.0 when v1 is
// empty (nothing was recommended, so the plain formula would divide by zero).
// The parameters are taken by value because they are sorted and de-duplicated
// locally; the caller's vectors are never modified.
double predict_ratio(std::vector<int> v1, std::vector<int> v2)
{
	// std::set_intersection requires sorted ranges; duplicates are removed so a
	// repeated recommendation is not counted more than once.
	std::sort(v1.begin(), v1.end());
	v1.erase(std::unique(v1.begin(), v1.end()), v1.end());
	std::sort(v2.begin(), v2.end());
	v2.erase(std::unique(v2.begin(), v2.end()), v2.end());

	if (v1.empty())
	{
		return 0.0;
	}

	std::vector<int> hits;
	std::set_intersection(v1.begin(), v1.end(), v2.begin(), v2.end(), std::back_inserter(hits));

	return static_cast<double>(hits.size()) / static_cast<double>(v1.size());
}

// Recall of a recommendation list.
//   v1: recommended movie ids
//   v2: movie ids the user actually has
// Returns |v1 ∩ v2| / |v2| with both lists treated as sets, or 0.0 when v2 is
// empty (there is nothing to recall, so the plain formula would divide by zero).
// The parameters are taken by value because they are sorted and de-duplicated
// locally; the caller's vectors are never modified.
double recall_ratio(std::vector<int> v1, std::vector<int> v2)
{
	// std::set_intersection requires sorted ranges; duplicates are removed so
	// each movie is counted once on both sides.
	std::sort(v1.begin(), v1.end());
	v1.erase(std::unique(v1.begin(), v1.end()), v1.end());
	std::sort(v2.begin(), v2.end());
	v2.erase(std::unique(v2.begin(), v2.end()), v2.end());

	if (v2.empty())
	{
		return 0.0;
	}

	std::vector<int> hits;
	std::set_intersection(v1.begin(), v1.end(), v2.begin(), v2.end(), std::back_inserter(hits));

	return static_cast<double>(hits.size()) / static_cast<double>(v2.size());
}

int main()
{
	string line;
	unsigned hang = 0;
	ifstream movies("movies.csv");//读取文件前,我已经将文件的表头删掉了,下面用到的ratings.csv同样如此。
	if (!movies.is_open())
	{
		cout << "文件打开失败" << endl;
		exit(1);
	}
	vector mymovies;//先通过mymovies向量将csv的数据每一行取出来
	while (movies.good())
	{
		getline(movies, line);
		hang++;
		mymovies.push_back(line);
	}
	cout << hang << endl;//打印行数,无特殊目的
	movies.close();
	//for (unsigned i = 0; i < mymovies.size(); i++)
	//{
	//	cout << mymovies[i] << endl;
	//}
	int len;
	len = mymovies.size() + 1000;
	vector onemovie(len);
	for (unsigned i = 0; i < hang - 1; i++)//将文件信息转存到数组中
	{
		vector p(3);
		p = split(mymovies[i], ',');
		onemovie[i].id = atoi(p[0].c_str());
		onemovie[i].name = p[1];
		onemovie[i].tags = split(p[2], '|');
	}
	//for (unsigned i = 0; i < len; i++)//此处用于打印存储效果,用于调试
	//{
	//	if(onemovie[i].id != NULL)
	//	{ 
	//		int j = 0;
	//		cout << onemovie[i].id << "\t" << onemovie[i].name << "\t";
	//		for ( j = 0; j < onemovie[i].tags.size(); j++)
	//		{
	//			cout << onemovie[i].tags[j]<<" ";
	//		}
	//		cout << endl;
	//	}
	//	else
	//	{
	//		break;
	//	}
	//}
	for (int j = 0; j < onemovie.size() - 5000; j++)//计算电影与电影的相似度,由于数据比较多,这里耗时很长,-5000的目的是防止下标越界,同时减少此次模拟推荐引擎的计算时间,大佬可能有别的办法
	{
		for (int i = 0; i < onemovie.size() - 5000; i++)
		{
			if (i == j)
			{

			}
			else
			{
				double same = yuxuan(onemovie[j].tags, onemovie[i].tags);
				if (same > 0.8)
				{
					/*cout << "movie id=" << j << "的电影与movie id=" << i << "的电影相似度为" << same << endl;*/
					onemovie[j].sames.push_back(onemovie[i].id);
				}
			}
		}
		/*cout << "movie id=" << j << "的相似movie id 为";
		for (int k = 0; k < onemovie[j].sames.size(); k++)
		{
			cout<< onemovie[j].sames[k] << "  ";
		}
		cout << endl;*/

	}

	ifstream user("ratings.csv");
	if (!user.is_open())
	{
		cout << "文件打开失败" << endl;
		exit(1);
	}

	vector myuser;
	while (user.good())
	{
		getline(user, line);
		myuser.push_back(line);
	}
	int lens;
	lens = myuser.size() + 10;
	vector oneuser(lens);
	vector p1;
	for (int i = 0; i < myuser.size() - 20; i++)
	{
		vector p(3);
		p = split(myuser[i], ',');
		oneuser[i].id = atoi(p[0].c_str());
		oneuser[i].movies.push_back(atoi(p[1].c_str()));//只用到了ratings文件的前两列数据。
	}
	for (int s = 0; s < oneuser.size() - 100; s++)
	{
		for (int j = 0; j < onemovie.size() - 100; j++)
		{
			if (oneuser[s].movies[0] == onemovie[j].id)
			{
				cout << oneuser[s].movies[0] << endl;
				cout << onemovie[j].id << endl;

				oneuser[s].predictmovies.assign(onemovie[j].sames.begin(), onemovie[j].sames.end());//根据用户看过的电影,把符合相似度要求的电影合并到一起,但此处未去重,可能影响推荐效果

				for (int k = 0; k < onemovie[s].sames.size()-1; k++)
				{
					cout << onemovie[j].sames[k] << endl;
				}
				for (int k = 0; k < oneuser[s].predictmovies.size()-1; k++)
				{
					cout << oneuser[s].predictmovies[k] << endl;
				}
				oneuser.erase(oneuser.begin() + j);
			}
			else
			{
				break;
			}
		}
	}
	for (int s = 0; s < 5; s++)//由于时间问题这里只输出前五个用户的计算结果
	{
		for (int j = 0; j < oneuser.size() - 10000; j++)
		{
			if (oneuser[s].id == oneuser[j].id && j != s)
			{
				oneuser[s].movies.assign(oneuser[j].movies.begin(), oneuser[j].movies.end());
				oneuser[s].predictmovies.assign(oneuser[j].predictmovies.begin(), oneuser[j].predictmovies.end());
				for (int k = 0; k < oneuser[s].predictmovies.size() - 1; k++)
				{
					cout << oneuser[s].predictmovies[k] << endl;
				}
				oneuser.erase(oneuser.begin() + j);
			}
		}
	}
	for (int s = 0; s < 5; s++)
	{
		oneuser[s].prediction = predict_ratio(oneuser[s].predictmovies, oneuser[s].movies);
		oneuser[s].recall = recall_ratio(oneuser[s].predictmovies, oneuser[s].movies);
		cout << "对用户预测的准确率为:" << oneuser[s].prediction << "\t" << "召回率为:" << oneuser[s].recall << endl;
	}

	system("pause");
	return 0;
}

 

你可能感兴趣的:(C++学习,推荐系统)