MapReduce程序实现
1)Mean.java
package mr;
import java.io.IOException;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
/*
* 该MapReduce用来计算每部电影的平均评分
*/
public class Mean {
public static class FirstMapper extends Mapper
String [] dataset = new String[4];
public void map(Object key, Text value, Context context)
throws IOException, InterruptedException {
//value:1,2,3.5,1112486027
dataset = value.toString().split(",");
Text mkey = new Text();
Text mvalue = new Text();
//过滤掉标题行
if("movieId".equals(dataset[1]))
return;
mkey.set(dataset[1]);
mvalue.set(dataset[2]);
//key:movieId,value:rating
context.write(mkey, mvalue);
}
}
public static class FirstReducer extends Reducer
{
Text mean=new Text();
public void reduce(Text key, Iterable
values,Context context) throws IOException, InterruptedException {
float rate = 0.0f;
int i=0;
for(Text text : values){
try{
//把同一部电影的所有评分加起来
rate += Float.parseFloat(text.toString());
}catch(Exception e){
return;
}
//对每一部电影,统计其有多少个评分
i++;
}
mean.set((rate/i)+"");
//key:movieId,value:电影的平均评分
context.write(key,mean);
}
}
}
2)Regular.java
package mr;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import java.util.StringTokenizer;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;
/*
* 该MapReduce用来对数据进行去中心化,即用每一行记录的评分减去该行对应电影的平均分
*/
public class Regular {
public static class FirstMapper extends Mapper
String [] dataset = new String[4];
Text movieId = new Text();
Text ratingMean = new Text();
Text mkey = new Text();
Text mvalue = new Text();
public void map(Object key, Text value, Context context)
throws IOException, InterruptedException {
InputSplit inputSplit = context.getInputSplit();
//求出当前读取的输入文件的文件名
String strname = ((FileSplit) inputSplit).getPath().getName();
//若文件名是part-r-00000,则当前读取的输入文件是电影平均评分的文件
if("part-r-00000".equals(strname)){
StringTokenizer token = new StringTokenizer(value.toString());
//value:movieId,rating(平均评分)
if(token.hasMoreElements()){
movieId.set(token.nextToken());
if(token.hasMoreElements()){
ratingMean.set("#"+token.nextToken());
}
}
//key:movieId,#rating(平均评分)
context.write(movieId, (ratingMean));
//若文件名不是part-r-00000,则当前读取的输入文件是数据集的文件
}else{
//value:1,2,3.5,1112486027
dataset = value.toString().split(",");
//过滤掉标题行
if("movieId".equals(dataset[1]))
return;
mkey.set(dataset[1]);
mvalue.set((dataset[0]+","+dataset[2]));
//key:movieId,value:userId,rating
context.write(mkey, mvalue);
}
}
}
public static class FirstReducer extends Reducer
{
float mean = 0.0f;
float rating = 0.0f;
//该reduce函数,对每个key(movieId),迭代器Iterable包含该movieId对应的所有userId和movieId对应的平均评分
public void reduce(Text key, Iterable
values,Context context)
throws IOException, InterruptedException {
List
list = new ArrayList (); for(Text text : values){
//求出电影的平均评分(以#开头)
if(text.toString().startsWith("#")){
mean = Float.parseFloat(text.toString().substring(1));
continue;
}
//把movieId对应的所有用户userId及其评分rating保存到list集合
list.add(text.toString());
}
//遍历这个list集合,对每个元素(userId,rating),用rating-mean求出每个userId的评分去中心化后的评分
for(String str : list){
rating = Float.parseFloat(str.substring(str.indexOf(",")+1))-mean;
str = ","+str.substring(0, str.indexOf(","))+","+rating;
//key:movieId,value:,userId,rating(去中心化后的评分,在value前面加逗号,是便于后面的分割处理)
context.write(key, new Text(str));
}
}
}
}
3)ExtractTestData.java
package mr;
import java.io.IOException;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
/*
* 该MapReduce用来从数据集中抽取出119条记录作为测试集
*/
public class ExtractTestData {
public static class FirstMapper extends Mapper
String [] dataset = new String[4];
static int count = 1;
static int location = 1;
public void map(Object key, Text value, Context context)
throws IOException, InterruptedException {
//value:1,2,3.5,1112486027
dataset = value.toString().split(",");
Text mkey = new Text();
Text mvalue = new Text();
//过滤掉标题行
if("movieId".equals(dataset[1]))
return;
//若当前读取到的记录的userId=location,且统计记录数的count小于119,则把该记录传到reduce
if(dataset[0].trim().equals(location+"")&&count<=119){
mkey.set(dataset[1]);
mvalue.set(dataset[0]+","+dataset[2]);
//location的增量
location = location+count;
//统计记录数加1
count++;
//key:movieId,value:userId,rating
context.write(mkey, mvalue);
}else
return;
}
}
public static class FirstReducer extends Reducer
{
public void reduce(Text key, Iterable
values,Context context)
throws IOException, InterruptedException {
//遍历这个迭代器Iterable,输出每一个元素(userId,rating)
for(Text text: values){
text.set(","+text.toString());
//key:movieId,value:,userId,rating(value前面加逗号,是便于后面的分割处理)
context.write(key,text);
}
}
}
}
/*
//通过这个reduce处理,抽取到的测试集更均匀,但计算量更大
public static class FirstReducer extends Reducer
{
Text mkey = new Text();
Text mvalue = new Text();
int count = 1;
public void reduce(Text key, Iterable
values,Context context)
throws IOException, InterruptedException {
List
list = new ArrayList (); String userId = key.toString();
for(Text text: values){
if(list.size()>10)
break;
list.add(text.toString());
}
if(count<=263){//if((set.size()==userlocation||list.size()==50)&&count<=263){
int size = list.size();
Random rand = new Random();
boolean[] bool = new boolean[size];
int randInt = 0;
for(int i = 0; i < 3 ; i++) {
do {
randInt = rand.nextInt(size);
}while(bool[randInt]);
bool[randInt] = true;
String [] dataset = list.get(randInt).split(",");
mkey.set(dataset[0]);
mvalue.set(","+userId+","+dataset[1]);
context.write(mkey, mvalue);
count++;
}
}else
return;
}
}*/
4)MovieSimilar.java
package mr;
import java.io.IOException;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
/*
* 该MapReduce用余弦值作为相似度计算出某部电影与其他所有电影的相似度
*/
public class MovieSimilar {
public static class FirstMapper extends Mapper
String [] dataset = new String[3];
Text mkey = new Text();
Text mvalue = new Text();
int count = 1;
List
list = new ArrayList (); Set
set = new HashSet (); public void map(Object key, Text value, Context context)
throws IOException, InterruptedException {
//获得从main函数传进来的movieId
String movieId = context.getConfiguration().get("movieId");
//value:movieId,userId,rating(去中心化后的评分)
dataset = value.toString().split(",");
//得到数据集的movieId
String dmId = dataset[0].replaceAll("\t|\\s+","").trim();
if(movieId.equals(dmId)){
//把从main函数传进来的要计算其相似度的movieId与去中心化后
//的数据集中的movieId相等的记录保存到list集合中
//如movieId=1,则list集合保存的是数据集中的movieId=1的所有记录
//list(0):1,2,-2.8733,list(1):1,4,-1.2344....
list.add(value.toString());
}else{
//set集合保存除从main函数传进来的要计算其相似度的movieId之外的其它所有的movieId
set.add(dmId);
mkey.set("<"+dmId+","+movieId+">");
mvalue.set(value.toString());
//key:
除了要计算的 movieId之外的数据集中的movieId),movieId(要计算的movieId)>,//value:movieId(除了要计算的movieId之外),userId,rating(去中心化后的评分)
//如<2,1> 2,3,2.542
context.write(mkey, mvalue);
}
//若读取到第14206行(总的电影数为14206,即最后一行,不同的数据集有不同的数值)
if(count==14206){
//遍历set集合
for(String i:set){
//遍历list集合
for(String str:list){
String skey = "<"+i+","+movieId+">";
mkey.set(skey);
mvalue.set(str);
//key:集合中的movieId),movieId(要计算的movieId)>
//value:movieId(要计算的movieId),userId,rating(去中心化后的评分)
//如<2,1> 1,3,1.342
context.write(mkey, mvalue);
}
}
}
//统计当前读取到第几行
count++;
}
}
public static class FirstReducer extends Reducer
{
float mean = 0.0f;
float rating = 0.0f;
//key的值是要计算的某一对movieId,values保存了key这对movieId的所有用户及其评分
//如key:<2,1> values:1,3,1.342;2,3,2.542(values中每个元素的第一个值用以区别当前记录是属于哪个movieId的)
public void reduce(Text key, Iterable
values,Context context)
throws IOException, InterruptedException {
List
list1 = new ArrayList (); Set
list2 = new HashSet (); //求出
对的第一个值,如<2,1>中的2 String keyn = key.toString().substring(1, key.toString().indexOf(",")).trim();
for(Text text : values){
String str = text.toString();
//求出values中每个元素的第一个值movieId
String movieId = str.substring(0, str.indexOf(",")).trim().replace("\t","");
//若从values中求出的movieId和从key求出的movieId相同则保存到list1集合中,否则保存到list2集合中
if(keyn.equals(movieId.trim())){
//list1集合保存的是所有的movieId等于
对第一个movieId值的记录 list1.add(str.substring(str.indexOf(",")+1));
}else{
//list2集合保存的是所有的movieId等于
对第二个movieId值的记录 list2.add(str.substring(str.indexOf(",")+1));
}
}
float count = 0.0f,rating1=0.0f,rating2=0.0f;
//下面是求余弦值相似度
//遍历list1集合
for(String str1:list1){
//str1:userId,rating
String[] pair1 = str1.split(",");
//把list1集合中的所有评分的平方累加起来
rating1+=Math.pow(Float.parseFloat(pair1[1].trim()), 2);
for(String str2:list2){
//str2:userId,rating
String[] pair2 = str2.split(",");
//list1集合中的userId和list2集合中的userId相同,即该用户同时对这两部电影都给于了评分
if(pair1[0].equals(pair2[0])){
//把该用户对这两部电影的评分乘积累加起来
count+=(Float.parseFloat(pair1[1].trim())*Float.parseFloat(pair2[1].trim()));
break;
}
}
}
//遍历list2集合
for(String str2:list2){
String[] pair2 = str2.split(",");
//把list2集合中的所有评分的平方累加起来
rating2+=Math.pow(Float.parseFloat(pair2[1].trim()), 2);
}
String similar = "0";
//分母不为零
if(rating1!=0.0&&rating2!=0.0){
//求出余弦值相似度
similar = count/(Math.sqrt(rating1)*Math.sqrt(rating2))+"";
}
//key:
除了要计算的 movieId之外的数据集中的movieId),movieId(要计算的movieId)>//value:similar(余弦值相似度)
context.write(key,new Text(similar));
}
}
}
5)MostSimilarMovies.java
package mr;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.List;
import java.util.StringTokenizer;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
/*
* 该MapReduce用来求出电影相似度最高的20个记录
*/
public class MostSimilarMovies {
public static class FirstMapper extends Mapper
Text mkey = new Text();
Text mvalue = new Text();
public void map(Object key, Text value, Context context)
throws IOException, InterruptedException {
mkey.set("0");
//key:0,value:
cosine(电影间的余弦值相似度) context.write(mkey,value);
}
}
public static class FirstReducer extends Reducer
{
int count = 1;
Text tk = new Text();
Text tv = new Text();
//values保存了14025对电影间的相似度记录
public void reduce(Text key, Iterable
values,Context context)
throws IOException, InterruptedException {
List
list = new ArrayList (); //建立的数组长度为数据集中电影的总数减一
String[] strs = new String[14025];
int i = 0;
//遍历values,将其信息保存到数组中,便于排序
for(Text text:values){
StringTokenizer token = new StringTokenizer(text.toString());
String mkey = token.nextToken().trim();//
String str = token.nextToken().trim();//cosine(电影间的余弦值相似度)
if(str.contains("E")){
//若表示成科学计数法则将其转换成小数的形式,保存到数组中的数据形式为cosine
strs[i] = getdecimal(str)+"\t"+mkey;
}else{
strs[i] = str+"\t"+mkey;
}
i++;
}
//对cosine
组成的字符串排序,得到升序数组 Arrays.sort(strs);
for(String s :strs){
//按顺序保存到list集合
list.add(s);
}
//对顺序保存的list集合做翻转操作,得到其倒序集合
Collections.reverse(list);
for(int k=0;k
//输出前20个倒序后的记录
if(k>=20)
break;
String[] output = list.get(k).split("\t");
tk.set(output[0]);
tv.set(output[1]);
//key:cosine,value:
context.write(tk,tv);
}
}
}
public static String getdecimal(String value){
int location = Integer.parseInt(value.substring(value.length()-1));
String str = "0.";
for(int i = 1;i
str+="0";
}
str += value.substring(0,10).replace(".", "");
return str;
}
}
6)PredictMovieRating.java
package mr;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
/*
* 该MapReduce根据20个最高的电影相似度数据预测用户对电影的评分
*/
public class PredictMovieRating {
public static class FirstMapper extends Mapper
String [] dataset = new String[4];
Text mkey = new Text();
Text mvalue = new Text();
List
list = new ArrayList (); List
userIdlist = new ArrayList (); int i = 0;
String tmovieId = "";
float[] f1,f2;
int total = 0;
int maxuserId = 0;
public void map(Object key, Text value, Context context)
throws IOException, InterruptedException {
//算一次即可
if(i==0){
//获取从main函数传进来的match参数,并以制表符分割,形式为movieId,userId,userId..._与
//前面movieId最相似的20个movieId及其余弦值,如1,1,3,7_2 0.7362\t4 0.634...
String[] info = context.getConfiguration().get("match").split("\t");
//求出要预测的movieId和其对应的userId
String[] users = info[0].split("_")[0].split(",");
//通过users[0]求出要预测的tmovieId
tmovieId = users[0].trim();
//把第一个userId赋予maxuserId
maxuserId = Integer.parseInt(users[1]);
int userid = 0;
for(int k=1;k
//userIdlist集合保存要预测的userId
userIdlist.add(users[k].trim());
userid = Integer.parseInt(users[k]);
//求出这些userId中的最大值maxuserId
if(maxuserId
maxuserId = userid;
}
for(int j=0;j
if(j==0){
//list集合保存与tmovieId最相似的20个movieId及其余弦值,j=0,要去掉下划线_前面的信息
list.add(info[0].substring(info[0].indexOf("_")+1));
}else{
list.add(info[j]);
}
}
//初始化数组f1和f2,用以后面用户对电影的评分计算
total = userIdlist.size();
f1 = new float[total];
f2 = new float[total];
for(int i=0;i
f1[i]=0.0f;
f2[i]=0.0f;
}
//防止上面步骤每读入一条记录都要执行一次
i=1;
}
//value:userId,movieId,rating,timestamp,如1,2,3.5,1112486027
dataset = value.toString().split(",");
//求出userId
String userId = dataset[0].trim();
//过滤掉标题行
if(userId.equals("userId"))
return;
//求出movieId
String movieId = dataset[1].trim();
//若读取的记录的userId不比要预测的maxuserId大,则算出用户对电影的预测值所需的f1(分子)和f2(分母)
if(Integer.parseInt(userId)<=maxuserId){
//对userIdlist集合中的每个用户,计算其对tmovieId的评分
for(int i=0;i
//若读取的记录的userId等于userIdlist集合中的某个值,则计算该用户对tmovieId的评分
if(userId.equals(userIdlist.get(i).trim())){
//遍历list集合
for(String str:list){
String[] moviesimilarinfo = str.split(",");
//若读取的记录的movieId等于list集合中的某个值,则把其相似度乘以该用户对movieId的评分
//并加到f1数组的第i个元素中,同时把相似度加到f2数组的第i个元素中
if(moviesimilarinfo[0].trim().equals(movieId)){
f1[i]+=(Float.parseFloat(dataset[2].trim())*Float.parseFloat(moviesimilarinfo[1].trim()));
f2[i]+=Float.parseFloat(moviesimilarinfo[1].trim());
//每次处理一条数据,若找到,处理后立即退出循环
break;
}}
break;
}
}
//读取的记录的userId比要预测的maxuserId大,说明用户对电影的预测值所需的f1和f2已经算好
}else{
float rating=0.0f;
//对userIdlist集合中的每个用户,计算其对tmovieId的评分
for(int j=0;j
mkey.set(tmovieId+","+userIdlist.get(j));
if(f2[j]!=0){
rating = (f1[j]/f2[j]);
}
mvalue.set(","+rating);
//key:tmovieId,userId,value:,rating(电影的预测值,value前面加逗号,是便于后面的分割处理)
context.write(mkey, mvalue);
}
}
}
}
public static class FirstReducer extends Reducer
{
public void reduce(Text key, Iterable
values,Context context)
throws IOException, InterruptedException {
//key:tmovieId,userId,values只保存一个值:,rating
context.write(key, values.iterator().next());
}
}
}
7)CalculateDifference.java
package mr;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
/*
* 该MapReduce用来计算评分的实际值与预测值的差异
*/
public class CalculateDifference {
public static class FirstMapper extends Mapper
Text mkey = new Text();
Text mvalue = new Text();
public void map(Object key, Text value, Context context)
throws IOException, InterruptedException {
//value:movieId,userId,rating(包括实际值和预测值)
String[] info = value.toString().split(",");
mkey.set(info[0].trim()+","+info[1].replaceAll("\t|\\s+", "").trim());
mvalue.set(info[2].trim());
//key:movieId,userId,value:rating
context.write(mkey, mvalue);
}
}
public static class FirstReducer extends Reducer
{
Text text = new Text();
float rawrating = 0.0f,predictrating = 0.0f;
public void reduce(Text key, Iterable
values,Context context) throws IOException, InterruptedException {
List
list = new ArrayList (); //把相同的rating包括实际值和预测值存到list集合中
for(Text text:values){
list.add(text.toString());
}
//假设第一个是实际值,第二个值预测值
rawrating = Float.parseFloat(list.get(0));
predictrating = Float.parseFloat(list.get(1));
//求出实际值与预测值的差值平方
double diff = Math.pow((rawrating-predictrating),2);
//若预测值为0,则在实际值与预测值的差值平方后面加上(predictrating=0.0),用以区别该预测值为零
//是由于与这部电影最相似的20部电影在指定的用户下都没有评分记录导致的
if(rawrating==0.0||predictrating==0.0){
text.set(diff+" (predictrating=0.0)");
}else{
text.set(diff+"");
}
//key:movieId,userId,value:diff(实际值与预测值的差值平方)或在后面加上(predictrating=0.0)
context.write(key, text);
}
}
8)CalculateRMSE.java
package mr;
import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import java.util.StringTokenizer;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IOUtils;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
/*
* 该MapReduce用来求预测电影的均方根误差RMSE
*/
public class CalculateRMSE {
public static class FirstMapper extends Mapper
Text mkey = new Text();
Text mvalue = new Text();
public void map(Object key, Text value, Context context)
throws IOException, InterruptedException {
//value:movieId,userId diff(实际值与值预测值差值的平方)或在后面加上 (predictrating=0.0)
StringTokenizer token = new StringTokenizer(value.toString());
//去掉含有(predictrating=0.0)的记录,即去掉预测值为零的记录
if(token.countTokens()>=3)
return;
if(token.hasMoreElements()){
token.nextToken();
mkey.set("diff");
if(token.hasMoreElements()){
mvalue.set(token.nextToken());
//key:diff,value:diff(实际值与预测值的差值平方)
context.write(mkey, mvalue);
}
}
}
}
public static class FirstReducer extends Reducer
{
Text text = new Text();
public void reduce(Text key, Iterable
values,Context context)
throws IOException, InterruptedException {
double rmse = 0.0d;
int count = 0;
for(Text text:values){
//遍历values集合,把所有的diff(电影实际值与预测值的差值平方)加到rmse中
rmse+=Double.parseDouble(text.toString().trim());
count++;
}
key.set("RMSE");
text.set(Math.sqrt(rmse/count)+"");
//key:RMSE,value:rmse值
context.write(key, text);
}
}
public static void main(String[] args) throws Exception {
String path = "";
//数据集的文件路径
String dataset = path+"/wsd/input/rating.txt";
//电影的平均评分路径
String mean = path+"/wsd/CF/mean";
//数据去中心化的路径
String regular = path+"/wsd/CF/regular";
//抽取一部分数据集作为测试集的路径
String extracttestdata = path+"/wsd/CF/extracttestdata";
//电影相似度的路径
String moviesimilar = path+"/wsd/CF/moviesimilar/";
//最高的电影相似度的路径
String mostsimilarmovies = path+"/wsd/CF/mostsimilarmovies/";
//用户对电影的评分的路径
String predictmovierating = path+"/wsd/CF/predictmovierating/";
//用户对电影的实际值与预测值的差值平方的路径
String difference = path+"/wsd/CF/difference";
//预测电影的均方根误差RMSE的路径
String rmse = path+"/wsd/CF/rmse";
Path dstPath;
FileSystem dhfs=null;
Configuration conf = new Configuration();
//求所有电影的平均评分
System.out.println("----------------------------mean--------------------------------");
Job Jmean = new Job(conf, "Mean");
Jmean.setJarByClass(Mean.class);
Jmean.setMapperClass(Mean.FirstMapper.class);
Jmean.setCombinerClass(Mean.FirstReducer.class);
Jmean.setReducerClass(Mean.FirstReducer.class);
Jmean.setOutputKeyClass(Text.class);
Jmean.setOutputValueClass(Text.class);
//输入为数据集的路径
FileInputFormat.addInputPath(Jmean, new Path(dataset));
//输出为所有电影的平均评分的路径
FileOutputFormat.setOutputPath(Jmean, new Path(mean));
Jmean.waitForCompletion(true);
//求数据去中心化
System.out.println("----------------------------regular-----------------------------");
Job Regular = new Job(conf, "Regular");
Regular.setJarByClass(Regular.class);
Regular.setMapperClass(Regular.FirstMapper.class);
Regular.setReducerClass(Regular.FirstReducer.class);
Regular.setOutputKeyClass(Text.class);
Regular.setOutputValueClass(Text.class);
//输入为数据集的路径
FileInputFormat.addInputPath(Regular, new Path(dataset));
//输入为所有电影的平均评分的路径
FileInputFormat.addInputPath(Regular, new Path(mean+"/part-r-00000"));
//输出为数据去中心化路径
FileOutputFormat.setOutputPath(Regular, new Path(regular));
Regular.waitForCompletion(true);
//抽取一部分数据集作为测试集
System.out.println("--------------------ExtractTestData--------------------");
Job ExtractTestData = new Job(conf, "ExtractTestData");
ExtractTestData.setJarByClass(ExtractTestData.class);
ExtractTestData.setMapperClass(ExtractTestData.FirstMapper.class);
ExtractTestData.setReducerClass(ExtractTestData.FirstReducer.class);
ExtractTestData.setOutputKeyClass(Text.class);
ExtractTestData.setOutputValueClass(Text.class);
//输入为数据集的路径
FileInputFormat.addInputPath(ExtractTestData, new Path(dataset));
//输出为抽取到的测试集路径
FileOutputFormat.setOutputPath(ExtractTestData, new Path(extracttestdata));
ExtractTestData.waitForCompletion(true);
//通过getfileinfo函数读取抽取到的测试集中的记录,形式:1,3,3.5\n1,6,4.0...(movieId,userId,rating)
String movieinfo = getfileinfo(extracttestdata+"/part-r-00000");
String[] testdata = movieinfo.split("\n");
List
list = new ArrayList ();
String users = "";
String movieId = "";
//通过这个for循环处理后,list集合保存测试集中movieId及该movieId对应的所有userId
//如测试集有1,3,3.5;1,6,4.0;2,1,4.0;2,3,4.0;2,6,4.0...则list(0):1,3,6;list(1):2,1,3,6...
//第一个为movieId,后面的都是该movieId对应的所有userId
for(int j=0;j
//testdata[j]:1,3,3.5(movieId,userId,rating)
String[] info = testdata[j].split(",");
if(j==0){
//对第一条记录,保存其movieId,并将userId添加到users后面,以逗号分开
movieId = info[0].trim();
users+=info[1]+",";
}else{
//对于后面的记录,若其movieId等于前面的movieId,则继续把userId添加到users后面,以逗号分开
if(movieId.equals(info[0].trim())){
users+=info[1]+",";
}else{
//对于后面的记录,若其movieId与前面的movieId不相等
//则去掉最后面的逗号,并在其前面加上前面保存的movieId,加到list集合中
users = users.substring(0,users.length()-1);
list.add(movieId+","+users);
//users被赋予当前记录的userId
users=info[1]+",";
//movieId被赋予当前记录的movieId
movieId = info[0].trim();
}
//读取到testdata数组的最后一个元素
if(j==testdata.length-1){
//则去掉最后面的逗号,并在其前面加上当前记录的movieId,加到list集合中
users = users.substring(0,users.length()-1);
list.add(info[0].trim()+","+users);
}
}
}
//遍历list集合,求出测试集中所有记录的电影评分
for(String s:list){
//list集合中每个元素的第一个值为movieId
movieId = s.substring(0, s.indexOf(",")).replace("\t|\\s+", "");
//求出电影间的相似度
System.out.println("-----------当前要计算的movieId:--"+movieId+"--------------------");
//把当前要计算的movieId通过configration传进去
conf.set("movieId", movieId);
System.out.println("--当前要计算的movieId:--"+movieId+"---MovieSimilar--------------------");
Job Jmoviesimilar = new Job(conf, "MovieSimilar");
Jmoviesimilar.setJarByClass(MovieSimilar.class);
Jmoviesimilar.setMapperClass(MovieSimilar.FirstMapper.class);
Jmoviesimilar.setReducerClass(MovieSimilar.FirstReducer.class);
Jmoviesimilar.setOutputKeyClass(Text.class);
Jmoviesimilar.setOutputValueClass(Text.class);
//输入为数据去中心化的路径
FileInputFormat.addInputPath(Jmoviesimilar,new Path(regular+"/part-r-00000"));
//输出为当前电影movieId相似度的路径
FileOutputFormat.setOutputPath(Jmoviesimilar, new Path(moviesimilar+movieId));
Jmoviesimilar.waitForCompletion(true);
//求出最高的电影相似度
System.out.println("--当前要计算的movieId:--"+movieId+"----MostSimilarMovies--------------------");
Job Jmostsimilarmovie = new Job(conf, "MostSimilarMovies");
Jmostsimilarmovie.setJarByClass(MostSimilarMovies.class);
Jmostsimilarmovie.setMapperClass(MostSimilarMovies.FirstMapper.class);
Jmostsimilarmovie.setReducerClass(MostSimilarMovies.FirstReducer.class);
Jmostsimilarmovie.setOutputKeyClass(Text.class);
Jmostsimilarmovie.setOutputValueClass(Text.class);
//输入为当前电影movieId相似度的路径
FileInputFormat.addInputPath(Jmostsimilarmovie,new Path(moviesimilar+movieId+"/part-r-00000"));
//输出为当前电影movieId最高相似度的路径
FileOutputFormat.setOutputPath(Jmostsimilarmovie, new Path(mostsimilarmovies+movieId));
Jmostsimilarmovie.waitForCompletion(true);
//通过getfileinfo函数读取当前电影movieId最高相似度的记录,形式:0.872 <2,1>\n0.737 <2,1>...(cosine
)
String mostsimilar = getfileinfo(mostsimilarmovies+movieId+"/part-r-00000");
String[] mostmovieinfo = mostsimilar.split("\n");
String match = s+"_";
for(String str:mostmovieinfo){
StringTokenizer token = new StringTokenizer(str);
String cosine = token.nextToken();
String movie = token.nextToken();
movie = movie.substring(1, movie.indexOf(","));
//对mostmovieinfo数组中的每一个元素,把当前要计算的movieId与其最相似movieId和相似度添加到match后面
match+=(movie+","+cosine+"\t");
}
//通过上面的处理,match保存的是list集合的元素_与当前要计算的movieId最相似的movieId和他们的相似度
conf.set("match", match);
System.out.println(match);
//求出预测的电影评分
System.out.println("--当前要计算的movieId--"+movieId+"-----PredictMovieRating--------------------");
Job Jpredictmovierating = new Job(conf, "PredictMovieRating");
Jpredictmovierating.setJarByClass(PredictMovieRating.class);
Jpredictmovierating.setMapperClass(PredictMovieRating.FirstMapper.class);
Jpredictmovierating.setReducerClass(PredictMovieRating.FirstReducer.class);
Jpredictmovierating.setOutputKeyClass(Text.class);
Jpredictmovierating.setOutputValueClass(Text.class);
//输入为数据集的路径
FileInputFormat.addInputPath(Jpredictmovierating,new Path(dataset));
//输出为用户对当前电影movieId的评分的路径
FileOutputFormat.setOutputPath(Jpredictmovierating, new Path(predictmovierating+movieId));
Jpredictmovierating.waitForCompletion(true);
//删除电影的相似度,若空间足够大,可以不用删除
dstPath = new Path(moviesimilar+movieId);
dhfs = dstPath.getFileSystem(conf) ;
if(dhfs.exists(dstPath)){
dhfs.delete(dstPath, true);
}
//删除最高的电影相似度,若空间足够大,可以不用删除
dstPath = new Path(mostsimilarmovies+movieId);
dhfs = dstPath.getFileSystem(conf) ;
if(dhfs.exists(dstPath)){
dhfs.delete(dstPath, true);
}
}
//求出用户对电影的实际值与预测值的差值平方
System.out.println("-------------------CalculateDifference--------------------");
Job Jcalculatedifference = new Job(conf, "CalculateDifference");
Jcalculatedifference.setJarByClass(CalculateDifference.class);
Jcalculatedifference.setMapperClass(CalculateDifference.FirstMapper.class);
Jcalculatedifference.setReducerClass(CalculateDifference.FirstReducer.class);
Jcalculatedifference.setOutputKeyClass(Text.class);
Jcalculatedifference.setOutputValueClass(Text.class);
//输入为测试集的路径
FileInputFormat.addInputPath(Jcalculatedifference,new Path(extracttestdata+"/part-r-00000"));
for(String s : list){
movieId = s.substring(0, s.indexOf(",")).trim().replace("\t|\\s+", "");
//输入为所有的电影预测值路径
FileInputFormat.addInputPath(Jcalculatedifference,new Path(predictmovierating+movieId+"/part-r-00000"));
}
//输出为用户对电影的实际值与预测值的差值平方的路径
FileOutputFormat.setOutputPath(Jcalculatedifference, new Path(difference));
Jcalculatedifference.waitForCompletion(true);
//求出预测电影的均方根误差RMSE
System.out.println("--------------------CalculateRMSE--------------------");
Job JcalculateRMSE = new Job(conf, "CalculateRMSE");
JcalculateRMSE.setJarByClass(CalculateRMSE.class);
JcalculateRMSE.setMapperClass(CalculateRMSE.FirstMapper.class);
JcalculateRMSE.setReducerClass(CalculateRMSE.FirstReducer.class);
JcalculateRMSE.setOutputKeyClass(Text.class);
JcalculateRMSE.setOutputValueClass(Text.class);
//用户对电影的实际值与预测值的差值平方的路径
FileInputFormat.addInputPath(JcalculateRMSE,new Path(difference+"/part-r-00000"));
//输出为预测电影的均方根误差RMSE的路径
FileOutputFormat.setOutputPath(JcalculateRMSE, new Path(rmse));
JcalculateRMSE.waitForCompletion(true);
}
//从HDFS文件系统中读取文件信息
public static String getfileinfo(String filename) {
Configuration config = new Configuration();
FSDataInputStream dis = null;
String result = "";
try {
ByteArrayOutputStream baos = new ByteArrayOutputStream();
FileSystem hdfs = FileSystem.get(config);
dis = hdfs.open(new Path(filename));
//写入ByteArrayOutputStream
IOUtils.copyBytes(dis, baos, 4096, false);
result = baos.toString();
}catch (IOException e) {
e.printStackTrace();
}
finally{
IOUtils.closeStream(dis);
}
return result;
}
}