这是老师和学姐曾经发表过的一篇论文,主要是讲解关于评分系统的一个算法。
这篇论文使用的是留一法(leave-one-out):从头开始遍历所有评分,每次选出一个作为测试集,其余的全部作为训练集,并对选出的这个评分进行预测;最后把所有预测误差的总和除以评分个数,得到平均误差(例如 MAE)。这个方法很公平,因为每个评分都会被当作测试集预测一次;但由于评分数量庞大,如果算法效率不够高,花费的时间会非常多。
package machine;
import java.io.*;
/**
 * M-distance based recommender evaluated with leave-one-out cross validation.
 *
 * <p>
 * Ratings are read from a text file with one {@code user,item,rating} triple
 * per line (all integers, zero-based indices). The triples must be sorted by
 * user index in non-decreasing order, because the ratings of each user are
 * addressed as one contiguous range of the compressed rating matrix.
 *
 * <p>
 * To predict a rating of a user on an item, the other items rated by the same
 * user whose average rating differs from the average rating of the target item
 * by less than the radius (delta) are taken as neighbors, and the prediction is
 * the mean of the user's ratings on those neighbors.
 */
public class MBR {
    /**
     * Default rating on the 1-5 scale, used when a rating has no neighbors.
     */
    public static final double DEFAULT_RATING = 3.0;

    /**
     * The total number of users.
     */
    private final int numUsers;

    /**
     * The total number of items.
     */
    private final int numItems;

    /**
     * The total number of ratings (non-zero entries of the rating matrix).
     */
    private final int numRatings;

    /**
     * The predictions, one per rating; same length as the rating list.
     */
    private final double[] predictions;

    /**
     * Compressed rating matrix stored as user-item-rating triples.
     */
    private final int[][] compressedRatingMatrix;

    /**
     * The degree of each user (how many items the user has rated).
     */
    private final int[] userDegrees;

    /**
     * The average rating given by each user; NaN for a user without ratings.
     */
    private final double[] userAverageRatings;

    /**
     * The degree of each item (how many users have rated the item).
     */
    private final int[] itemDegrees;

    /**
     * The average rating received by each item; NaN for an item without
     * ratings.
     */
    private final double[] itemAverageRatings;

    /**
     * Index of the first rating of each user in the compressed matrix. The
     * ratings of user u occupy [userStartingIndices[u], userStartingIndices[u +
     * 1]); the extra last entry equals the number of ratings.
     */
    private final int[] userStartingIndices;

    /**
     * Number of ratings for which no neighbor was found in the latest
     * leave-one-out run.
     */
    private int numNonNeighbors;

    /**
     * The radius (delta) for determining the neighborhood.
     */
    private double radius;

    /**
     * Construct the rating matrix from a file.
     *
     * @param paraFilename   the rating filename; each non-blank line holds
     *                       {@code user,item,rating}.
     * @param paraNumUsers   number of users
     * @param paraNumItems   number of items
     * @param paraNumRatings number of ratings, i.e. the expected number of
     *                       non-blank lines in the file
     * @throws FileNotFoundException    if the file does not exist.
     * @throws IllegalArgumentException if the file does not contain exactly
     *                                  paraNumRatings ratings, or the ratings
     *                                  are not sorted by user index.
     * @throws Exception                on any other read or parse failure.
     */
    public MBR(String paraFilename, int paraNumUsers, int paraNumItems, int paraNumRatings) throws Exception {
        // Step 1. Initialize these arrays
        numItems = paraNumItems;
        numUsers = paraNumUsers;
        numRatings = paraNumRatings;
        userDegrees = new int[numUsers];
        userStartingIndices = new int[numUsers + 1];
        userAverageRatings = new double[numUsers];
        itemDegrees = new int[numItems];
        // One row per rating, three columns: user, item, rating.
        compressedRatingMatrix = new int[numRatings][3];
        itemAverageRatings = new double[numItems];
        predictions = new double[numRatings];

        System.out.println("Reading " + paraFilename);

        // Step 2. Read the data file.
        File tempFile = new File(paraFilename);
        if (!tempFile.exists()) {
            // Report the problem to the caller instead of terminating the JVM
            // with a success status.
            throw new FileNotFoundException("File " + paraFilename + " does not exist.");
        } // Of if

        int tempIndex = 0;
        // try-with-resources closes the reader even when a line fails to parse.
        try (BufferedReader tempBufReader = new BufferedReader(new FileReader(tempFile))) {
            String tempString;
            while ((tempString = tempBufReader.readLine()) != null) {
                if (tempString.trim().isEmpty()) {
                    continue; // Tolerate blank lines, e.g. at the end of the file.
                } // Of if
                if (tempIndex >= numRatings) {
                    throw new IllegalArgumentException(
                            "File " + paraFilename + " contains more than " + numRatings + " ratings.");
                } // Of if

                // Each line has three values
                String[] tempStrArray = tempString.split(",");
                int tempUser = Integer.parseInt(tempStrArray[0].trim());
                int tempItem = Integer.parseInt(tempStrArray[1].trim());
                int tempRating = Integer.parseInt(tempStrArray[2].trim());

                // The per-user ranges are only valid for data sorted by user.
                if (tempIndex > 0 && tempUser < compressedRatingMatrix[tempIndex - 1][0]) {
                    throw new IllegalArgumentException("Ratings must be sorted by user index, but user " + tempUser
                            + " follows user " + compressedRatingMatrix[tempIndex - 1][0] + ".");
                } // Of if

                compressedRatingMatrix[tempIndex][0] = tempUser;
                compressedRatingMatrix[tempIndex][1] = tempItem;
                compressedRatingMatrix[tempIndex][2] = tempRating;

                // One more rating for this user and for this item.
                userDegrees[tempUser]++;
                itemDegrees[tempItem]++;
                tempIndex++;
            } // Of while
        } // Of try

        if (tempIndex != numRatings) {
            throw new IllegalArgumentException(
                    "Expected " + numRatings + " ratings in " + paraFilename + " but found " + tempIndex + ".");
        } // Of if

        // Step 3. Starting indices are the prefix sums of the user degrees, so
        // a user without any rating gets an empty range and does not truncate
        // the range of the preceding user.
        userStartingIndices[0] = 0;
        for (int i = 0; i < numUsers; i++) {
            userStartingIndices[i + 1] = userStartingIndices[i] + userDegrees[i];
        } // Of for i

        // Step 4. Compute the average ratings of users and items.
        double[] tempUserTotalScore = new double[numUsers];
        double[] tempItemTotalScore = new double[numItems];
        for (int i = 0; i < numRatings; i++) {
            tempUserTotalScore[compressedRatingMatrix[i][0]] += compressedRatingMatrix[i][2];
            tempItemTotalScore[compressedRatingMatrix[i][1]] += compressedRatingMatrix[i][2];
        } // Of for i

        // A zero degree yields 0.0 / 0 = NaN, marking "no average available".
        for (int i = 0; i < numUsers; i++) {
            userAverageRatings[i] = tempUserTotalScore[i] / userDegrees[i];
        } // Of for i
        for (int i = 0; i < numItems; i++) {
            itemAverageRatings[i] = tempItemTotalScore[i] / itemDegrees[i];
        } // Of for i
    }// Of the first constructor

    /**
     * Set the radius (delta).
     *
     * @param paraRadius The given radius. A non-positive value is replaced by
     *                   0.1.
     */
    public void setRadius(double paraRadius) {
        if (paraRadius > 0) {
            radius = paraRadius;
        } else {
            radius = 0.1;
        } // Of if
    }// Of setRadius

    /**
     * Leave-one-out prediction. Each rating in turn is hidden and predicted
     * from the remaining ones. The predicted values are stored in
     * {@link #predictions}, and the number of ratings without any neighbor is
     * stored in {@link #numNonNeighbors}.
     */
    public void leaveOneOutPrediction() {
        double tempItemAverageRating;
        // Make each line of the code shorter.
        int tempUser, tempItem, tempRating;
        System.out.println("\r\nLeaveOneOutPrediction for radius " + radius);

        numNonNeighbors = 0;
        // Hide one rating at a time and predict it.
        for (int i = 0; i < numRatings; i++) {
            tempUser = compressedRatingMatrix[i][0];
            tempItem = compressedRatingMatrix[i][1];
            tempRating = compressedRatingMatrix[i][2];

            // Step 1. Recompute average rating of the current item without the
            // hidden rating. For an item with a single rating this is 0.0 / 0
            // = NaN, which fails every comparison below, so the rating ends up
            // without neighbors.
            tempItemAverageRating = (itemAverageRatings[tempItem] * itemDegrees[tempItem] - tempRating)
                    / (itemDegrees[tempItem] - 1);

            // Step 2. Recompute neighbors, at the same time obtain the ratings
            // of neighbors.
            int tempNeighbors = 0;
            double tempTotal = 0;
            int tempComparedItem;
            // Scan only the ratings of the current user.
            for (int j = userStartingIndices[tempUser]; j < userStartingIndices[tempUser + 1]; j++) {
                tempComparedItem = compressedRatingMatrix[j][1];
                if (tempItem == tempComparedItem) {
                    continue;// Ignore itself.
                } // Of if

                // An item is a neighbor when the item averages differ by less
                // than the radius.
                if (Math.abs(tempItemAverageRating - itemAverageRatings[tempComparedItem]) < radius) {
                    tempTotal += compressedRatingMatrix[j][2];
                    tempNeighbors++;
                } // Of if
            } // Of for j

            // Step 3. Predict as the average value of neighbors.
            if (tempNeighbors > 0) {
                predictions[i] = tempTotal / tempNeighbors;
            } else {
                // No neighbor: fall back to the default rating and count it.
                predictions[i] = DEFAULT_RATING;
                numNonNeighbors++;
            } // Of if
        } // Of for i
    }// Of leaveOneOutPrediction

    /**
     * Compute the MAE (mean absolute error) based on the deviation of each
     * leave-one-out prediction from the actual rating.
     *
     * @return the mean absolute error; NaN when there are no ratings.
     * @author Fan Min
     */
    public double computeMAE() throws Exception {
        double tempTotalError = 0;
        for (int i = 0; i < predictions.length; i++) {
            tempTotalError += Math.abs(predictions[i] - compressedRatingMatrix[i][2]);
        } // Of for i

        return tempTotalError / predictions.length;
    }// Of computeMAE

    /**
     * Compute the RMSE (root mean squared error) based on the deviation of each
     * leave-one-out prediction from the actual rating. The method keeps its
     * historical name "RSME" so that existing callers continue to work.
     *
     * @return the root mean squared error; NaN when there are no ratings.
     * @author Fan Min
     */
    public double computeRSME() throws Exception {
        double tempTotalError = 0;
        for (int i = 0; i < predictions.length; i++) {
            // Squared difference between the prediction and the actual rating.
            double tempDifference = predictions[i] - compressedRatingMatrix[i][2];
            tempTotalError += tempDifference * tempDifference;
        } // Of for i

        double tempAverage = tempTotalError / predictions.length;
        return Math.sqrt(tempAverage);
    }// Of computeRSME

    /**
     * The entrance of the program. Runs leave-one-out prediction on the
     * MovieLens 943-user, 1682-item data set for the radii 0.2, 0.3, 0.4 and
     * 0.5 and prints the MAE and RMSE of each run.
     *
     * @param args Not used now.
     */
    public static void main(String[] args) {
        try {
            // 100000 ratings in total.
            MBR tempRecommender = new MBR("O:/javasampledata-master/movielens-943u1682m.txt", 943, 1682, 100000);

            // Iterate over integer steps; repeatedly adding 0.1 to a double
            // accumulates rounding error (0.2 + 0.1 is 0.30000000000000004).
            for (int tempStep = 2; tempStep < 6; tempStep++) {
                double tempRadius = tempStep / 10.0;
                tempRecommender.setRadius(tempRadius);
                tempRecommender.leaveOneOutPrediction();

                // MAE: mean of the absolute prediction errors.
                double tempMAE = tempRecommender.computeMAE();
                // RMSE: square root of the mean of the squared prediction
                // errors.
                double tempRSME = tempRecommender.computeRSME();

                System.out.println("Radius = " + tempRadius + ", MAE = " + tempMAE + ", RSME = " + tempRSME
                        + ", numNonNeighbors = " + tempRecommender.numNonNeighbors);
            } // Of for tempStep
        } catch (Exception ee) {
            System.out.println(ee);
        } // Of try
    }// Of main
}// Of class MBR
具体的代码步骤以及理解都写在上面的注释里了,这里就不再单独写出来。这种算法与 kNN 相比还是有明显差别的:M-distance 使用一个阈值 delta(相当于一个范围),物品平均分之差落在这个范围内的都算作邻居;而 kNN 是固定选取最近的 k 个邻居。