| 用户\文档 | 文档A | 文档B | 文档C | 文档D |
|-----------|-------|-------|-------|-------|
| 用户A     | √     | √     | 推荐? | 推荐? |
| 用户B     | √     | √     | √     |       |
| 用户C     | √     | √     | √     |       |
| 用户D     | √     | √     |       |       |
| 用户行为 | 类型 | 特征 | 作用 |
|----------|------|------|------|
| 评分 | 显式 | 整数量化值 [0, n] | 可以得到精确偏好 |
| 投票 | 显式 | 布尔量化值 0\|1 | 可以得到精确偏好 |
| 转发 | 显式 | 布尔量化值 0\|1 | 可以得到精确偏好 |
| 保存书签 | 显式 | 布尔量化值 0\|1 | 可以得到精确偏好 |
| 标记书签Tag | 显式 | 一些单词 | 需要进一步分析得到偏好 |
| 评论 | 显式 | 一些文字 | 需要进一步分析得到偏好 |
| 点击流 | 隐式 | 一组点击记录 | 需要进一步分析得到偏好 |
| 页面停留时间 | 隐式 | 一组时间信息 | 噪音偏大，不好利用 |
| 购买 | 隐式 | 布尔量化值 0\|1 | 可以得到精确偏好 |
| 用户 | 苹果 | 小米 | 魅族 |
|------|------|------|------|
| zhangsan | 5 | 5 | 2 |
| Lisi | 3 | 5 | 4 |
| wangwu | 1 | 2 | 5 |
def OsDistance(vector1, vector2):
    """Return the Euclidean (L2) distance between two numeric vectors.

    Parameters
    ----------
    vector1, vector2 : numpy.ndarray
        Arrays of the same shape (element-wise subtraction must be valid).

    Returns
    -------
    float
        sqrt(sum((vector1 - vector2) ** 2)).
    """
    diff = vector1 - vector2
    # Sum of squared component differences, then square root.
    return (diff ** 2).sum() ** 0.5
|
def cosSim(inA, inB):
    """Return the cosine similarity of two column vectors, rescaled to [0, 1].

    Parameters
    ----------
    inA, inB : numpy.matrix
        Column vectors (n x 1), so that ``inA.T * inB`` is their inner product.

    Returns
    -------
    float
        0.5 + 0.5 * cos(theta): 1.0 for identical direction, 0.5 for
        orthogonal vectors, 0.0 for opposite direction.
    """
    num = float(inA.T * inB)              # inner product <inA, inB>
    denom = la.norm(inA) * la.norm(inB)   # product of vector magnitudes
    # Rescale cosine from [-1, 1] to [0, 1] so it can serve as a weight.
    return 0.5 + 0.5 * (num / denom)
|
1::1193::5::978300760
1::661::3::978302109
1::914::3::978301968
1::3408::4::978300275
1::2355::5::978824291
1::1197::3::978302268
1::1287::5::978302039
1::2804::5::978300719
1::594::4::978302268
1::919::4::978301368
|
>>> import pandas as pd
>>> from pandas import Series,DataFrame
>>> rnames = ['user_id','movie_id','rating','timestamp']
>>> ratings = pd.read_table(r'ratings.dat',sep='::',header=None,names=rnames)
>>> ratings[:3]
user_id movie_id rating timestamp
0 1 1193 5 978300760
1 1 661 3 978302109
2 1 914 3 978301968
[3 rows x 4 columns]
|
>>> data = ratings.pivot(index='user_id',columns='movie_id',values='rating') #形成一个透视表
>>> data[:5]
|
>>> foo = DataFrame(np.empty((len(data.index),len(data.index)),dtype=int),index=data.index,columns=data.index)
#print(foo.shape): (6040, 6040)
>>> for i in foo.index:
for j in foo.columns:
foo.ix[i,j] = data.ix[i][data.ix[j].notnull()].dropna().count()
|
>>> for i in foo.index:
foo.ix[i,i]=0 #先把对角线的值设为 0
>>> ser = Series(np.zeros(len(foo.index)))
>>> for i in foo.index:
ser[i]=foo[i].max() #计算每行中的最大值
>>> ser.idxmax() #返回ser的最大值所在的行号
4169
>>> ser[4169] #取得最大值
998
>>> foo[foo==998][4169].dropna() #取得另一个 user_id
424 4169
Name: user_id, dtype: float64
|
>>> data.ix[4169].corr(data.ix[424])
0.45663851303413217
>>> test = data.reindex([424,4169],columns=data.ix[4169][data.ix[424].notnull()].dropna().index)
>>> test
movie_id 2 6 10 11 12 17 ...
424 4 4 4 4 1 5 ...
4169 3 4 4 4 2 5 ...
>>> test.ix[424].value_counts(sort=False).plot(kind='bar')
>>> test.ix[4169].value_counts(sort=False).plot(kind='bar')
|
>>> periods_test = DataFrame(np.zeros((20,7)),columns=[10,20,50,100,200,500,998])
>>> for i in periods_test.index:
for j in periods_test.columns:
sample = test.reindex(columns=np.random.permutation(test.columns)[:j])
periods_test.ix[i,j] = sample.iloc[0].corr(sample.iloc[1])
>>> periods_test[:5]
10 20 50 100 200 500 998
0 -0.306719 0.709073 0.504374 0.376921 0.477140 0.426938 0.456639
1 0.386658 0.607569 0.434761 0.471930 0.437222 0.430765 0.456639
2 0.507415 0.585808 0.440619 0.634782 0.490574 0.436799 0.456639
3 0.628112 0.628281 0.452331 0.380073 0.472045 0.444222 0.456639
4 0.792533 0.641503 0.444989 0.499253 0.426420 0.441292 0.456639
[5 rows x 7 columns]
>>> periods_test.describe()
10 20 50 100 200 500 #998略
count 20.000000 20.000000 20.000000 20.000000 20.000000 20.000000
mean 0.346810 0.464726 0.458866 0.450155 0.467559 0.452448
std 0.398553 0.181743 0.103820 0.093663 0.036439 0.029758
min -0.444302 0.087370 0.192391 0.242112 0.412291 0.399875
25% 0.174531 0.320941 0.434744 0.375643 0.439228 0.435290
50% 0.487157 0.525217 0.476653 0.468850 0.472562 0.443772
75% 0.638685 0.616643 0.519827 0.500825 0.487389 0.465787
max 0.850963 0.709073 0.592040 0.634782 0.546001 0.513486
[8 rows x 7 columns]
|
>>> check_size = 1000
>>> check = {}
>>> check_data = data.copy() #复制一份 data 用于检验,以免篡改原数据
>>> check_data = check_data.ix[check_data.count(axis=1)>200] #滤除评价数小于200的用户
>>> for user in np.random.permutation(check_data.index):
movie = np.random.permutation(check_data.ix[user].dropna().index)[0]
check[(user,movie)] = check_data.ix[user,movie]
check_data.ix[user,movie] = np.nan
check_size -= 1
if not check_size:
break
>>> corr = check_data.T.corr(min_periods=200)
>>> corr_clean = corr.dropna(how='all')
>>> corr_clean = corr_clean.dropna(axis=1,how='all') #删除全空的行和列
>>> check_ser = Series(check) #这里是被提取出来的 1000 个真实评分
>>> check_ser[:5]
(15, 593) 4
(23, 555) 3
(33, 3363) 4
(36, 2355) 5
(53, 3605) 4
dtype: float64
|
>>> result = Series(np.nan,index=check_ser.index)
>>> for user,movie in result.index: #这个循环看着很乱,实际内容就是加权平均而已
prediction = []
if user in corr_clean.index:
corr_set = corr_clean[user][corr_clean[user]>0.1].dropna() #仅限大于 0.1 的用户
else:continue
for other in corr_set.index:
if not np.isnan(data.ix[other,movie]) and other != user:#注意bool(np.nan)==True
prediction.append((data.ix[other,movie],corr_set[other]))
if prediction:
result[(user,movie)] = sum([value*weight for value,weight in prediction])/sum([pair[1] for pair in prediction])
>>> result.dropna(inplace=True)
>>> len(result)#随机抽取的 1000 个用户中也有被 min_periods=200 刷掉的
862
>>> result[:5]
(23, 555) 3.967617
(33, 3363) 4.073205
(36, 2355) 3.903497
(53, 3605) 2.948003
(62, 1488) 2.606582
dtype: float64
>>> result.corr(check_ser.reindex(result.index))
0.436227437429696
>>> (result-check_ser.reindex(result.index)).abs().describe()#推荐期望与实际评价之差的绝对值
count 862.000000
mean 0.785337
std 0.605865
min 0.000000
25% 0.290384
50% 0.686033
75% 1.132256
max 3.629720
dtype: float64
|
>>> corr = data.T.corr(min_periods=200)
>>> corr_clean = corr.dropna(how='all')
>>> corr_clean = corr_clean.dropna(axis=1,how='all')
|
>>> lucky = np.random.permutation(corr_clean.index)[0]
>>> gift = data.ix[lucky]
>>> gift = gift[gift.isnull()] #现在 gift 是一个全空的序列
|
>>> corr_lucky = corr_clean[lucky].drop(lucky)#lucky 与其他用户的相关系数 Series,不包含 lucky 自身
>>> corr_lucky = corr_lucky[corr_lucky>0.1].dropna() #筛选相关系数大于 0.1 的用户
>>> for movie in gift.index: #遍历所有lucky没看过的电影
prediction = []
for other in corr_lucky.index: #遍历所有与lucky 相关系数大于 0.1 的用户
if not np.isnan(data.ix[other,movie]):
prediction.append((data.ix[other,movie],corr_clean[lucky][other]))
if prediction:
gift[movie] = sum([value*weight for value,weight in prediction])/sum([pair[1] for pair in prediction])
>>> gift.dropna().order(ascending=False) #将 gift 的非空元素按降序排列
movie_id
3245 5.000000
2930 5.000000
2830 5.000000
2569 5.000000
1795 5.000000
981 5.000000
696 5.000000
682 5.000000
666 5.000000
572 5.000000
1420 5.000000
3338 4.845331
669 4.660464
214 4.655798
3410 4.624088
...
2833 1
2777 1
2039 1
1773 1
1720 1
1692 1
1538 1
1430 1
1311 1
1164 1
843 1
660 1
634 1
591 1
56 1
Name: 3945, Length: 2991, dtype: float64
|
3.1、计算距离的数学公式