今天实现了第一个推荐算法:在《机器学习实战》一书中找到了讲解SVD方法的一章,并完成了其中的练习。这里总结一下笔记和经验,与大家分享。
# Minimal SVD example on a small 3x3 matrix.
import numpy as np
data = [[1,2,3],[2,2,3],[3,3,3]]
# S is returned as a 1-D vector of singular values (see the printed output
# below), not as a full diagonal matrix.
# NOTE: numpy.linalg.svd returns the right-singular vectors already
# transposed, so the name V here actually holds V^T.
U,S,V = np.linalg.svd(data)
>>> U
array([[-0.48273945, 0.76567677, 0.42509024],
[-0.54696309, 0.11548497, -0.82915294],
[-0.68395468, -0.63277351, 0.36304778]])
>>> S
array([ 7.51653849, 1.17761678, 0.33892171])
>>> V
array([[-0.48273945, -0.54696309, -0.68395468],
[-0.76567677, -0.11548497, 0.63277351],
[-0.42509024, 0.82915294, -0.36304778]])
其中S向量只存储了对角元素(即奇异值),可以大大节省存储空间。接下来我们需要保留部分奇异值。对于保留的数量,一个典型的方法是保留矩阵中90%的能量信息:总能量为所有奇异值的平方和,取最小的k,使前k个奇异值的平方和不低于总能量的90%。
# Usage example: recommend three items for user 0 with each estimator.
# loadExData2, recommend, cosSim, standEst and svdEst are defined further
# down in this file.
user = 0
# asmatrix is used instead of the `mat` alias, which NumPy 2.0 removed.
dataMat = asmatrix(loadExData2())
# Plain item-based estimate
rec1 = recommend(dataMat, user, N=3, simMeas=cosSim, estMethod=standEst)
# SVD-based estimate
rec2 = recommend(dataMat, user, N=3, simMeas=cosSim, estMethod=svdEst)
(Pdb) rec1
[(0, 5.0), (1, 5.0), (2, 5.0)]
(Pdb) rec2
[(7, 4.5148349067003304), (8, 4.514365463123859), (0, 4.5142831096323039)]
#!/usr/bin/env python
# -*- coding: UTF-8
from numpy import *
from numpy import linalg as la
import numpy as np
def loadExData2():
    """Return the sample rating matrix as an 11x11 list of lists.

    Rows are indexed by user and columns by item (the estimators in this
    file read it as ``dataMat[user, item]``).  A value of 0 is treated by
    those estimators as "not rated".
    """
    return [
        [0, 0, 0, 0, 0, 4, 0, 0, 0, 0, 5],
        [0, 0, 0, 3, 0, 4, 0, 0, 0, 0, 3],
        [0, 0, 0, 0, 4, 0, 0, 1, 0, 4, 0],
        [3, 3, 4, 0, 0, 0, 0, 2, 2, 0, 0],
        [5, 4, 5, 0, 0, 0, 0, 5, 5, 0, 0],
        [0, 0, 0, 0, 5, 0, 1, 0, 0, 5, 0],
        [4, 3, 4, 0, 0, 0, 0, 5, 5, 0, 1],
        [0, 0, 0, 4, 0, 4, 0, 0, 0, 0, 4],
        [0, 0, 0, 2, 0, 2, 5, 0, 0, 1, 2],
        [0, 0, 0, 0, 5, 0, 0, 0, 0, 4, 0],
        [1, 0, 0, 0, 0, 0, 0, 1, 2, 0, 0],
    ]
def ecludSim(inA, inB):
    """Return a similarity based on Euclidean distance.

    Computed as ``1 / (1 + ||inA - inB||)``, so identical inputs give 1.0
    and the value shrinks towards 0 as the distance grows.

    inA, inB: numeric vectors of the same shape (arrays or matrices).
    """
    return 1.0 / (1.0 + la.norm(inA - inB))
def pearsSim(inA, inB):
    """Return the Pearson correlation of two vectors rescaled to [0, 1].

    The correlation coefficient r (in [-1, 1]) is mapped to
    ``0.5 + 0.5 * r``.  With fewer than 3 samples the function returns 1.0
    without computing a correlation.

    inA, inB: column vectors (n x 1) holding the paired observations.
    """
    if len(inA) < 3:
        return 1.0
    # rowvar=False: each column is a variable, each row an observation.
    return 0.5 + 0.5 * np.corrcoef(inA, inB, rowvar=False)[0, 1]
def cosSim(inA, inB):
    """Return the cosine similarity of two vectors rescaled to [0, 1].

    The cosine (in [-1, 1]) is mapped to ``0.5 + 0.5 * cos``.

    inA, inB: vectors with the same number of elements; column matrices
    and 1-D arrays are both accepted because the inputs are flattened.
    No guard is applied for zero-length vectors: a zero norm makes the
    division produce nan/inf.
    """
    # Flatten explicitly instead of relying on float() of a 1x1 matrix
    # product, which recent NumPy deprecates and which fails for 1-D input.
    vecA = np.asarray(inA, dtype=float).ravel()
    vecB = np.asarray(inB, dtype=float).ravel()
    num = float(np.dot(vecA, vecB))
    denom = la.norm(vecA) * la.norm(vecB)
    return 0.5 + 0.5 * (num / denom)
def standEst(dataMat, user, simMeas, item):
    """Estimate the rating `user` would give `item` from item similarities.

    For every item j the user has rated, the similarity between `item`
    and j is computed over the users who rated both, and the user's
    ratings are averaged with those similarities as weights.

    dataMat: rating matrix indexed as [user, item]; 0 means "not rated".
    user:    row index of the user.
    simMeas: similarity function taking two column vectors
             (ecludSim, pearsSim or cosSim).
    item:    column index of the item to estimate.

    Returns the similarity-weighted average rating, or 0 when the total
    similarity is 0.  Prints one line per compared item pair.
    """
    # .A below needs a matrix; asmatrix is a no-op for matrix input.
    dataMat = np.asmatrix(dataMat)
    n = np.shape(dataMat)[1]
    simTotal = 0.0
    ratSimTotal = 0.0
    for j in range(n):
        userRating = dataMat[user, j]
        if userRating == 0:
            continue
        # Rows (users) that rated both `item` and j.
        overLap = np.nonzero(np.logical_and(dataMat[:, item].A > 0,
                                            dataMat[:, j].A > 0))[0]
        if len(overLap) == 0:
            similarity = 0
        else:
            similarity = simMeas(dataMat[overLap, item],
                                 dataMat[overLap, j])
        print('the %d and %d similarity is: %f' % (item, j, similarity))
        simTotal += similarity
        ratSimTotal += similarity * userRating
    # Normalise by the total similarity.
    if simTotal == 0:
        return 0
    return ratSimTotal / simTotal
def svdEst(dataMat, user, simMeas, item, numSV=4):
    """Estimate the rating `user` would give `item` in a reduced SVD space.

    Items are projected onto the leading singular vectors of the rating
    matrix, and similarities are measured between the projected item
    vectors instead of the raw rating columns.

    dataMat: rating matrix indexed as [user, item]; 0 means "not rated".
    user:    row index of the user.
    simMeas: similarity function taking two column vectors.
    item:    column index of the item to estimate.
    numSV:   number of singular values to keep (default 4, the value that
             was previously hard-coded); clamped to the number available.

    Returns the similarity-weighted average rating, or 0 when the total
    similarity is 0.  Prints one line per compared item pair.
    Note: the SVD is recomputed on every call.
    """
    dataMat = np.asmatrix(dataMat)
    n = np.shape(dataMat)[1]
    simTotal = 0.0
    ratSimTotal = 0.0
    U, Sigma, _ = la.svd(dataMat)
    # Never ask for more singular values than the matrix has.
    k = numSV if numSV < len(Sigma) else len(Sigma)
    # Diagonal matrix of the k leading singular values.
    SigK = np.asmatrix(np.diag(Sigma[:k]))
    # Project the items into the k-dimensional space: one row per item.
    xformedItems = dataMat.T * U[:, :k] * SigK.I
    for j in range(n):
        userRating = dataMat[user, j]
        if userRating == 0 or j == item:
            continue
        similarity = simMeas(xformedItems[item, :].T,
                             xformedItems[j, :].T)
        print('the %d and %d similarity is: %f' % (item, j, similarity))
        simTotal += similarity
        ratSimTotal += similarity * userRating
    if simTotal == 0:
        return 0
    return ratSimTotal / simTotal
def recommend(dataMat, user, N=3, simMeas=cosSim, estMethod=standEst):
    """Recommend up to N unrated items for `user`.

    dataMat:   rating matrix indexed as [user, item]; 0 means "not rated".
    user:      row index of the user.
    N:         maximum number of recommendations to return.
    simMeas:   similarity function handed to `estMethod`.
    estMethod: estimator called as estMethod(dataMat, user, simMeas, item)
               that returns the estimated rating for one item.

    Returns a list of (item, estimatedScore) tuples sorted by descending
    score and truncated to N entries, or the string
    'you rated everything' when the user has no unrated item.
    """
    # .A below needs a matrix; asmatrix is a no-op for matrix input.
    dataMat = np.asmatrix(dataMat)
    # Column indices of the items this user has not rated.
    unratedItems = np.nonzero(dataMat[user, :].A == 0)[1]
    if len(unratedItems) == 0:
        return 'you rated everything'
    itemScores = [(item, estMethod(dataMat, user, simMeas, item))
                  for item in unratedItems]
    return sorted(itemScores, key=lambda jj: jj[1], reverse=True)[:N]
def printMat(inMat, thresh=0.8):
    """Print a 2-D matrix as rows of 0/1 flags.

    Each element prints as 1 when it is strictly greater than `thresh`
    and as 0 otherwise.  Flags are separated by single spaces and every
    row ends with a trailing space, as the original print statements did.

    inMat:  2-D matrix indexed as inMat[i, k].  Its own shape is used
            instead of a fixed 32x32 window, so smaller matrices no longer
            raise IndexError and larger ones are printed in full.
    thresh: threshold for printing 1.
    """
    numRows, numCols = np.shape(inMat)
    for i in range(numRows):
        flags = ['1' if float(inMat[i, k]) > thresh else '0'
                 for k in range(numCols)]
        # One print call per row works on both Python 2 and Python 3.
        print(' '.join(flags) + ' ')
def imgCompress(numSV=3, thresh=0.8, fileName='0_5.txt'):
    """Demonstrate SVD image compression on a 0/1 text image.

    Reads the image, prints it, rebuilds it from the `numSV` largest
    singular values and prints the reconstruction.

    numSV:    number of singular values used for the reconstruction.
    thresh:   threshold passed to printMat when printing both matrices.
    fileName: path of the text image; the first 32 characters of every
              line are read as digits.  Defaults to '0_5.txt', the path
              that was previously hard-coded.
    """
    width = 32
    # The with-block closes the file, which the original never did.
    with open(fileName) as imgFile:
        myl = [[int(line[i]) for i in range(width)] for line in imgFile]
    myMat = np.asmatrix(myl)
    print("****original matrix******")
    printMat(myMat, thresh)
    U, Sigma, VT = la.svd(myMat)
    # Diagonal matrix built from the leading singular values.
    SigRecon = np.asmatrix(np.diag(Sigma[:numSV]))
    # Reconstruct using only the first numSV components.
    reconMat = U[:, :numSV] * SigRecon * VT[:numSV, :]
    print("****reconstructed matrix using %d singular values******" % numSV)
    printMat(reconMat, thresh)