Python3《机器学习实战》代码笔记(十四)--- SVD算法

参考资料:

机器学习实战

'''
@version: 0.0.1
@Author: tqrs
@dev: python3 vscode
@Date: 2019-11-12 21:09:57
@LastEditTime: 2019-11-12 21:58:07
@FilePath: \\机器学习实战\\14-SVD算法\\SVD.py
@Descripttion: SVD是从有噪声数据中抽取相关特征,利用SVD来逼近矩阵并从中提取重要特征,通过保留矩阵80%~90%的能量,
就可以得到重要的特征并去掉噪声
'''


import numpy as np


def loadExData():
    return [[0, 0, 0, 2, 2],
            [0, 5, 0, 3, 3],
            [0, 0, 0, 1, 1],
            [1, 1, 1, 0, 0],
            [2, 1, 2, 4, 0],
            [5, 5, 5, 3, 0],
            [1, 1, 1, 2, 0]]


def loadExData2():
    return [[0, 0, 0, 0, 0, 4, 0, 0, 0, 0, 5],
            [0, 0, 0, 3, 0, 4, 0, 0, 0, 0, 3],
            [0, 0, 0, 0, 4, 0, 0, 1, 0, 4, 0],
            [3, 3, 4, 0, 0, 0, 0, 2, 2, 0, 0],
            [5, 4, 5, 0, 0, 0, 0, 5, 5, 0, 0],
            [0, 0, 0, 0, 5, 0, 1, 0, 0, 5, 0],
            [4, 3, 4, 0, 0, 0, 0, 5, 5, 0, 1],
            [0, 0, 0, 4, 0, 4, 0, 0, 0, 0, 4],
            [0, 0, 0, 2, 0, 2, 5, 0, 0, 1, 2],
            [0, 0, 0, 0, 5, 0, 0, 0, 0, 4, 0],
            [1, 0, 0, 0, 0, 0, 0, 1, 2, 0, 0]]


def ecludSim(inA, inB):
    """计算两个列向量的欧氏距离"""
    return 1.0 / (1.0 + np.linalg.norm(inA - inB))


def pearsSim(inA, inB):
    """计算两个列向量的皮尔逊相关系数"""
    if len(inA) < 3:
        return 1.0
    return 0.5 + 0.5 * np.corrcoef(inA, inB, rowvar=False)[0][1]


def cosSim(inA, inB):
    """计算两个列向量的是余弦相似度"""
    num = float(inA.T * inB)
    denom = np.linalg.norm(inA) * np.linalg.norm(inB)
    return 0.5 + 0.5 * (num / denom)


def test_Sim():
    """测试三种距离算法"""
    myDat = np.mat(loadExData())
    ecl = ecludSim(myDat[:, 0], myDat[:, 4])
    print(ecl)
    cos = cosSim(myDat[:, 0], myDat[:, 4])
    print(cos)
    pear = pearsSim(myDat[:, 0], myDat[:, 4])
    print(pear)


def standEst(dataMat, user, simMeas, item):
    """
    [summary]:计算在给定相似度计算方法的 条件下,用户对物品的估计评分值

    Arguments:
        dataMat  -- 数据集
        user -- 
        simMeas  -- 
        item -- 
    
    Returns:
        [type] -- [description]
    """
    n = np.shape(dataMat)[1] # 行数
    simTotal = 0.0
    ratSimTotal = 0.0

    for j in range(n):
        userRating = dataMat[user, j]
        if userRating == 0 or j == item:
            continue
        # 寻找两个用户都评级的物品
        overlap = np.nonzero(
            np.logical_and(dataMat[:, item].A > 0, dataMat[:, j].A > 0))[0]
        # 没有任何重合元素,相似度为0
        if len(overlap) == 0:
            similarity = 0
        else:
            similarity = simMeas(dataMat[overlap, j], dataMat[overlap, item])
        print('the {:d} and {:d} similarity is:{:.6f}'.format(
            item, j, similarity))
        simTotal += similarity
        ratSimTotal += similarity * userRating
    if simTotal == 0:
        return 0
    else:
        return ratSimTotal / simTotal


def recommend(dataMat, user, N=3, simMeas=cosSim, estMethod=standEst):
    """
    [summary]:
        (1) 寻找用户没有评级的菜肴,即在用户-物品矩阵中的0值
        (2) 在用户没有评级的所有物品中,对每个物品预计一个可能的评级分数。这就是说,我们认为用户可能会对物品的打分(这就是相似度计算的初衷)
        (3) 对这些物品的评分从高到低进行排序,返回前N个物品
    
    Arguments:
        dataMat {[type]} -- [description]
        user {[type]} -- [description]
    
    Keyword Arguments:
        N {int} -- [description] (default: {3})
        simMeas {[type]} -- [description] (default: {cosSim})
        estMethod {[type]} -- [description] (default: {standEst})
    
    Returns:
        [type] -- [description]
    """
    # 寻找未评级物品
    unratedItems = np.nonzero(dataMat[user, :].A == 0)[1]
    if len(unratedItems) == 0:
        return 'all rated'
    itemScores = [(item, estMethod(dataMat, user, simMeas, item))
                  for item in unratedItems]
    # itemScores = []
    # for item in unratedItems:
    #     estimatedScore=estMethod(dataMat,user,simMeas,item)
    #     itemScores.append((item,estimatedScore))
    # 寻找前N个未评级物品
    return sorted(itemScores, key=lambda x: x[1], reverse=True)[:N]


def test_recommend():
    myDat = np.mat(loadExData())
    myDat[0, 1] = myDat[0, 0] = myDat[1, 0] = myDat[2, 0] = 4
    myDat[3, 3] = 2
    print(recommend(myDat, 2))


def svdEst(dataMat, user, simMeas, item):
    """
    [summary]:数对给定用户给定物品构建了一个评分估计值
    
    Arguments:
        dataMat {[type]} -- [description]
        user {[type]} -- [description]
        simMeas {[type]} -- [description]
        item {[type]} -- [description]
    
    Returns:
        [type] -- [description]
    """
    # 行数
    n = np.shape(dataMat)[1]
    simTotal = 0.0
    ratSimTotal = 0.0
    # SVD分解
    U, Sigma, VT = np.linalg.svd(dataMat)
    # 构建对角矩阵,Sigma[:4]只包含90%能量值的奇异值
    Sig4 = np.mat(np.eye(4) * Sigma[:4])
    xfromedItems = dataMat.T * U[:, :4] * Sig4.I
    for j in range(n):
        userRating = dataMat[user, j]
        if userRating == 0 or j == item:
            continue
        similarity = simMeas(xfromedItems[item, :].T, xfromedItems[j, :].T)
        print('the {:d} and {:d} similarity is:{:.6f}'.format(
            item, j, similarity))
        simTotal += similarity
        ratSimTotal += similarity * userRating
    if simTotal == 0:
        return 0
    else:
        return ratSimTotal / simTotal


def test_svdEst():
    myDat = np.mat(loadExData2())
    # print(recommend(myDat, 1, estMethod=svdEst))
    print(recommend(myDat, 1, estMethod=svdEst, simMeas=pearsSim))


# 打印矩阵
def printMat(inMat, thresh=0.8):
    for i in range(32):
        for k in range(32):
            if float(inMat[i, k]) > thresh:
                print(1, end='')  # 不换行
            else:
                print(0, end='')
        print(' ')


def imgCompress(numSV=3, thresh=0.8):
    myl = []
    for line in open(r'14-SVD算法\0_5.txt').readlines():
        newRow = []
        for i in range(32):
            newRow.append(int(line[i]))
        myl.append(newRow)
    myMat = np.mat(myl)
    print("****original matrix******")
    printMat(myMat, thresh)
    U, Sigma, VT = np.linalg.svd(myMat)
    SigRecon = np.mat(np.zeros((numSV, numSV)))
    for k in range(numSV):  # construct diagonal matrix from vector
        SigRecon[k, k] = Sigma[k]
    reconMat = U[:, :numSV] * SigRecon * VT[:numSV, :]
    print("****reconstructed matrix using %d singular values******" % numSV)
    printMat(reconMat, thresh)


if __name__ == '__main__':
    # test_Sim()
    # test_recommend()
    # test_svdEst()
    imgCompress(2)

你可能感兴趣的:(机器学习)