在矩阵分解中,有一类问题比较常见:矩阵的元素只有 0 和 1,对应的实际应用场景是用户对新闻的点击情况、对某些物品的购买情况等。由于基于 GraphChi 的矩阵分解结果不太理想,调研了相关文献后,本文代码主要实现了基于 PLSA 的分解方法,详情请参考文末的参考文献。
#!/usr/local/bin/python
# -*- coding: utf-8 -*-
import sys
import math
import numpy as np
import string
import random

"""
Reimplementation of the algorithm in aspect_bernoulli.m, which may be
useful for one-class matrix factorization (binary 0/1 data matrices).
"""


def get_unit_matrix(row, col):
    """Return a (row, col) integer matrix whose elements are all 1."""
    return np.ones((row, col), dtype=int)


def norm_zero(array):
    """Clamp every element below eps (1e-6) up to eps, in place.

    Mirrors Matlab's max(eps, M); prevents log(0) and division by zero
    in the update steps. Returns the (mutated) array for convenience.
    """
    # Vectorized form of the original element-by-element double loop;
    # same condition (strictly less than eps) and same in-place effect.
    array[array < 0.000001] = 0.000001
    return array


def estimate_rmse(X, AS):
    """Root-mean-square error between the data X and the reconstruction A*S.

    NOTE(review): this helper was called but never defined in the original
    source (a NameError on the first iteration); defined here as the
    standard RMSE over all T*N entries.
    """
    diff = X - AS
    return math.sqrt(np.sum(diff * diff) / diff.size)


def run_bernoulli(X, K, iter, ufile, vfile):
    """Factorize a binary matrix with the aspect Bernoulli model.

    Multiplicative EM-style updates; stops after `iter` iterations or
    when the mean log-likelihood changes by less than 1e-5.

    Parameters
    ----------
    X : binary data matrix (T x N); each column is an observation.
    K : number of aspects (components) to estimate.
    iter : maximum number of iterations. (Name kept for backward
        compatibility even though it shadows the builtin.)
    ufile : path where the T x K factor A is saved (np.savetxt).
    vfile : path where the K x N factor S is saved (np.savetxt).
    """
    X = np.array(X)
    T, N = X.shape

    # S: K x N aspect-probability matrix, columns normalized to sum to 1.
    S = np.random.rand(K, N)
    S_SUM = np.add.reduce(S)
    S_SUM_EXT = np.tile(S_SUM, (K, 1))
    S = S / S_SUM_EXT

    # A: T x K Bernoulli parameter matrix; A_TEMP caches 1 - A.
    A = np.random.rand(T, K)
    UM_A = get_unit_matrix(T, K)
    A_TEMP = UM_A - A

    UM_X = get_unit_matrix(T, N)
    X_TEMP = UM_X - X  # 1 - X

    l = []  # per-iteration mean log-likelihood, for convergence check
    for i in range(iter):
        # STEP 1: update S.
        AS = np.dot(A, S)                            # A*S
        AS_NORM = norm_zero(AS)                      # max(eps, A*S)
        X_AS_NORM = X / AS_NORM                      # X ./ max(eps, A*S)
        FIRST = np.dot(A.T, X_AS_NORM)               # A' * (X ./ max(eps, A*S))
        S_1 = S * FIRST
        AS_TEMP = UM_X - AS                          # 1 - A*S
        AS_TEMP_NORM = norm_zero(AS_TEMP)            # max(eps, 1 - A*S)
        X_AS_TEMP_NORM = X_TEMP / AS_TEMP_NORM       # (1-X) ./ max(eps, 1-A*S)
        SECOND = np.dot(A_TEMP.T, X_AS_TEMP_NORM)    # (1-A)' * ...
        S_2 = S * SECOND
        S = S_1 + S_2
        # Renormalize columns of S to sum to 1.
        S_SUM = np.add.reduce(S)
        S_SUM_EXT = np.tile(S_SUM, (K, 1))
        S = S / S_SUM_EXT

        # STEP 2: multiplicative update of A (uses the new S).
        AS = np.dot(A, S)
        AS_NORM = norm_zero(AS)
        X_AS_NORM = X / AS_NORM
        A = A * np.dot(X_AS_NORM, S.T)

        # STEP 3: analogous update of A_TEMP (the 1-A factor), then
        # renormalize so that A + A_TEMP == 1 elementwise.
        A_TEMP_S = np.dot(A_TEMP, S)                 # A1*S
        A_TEMP_S_NORM = norm_zero(A_TEMP_S)          # max(eps, A1*S)
        A_TEMP_S_NORM_X = X_TEMP / A_TEMP_S_NORM     # (1-X) ./ max(eps, A1*S)
        A_TEMP = A_TEMP * (np.dot(A_TEMP_S_NORM_X, S.T))
        A = A / (A + A_TEMP)                         # A ./ (A + A1)
        A_TEMP = UM_A - A                            # refresh 1 - A

        # STEP 4: Bernoulli log-likelihood (the loss being maximized):
        # sum(X.*log(max(eps,A*S)) + (1-X).*log(max(eps,1-A*S))) / (N*T)
        AS = np.dot(A, S)
        AS_NORM = norm_zero(AS)
        AS_NORM = np.log(AS_NORM)
        LOSS_FIRST = X * AS_NORM
        AS_TEMP = UM_X - AS
        AS_TEMP_NORM = norm_zero(AS_TEMP)
        AS_TEMP_NORM = np.log(AS_TEMP_NORM)
        LOSS_SECOND = X_TEMP * AS_TEMP_NORM
        LOSS = LOSS_FIRST + LOSS_SECOND
        LOSS_SUM = np.add.reduce(LOSS)
        loss = np.sum(LOSS_SUM) / (N * T)
        l.append(loss)
        rmse = estimate_rmse(X, AS)
        # Stop when the loss has effectively converged (as in original:
        # checked from the third iteration on, and before printing).
        if i > 1:
            if math.fabs(l[i] - l[i - 1]) < 0.00001:
                break
        print("iter = %d, loss = %f, rmse = %f" % (i, loss, rmse))

    np.savetxt(ufile, A)
    np.savetxt(vfile, S)
参考文献:
1. The aspect Bernoulli model: multiple causes of presences and absences (本文代码算法所对应的论文)
2. One-Class Matrix Completion with Low-Density Factorizations