推荐经典算法实现之BPMF(pymc3 + MovieLens)

BPMF是用贝叶斯推断方法求解MF的概率模型,参考:https://gist.github.com/macks22/00a17b1d374dfc267a9a

1、利用其本身数据集的代码如下:

# -*- Encoding:UTF-8 -*-
'''
@author: Jason.F
@data: 2019.07.22
@function: Implementing BPMF
           Dataset: Jester dense subset (100x20 CSV)
           Evaluating: RMSE
           https://www.cs.toronto.edu/~amnih/papers/bpmf.pdf
@reference: https://gist.github.com/macks22/00a17b1d374dfc267a9a
'''
import sys
import time
import logging

import pymc3 as pm
import numpy as np
import pandas as pd
import theano
import theano.tensor as t
import scipy as sp
import math

# Exit status passed to sys.exit() when the ratings CSV cannot be read.
DATA_NOT_FOUND = -1


# data from: https://gist.github.com/macks22/b40ac9c685e920ad3ca2
def read_jester_data(fname='/data/tmpexec/jester-dense-subset-100x20.csv'):
    """Read the Jester ratings CSV and randomly split it into train/test.

    One tenth (rounded down) of the observed, i.e. non-NaN, ratings is moved
    into the test matrix. For a fully dense file this is 10% of all cells.

    :param str fname: Path of the CSV file, loaded with ``pandas.read_csv``.
    :returns: ``(train, test)`` float ndarrays shaped like the CSV table.
        Held-out cells are NaN in ``train``; every cell that was not held
        out is NaN in ``test``.

    If the file cannot be read, the error and a download URL are printed and
    the process exits via ``sys.exit(DATA_NOT_FOUND)``.
    """
    logging.info('reading data')
    try:
        data = pd.read_csv(fname)
    except IOError as err:
        print (str(err))
        url = 'https://gist.github.com/macks22/b40ac9c685e920ad3ca2'
        print ('download from: %s' % url)
        sys.exit(DATA_NOT_FOUND)

    logging.info('splitting train/test sets')
    # Cast to float (this also copies) so NaN can be stored even when pandas
    # inferred an integer dtype for the ratings.
    train = data.values.astype(float)
    test = np.full(train.shape, np.nan)

    # Only cells that actually hold a rating are candidates for the test set.
    observed_rows, observed_cols = np.where(~np.isnan(train))
    n_observed = len(observed_rows)
    n_missing = train.size - n_observed
    test_size = n_observed // 10

    # Draw the held-out cells without replacement and move them in one step.
    picked = np.random.choice(n_observed, replace=False, size=test_size)
    rows, cols = observed_rows[picked], observed_cols[picked]
    test[rows, cols] = train[rows, cols]
    train[rows, cols] = np.nan

    # Internal sanity checks; cells that were already missing stay NaN in both.
    assert np.isnan(train).sum() == n_missing + test_size
    assert np.isnan(test).sum() == train.size - test_size

    return train, test


def build_pmf_model(train, alpha=2, dim=10, std=0.01):
    """Assemble the Probabilistic Matrix Factorization (PMF) model in pymc3.

    Missing (NaN) ratings are replaced by the mean of the observed ones on a
    copy of ``train``. U and V get zero-mean Gaussian priors and start from a
    small amount of Gaussian noise (``testval``) rather than exactly 0.

    :param np.ndarray train: Rating matrix to fit; NaN marks a missing rating.
    :param int alpha: Fixed precision of the Gaussian rating likelihood.
    :param int dim: Number of latent factors (rank of the approximation).
    :param float std: Standard deviation of the noise used for ``testval``.
    :returns: The constructed ``pm.Model``.
    """
    # Mean-impute on a private copy so the caller's array is left untouched.
    ratings = train.copy()
    missing = np.isnan(ratings)
    ratings[missing] = ratings[~missing].mean()

    n_users, n_items = ratings.shape

    # Prior precisions are point estimates from the data: the inverse of the
    # mean per-row / per-column variance. Low precision reflects uncertainty.
    user_precision = 1 / ratings.var(axis=1).mean()
    item_precision = 1 / ratings.var(axis=0).mean()

    logging.info('building the PMF model')
    with pm.Model() as pmf:
        U = pm.MvNormal(
            'U',
            mu=0,
            tau=user_precision * np.eye(dim),
            shape=(n_users, dim),
            testval=np.random.randn(n_users, dim) * std,
        )
        V = pm.MvNormal(
            'V',
            mu=0,
            tau=item_precision * np.eye(dim),
            shape=(n_items, dim),
            testval=np.random.randn(n_items, dim) * std,
        )
        # Rating likelihood: registered on the model by name, no binding needed.
        pm.Normal(
            'R',
            mu=t.dot(U, V.T),
            tau=alpha * np.ones(ratings.shape),
            observed=ratings,
        )
    logging.info('done building PMF model')
    return pmf


def build_bpmf_model(train, alpha=2, dim=10, std=0.01):
    """Assemble the original BPMF model with Wishart priors on the precisions.

    Kept for reference: the script notes that this variant cannot be sampled
    because of limitations in pymc3's Wishart implementation.

    :param np.ndarray train: Rating matrix to fit; NaN marks a missing rating.
    :param int alpha: Fixed precision of the Gaussian rating likelihood.
    :param int dim: Number of latent factors.
    :param float std: Standard deviation of the noise used for ``testval``.
    :returns: The constructed ``pm.Model``.
    """
    # Mean-impute on a private copy so the caller's array is left untouched.
    ratings = train.copy()
    missing = np.isnan(ratings)
    ratings[missing] = ratings[~missing].mean()

    n_users, n_items = ratings.shape
    beta_0 = 1  # scaling factor applied to the precision of the mean priors
    wishart_scale = np.eye(dim)

    logging.info('building the BPMF model')
    with pm.Model() as bpmf:
        # User side: precision matrix, mean vector, factor matrix.
        lambda_u = pm.Wishart(
            'lambda_u', n=dim, V=wishart_scale, shape=(dim, dim),
            testval=np.random.randn(dim, dim) * std)
        mu_u = pm.Normal(
            'mu_u', mu=0, tau=beta_0 * lambda_u, shape=dim,
            testval=np.random.randn(dim) * std)
        U = pm.MvNormal(
            'U', mu=mu_u, tau=lambda_u, shape=(n_users, dim),
            testval=np.random.randn(n_users, dim) * std)

        # Item side: same structure as the user side.
        lambda_v = pm.Wishart(
            'lambda_v', n=dim, V=wishart_scale, shape=(dim, dim),
            testval=np.random.randn(dim, dim) * std)
        mu_v = pm.Normal(
            'mu_v', mu=0, tau=beta_0 * lambda_v, shape=dim,
            testval=np.random.randn(dim) * std)
        V = pm.MvNormal(
            'V', mu=mu_v, tau=lambda_v, shape=(n_items, dim),
            testval=np.random.randn(n_items, dim) * std)

        # Rating likelihood: registered on the model by name.
        pm.Normal(
            'R', mu=t.dot(U, V.T), tau=alpha * np.ones((n_users, n_items)),
            observed=ratings)

    logging.info('done building the BPMF model')
    return bpmf


def _correlated_factor_prior(label, count, dim, tri_index, n_elem, beta_0, std):
    """Add one side (users or items) of the modified BPMF prior to the model.

    Intended to be called inside a ``pm.Model`` context. Creates the random
    variables ``sigma_<label>``, ``corr_<label>``, ``mu_<label>`` and the
    factor matrix named ``label.upper()``, and returns that factor matrix.
    """
    sigma = pm.Uniform('sigma_' + label, shape=dim)
    corr_triangle = pm.LKJCorr(
        'corr_' + label, n=1, p=dim, testval=np.random.randn(n_elem) * std)

    # Expand the packed off-diagonal values into a full symmetric matrix,
    # then put ones on the diagonal to obtain the correlation matrix.
    corr_matrix = t.fill_diagonal(corr_triangle[tri_index], 1)
    scale = t.diag(sigma)
    cov_matrix = scale.dot(corr_matrix.dot(scale))
    precision = t.nlinalg.matrix_inverse(cov_matrix)

    mu = pm.Normal(
        'mu_' + label, mu=0, tau=beta_0 * t.diag(precision), shape=dim,
        testval=np.random.randn(dim) * std)
    return pm.MvNormal(
        label.upper(), mu=mu, tau=precision, shape=(count, dim),
        testval=np.random.randn(count, dim) * std)


def build_mod_bpmf_model(train, alpha=2, dim=10, std=0.01):
    """Build the modified BPMF model using pymc3.

    The original model uses Wishart priors on the covariance matrices, which
    the script notes are not suitable for sampling in pymc3. This version
    decomposes each covariance matrix as

        diag(sigma) . corr_matrix . diag(sigma)

    with uniform priors on the standard deviations and LKJCorr priors on the
    correlation matrices:

        sigma ~ Uniform
        corr_matrix ~ LKJCorr(n=1, p=dim)

    :param np.ndarray train: Rating matrix to fit; NaN marks a missing rating.
    :param int alpha: Fixed precision of the Gaussian rating likelihood.
    :param int dim: Number of latent factors.
    :param float std: Standard deviation of the noise used for ``testval``.
    :returns: The constructed ``pm.Model``.
    """
    n, m = train.shape
    beta_0 = 1  # scaling factor applied to the precision of the mean priors

    # Mean value imputation on a copy of the training data.
    train = train.copy()
    nan_mask = np.isnan(train)
    train[nan_mask] = train[~nan_mask].mean()

    # LKJCorr yields only the packed upper-triangular correlations. This
    # index matrix maps every off-diagonal (row, col) position, in both
    # triangles, to its slot in that packed vector.
    n_elem = int(dim * (dim - 1) / 2)
    tri_index = np.zeros([dim, dim], dtype=int)
    tri_index[np.triu_indices(dim, k=1)] = np.arange(n_elem)
    tri_index[np.triu_indices(dim, k=1)[::-1]] = np.arange(n_elem)

    logging.info('building the BPMF model')
    with pm.Model() as bpmf:
        # User side first, then item side, so random initial values are
        # drawn in the same order as before the refactoring.
        U = _correlated_factor_prior('u', n, dim, tri_index, n_elem, beta_0, std)
        V = _correlated_factor_prior('v', m, dim, tri_index, n_elem, beta_0, std)

        # Rating likelihood: registered on the model by name.
        pm.Normal('R', mu=t.dot(U, V.T), tau=alpha * np.ones((n, m)), observed=train)

    logging.info('done building the BPMF model')
    return bpmf


if __name__ == "__main__":
    logging.basicConfig(level=logging.INFO,format='[%(asctime)s]: %(message)s')

    # Read data and build PMF model.
    train, test = read_jester_data()
    pmf = build_pmf_model(train)

    # Find the mode of the PMF posterior with pymc3's default optimizer.
    with pmf:
        tstart = time.time()
        logging.info('finding PMF MAP using the default optimizer')
        start = pm.find_MAP()
        elapsed = time.time() - tstart
        logging.info('found PMF MAP in %d seconds' % int(elapsed))

    # Build the modified BPMF model using same default params as PMF.
    mod_bpmf = build_mod_bpmf_model(train)

    # Use the PMF MAP to initialize sampling for modified BPMF; variables
    # that only exist in the BPMF model fall back to their test point.
    for key, value in mod_bpmf.test_point.items():
        start.setdefault(key, value)

    # Attempt to sample with modified BPMF
    # (this part raises PositiveDefiniteError when using the normal BPMF model).
    with mod_bpmf:
        nsamples = 100
        njobs = 2
        logging.info( 'drawing %d MCMC samples using %d jobs' % (nsamples, njobs))
        step = pm.NUTS(scaling=start)
        # `cores` is the current name of the deprecated `njobs` argument.
        trace = pm.sample(nsamples, step, start=start, cores=njobs)
        ppc = pm.sample_posterior_predictive(trace, progressbar=True)

    # Average the posterior predictive draws (first axis) per matrix cell.
    nR = np.mean(ppc['R'], 0)

    def getrmse(predictions, targets):
        """Return the root-mean-square error between two equally shaped arrays."""
        return np.sqrt(((predictions - targets) ** 2).mean())

    # Score all held-out cells at once. Averaging per-cell values of
    # sqrt(err ** 2) would give the mean absolute error, not the RMSE.
    held_out = ~np.isnan(test)
    print (getrmse(nR[held_out], test[held_out]))

2、改用 MovieLens-1M 数据集后采样极慢(见文末日志,约 15 秒/次采样),实际上无法跑完,具体原因尚未查明,有兴趣者可研究,代码如下:

# -*- Encoding:UTF-8 -*-
'''
@author: Jason.F
@data: 2019.07.22
@function: Implementing BPMF by MCMC
           Dataset: MovieLens Dataset (ml-1m)
           Evaluating: hit ratio, NDCG
           https://www.cs.toronto.edu/~amnih/papers/bpmf.pdf
@reference: https://gist.github.com/macks22/00a17b1d374dfc267a9a
'''
import sys
import time
import logging

import pymc3 as pm
import numpy as np
import pandas as pd
import theano
import theano.tensor as t
import heapq
import math

def getTraindata(filePath='/data/fjsdata/ctKngBase/ml/ml-1m.train.rating'):
    """Load tab-separated ``user, item, rating`` rows into a dense matrix.

    Only the first three columns of each line are used. Blank lines are
    skipped, and a final line without a trailing newline is read in full.

    :param str filePath: Path of the training ratings file.
    :returns: ``np.float32`` array of shape ``(max_user + 1, max_item + 1)``
        holding the rating at ``[user, item]`` and 0 everywhere else.
    """
    data = []
    max_user = 0
    max_item = 0
    with open(filePath, 'r') as f:
        for line in f:
            # Strip only the line terminator: slicing off the last character
            # would eat a digit when the final line has no newline.
            record = line.rstrip('\r\n')
            if not record.strip():
                continue
            fields = record.split("\t")
            user = int(fields[0])
            item = int(fields[1])
            score = float(fields[2])
            data.append((user, item, score))
            max_user = max(max_user, user)
            max_item = max(max_item, item)
    # The reported "Num" values are the largest ids seen, not distinct counts.
    print("Loading Success!\n"
                  "Data Info:\n"
                  "\tUser Num: {}\n"
                  "\tItem Num: {}\n"
                  "\tData Size: {}".format(max_user, max_item, len(data)))
    R = np.zeros([max_user + 1, max_item + 1], dtype=np.float32)
    for user, item, score in data:
        R[user, item] = score
    return R
def getTestdata(filePath='/data/fjsdata/ctKngBase/ml/ml-1m.test.negative'):
    """Load the ranking test set as a flat list of ``[user, item]`` pairs.

    Each non-blank line starts with a ``(user, positive_item)`` literal,
    followed by tab-separated negative item ids. For every line the positive
    pair is emitted first and the negatives follow in file order.

    :param str filePath: Path of the negative-sampling test file.
    :returns: List of ``[user, item]`` lists.
    """
    # Local import keeps this function self-contained; literal_eval parses
    # the tuple literal without executing arbitrary code as eval() would.
    import ast

    testset = []
    with open(filePath, 'r') as fd:
        for line in fd:
            if not line.strip():
                continue  # tolerate blank / trailing empty lines
            arr = line.rstrip('\r\n').split('\t')
            pair = ast.literal_eval(arr[0])
            u = pair[0]
            testset.append([u, pair[1]])  # one positive item
            testset.extend([u, int(neg)] for neg in arr[1:])  # negative items
    return testset


def build_pmf_model(train, alpha=2, dim=8, std=0.01):
    """Assemble the Probabilistic Matrix Factorization (PMF) model in pymc3.

    NaN entries, if any, are replaced by the mean of the non-NaN ones on a
    copy of ``train``. U and V get zero-mean Gaussian priors and start from a
    small amount of Gaussian noise (``testval``) rather than exactly 0.

    :param np.ndarray train: Rating matrix to fit.
    :param int alpha: Fixed precision of the Gaussian rating likelihood.
    :param int dim: Number of latent factors (rank of the approximation).
    :param float std: Standard deviation of the noise used for ``testval``.
    :returns: The constructed ``pm.Model``.
    """
    # Mean-impute on a private copy so the caller's array is left untouched.
    ratings = train.copy()
    missing = np.isnan(ratings)
    ratings[missing] = ratings[~missing].mean()

    n_users, n_items = ratings.shape

    # Prior precisions are point estimates from the data: the inverse of the
    # mean per-row / per-column variance. Low precision reflects uncertainty.
    user_precision = 1 / ratings.var(axis=1).mean()
    item_precision = 1 / ratings.var(axis=0).mean()

    logging.info('building the PMF model')
    with pm.Model() as pmf:
        U = pm.MvNormal(
            'U',
            mu=0,
            tau=user_precision * np.eye(dim),
            shape=(n_users, dim),
            testval=np.random.randn(n_users, dim) * std,
        )
        V = pm.MvNormal(
            'V',
            mu=0,
            tau=item_precision * np.eye(dim),
            shape=(n_items, dim),
            testval=np.random.randn(n_items, dim) * std,
        )
        # Rating likelihood: registered on the model by name, no binding needed.
        pm.Normal(
            'R',
            mu=t.dot(U, V.T),
            tau=alpha * np.ones(ratings.shape),
            observed=ratings,
        )
    logging.info('done building PMF model')
    return pmf


def build_bpmf_model(train, alpha=2, dim=8, std=0.01):
    """Assemble the original BPMF model with Wishart priors on the precisions.

    Kept for reference: the script notes that this variant cannot be sampled
    because of limitations in pymc3's Wishart implementation.

    :param np.ndarray train: Rating matrix to fit.
    :param int alpha: Fixed precision of the Gaussian rating likelihood.
    :param int dim: Number of latent factors.
    :param float std: Standard deviation of the noise used for ``testval``.
    :returns: The constructed ``pm.Model``.
    """
    # Mean-impute on a private copy so the caller's array is left untouched.
    ratings = train.copy()
    missing = np.isnan(ratings)
    ratings[missing] = ratings[~missing].mean()

    n_users, n_items = ratings.shape
    beta_0 = 1  # scaling factor applied to the precision of the mean priors
    wishart_scale = np.eye(dim)

    logging.info('building the BPMF model')
    with pm.Model() as bpmf:
        # User side: precision matrix, mean vector, factor matrix.
        lambda_u = pm.Wishart(
            'lambda_u', n=dim, V=wishart_scale, shape=(dim, dim),
            testval=np.random.randn(dim, dim) * std)
        mu_u = pm.Normal(
            'mu_u', mu=0, tau=beta_0 * lambda_u, shape=dim,
            testval=np.random.randn(dim) * std)
        U = pm.MvNormal(
            'U', mu=mu_u, tau=lambda_u, shape=(n_users, dim),
            testval=np.random.randn(n_users, dim) * std)

        # Item side: same structure as the user side.
        lambda_v = pm.Wishart(
            'lambda_v', n=dim, V=wishart_scale, shape=(dim, dim),
            testval=np.random.randn(dim, dim) * std)
        mu_v = pm.Normal(
            'mu_v', mu=0, tau=beta_0 * lambda_v, shape=dim,
            testval=np.random.randn(dim) * std)
        V = pm.MvNormal(
            'V', mu=mu_v, tau=lambda_v, shape=(n_items, dim),
            testval=np.random.randn(n_items, dim) * std)

        # Rating likelihood: registered on the model by name.
        pm.Normal(
            'R', mu=t.dot(U, V.T), tau=alpha * np.ones((n_users, n_items)),
            observed=ratings)

    logging.info('done building the BPMF model')
    return bpmf


def _correlated_factor_prior(label, count, dim, tri_index, n_elem, beta_0, std):
    """Add one side (users or items) of the modified BPMF prior to the model.

    Intended to be called inside a ``pm.Model`` context. Creates the random
    variables ``sigma_<label>``, ``corr_<label>``, ``mu_<label>`` and the
    factor matrix named ``label.upper()``, and returns that factor matrix.
    """
    sigma = pm.Uniform('sigma_' + label, shape=dim)
    corr_triangle = pm.LKJCorr(
        'corr_' + label, n=1, p=dim, testval=np.random.randn(n_elem) * std)

    # Expand the packed off-diagonal values into a full symmetric matrix,
    # then put ones on the diagonal to obtain the correlation matrix.
    corr_matrix = t.fill_diagonal(corr_triangle[tri_index], 1)
    scale = t.diag(sigma)
    cov_matrix = scale.dot(corr_matrix.dot(scale))
    precision = t.nlinalg.matrix_inverse(cov_matrix)

    mu = pm.Normal(
        'mu_' + label, mu=0, tau=beta_0 * t.diag(precision), shape=dim,
        testval=np.random.randn(dim) * std)
    return pm.MvNormal(
        label.upper(), mu=mu, tau=precision, shape=(count, dim),
        testval=np.random.randn(count, dim) * std)


def build_mod_bpmf_model(train, alpha=2, dim=8, std=0.01):
    """Build the modified BPMF model using pymc3.

    The original model uses Wishart priors on the covariance matrices, which
    the script notes are not suitable for sampling in pymc3. This version
    decomposes each covariance matrix as

        diag(sigma) . corr_matrix . diag(sigma)

    with uniform priors on the standard deviations and LKJCorr priors on the
    correlation matrices:

        sigma ~ Uniform
        corr_matrix ~ LKJCorr(n=1, p=dim)

    :param np.ndarray train: Rating matrix to fit.
    :param int alpha: Fixed precision of the Gaussian rating likelihood.
    :param int dim: Number of latent factors.
    :param float std: Standard deviation of the noise used for ``testval``.
    :returns: The constructed ``pm.Model``.
    """
    n, m = train.shape
    beta_0 = 1  # scaling factor applied to the precision of the mean priors

    # Mean value imputation on a copy of the training data.
    train = train.copy()
    nan_mask = np.isnan(train)
    train[nan_mask] = train[~nan_mask].mean()

    # LKJCorr yields only the packed upper-triangular correlations. This
    # index matrix maps every off-diagonal (row, col) position, in both
    # triangles, to its slot in that packed vector.
    n_elem = int(dim * (dim - 1) / 2)
    tri_index = np.zeros([dim, dim], dtype=int)
    tri_index[np.triu_indices(dim, k=1)] = np.arange(n_elem)
    tri_index[np.triu_indices(dim, k=1)[::-1]] = np.arange(n_elem)

    logging.info('building the BPMF model')
    with pm.Model() as bpmf:
        # User side first, then item side, so random initial values are
        # drawn in the same order as before the refactoring.
        U = _correlated_factor_prior('u', n, dim, tri_index, n_elem, beta_0, std)
        V = _correlated_factor_prior('v', m, dim, tri_index, n_elem, beta_0, std)

        # Rating likelihood: registered on the model by name.
        pm.Normal('R', mu=t.dot(U, V.T), tau=alpha * np.ones((n, m)), observed=train)

    logging.info('done building the BPMF model')
    return bpmf

def getHitRatio(ranklist, targetItem):
    """Return 1 if ``targetItem`` equals any entry of ``ranklist``, else 0."""
    return int(any(candidate == targetItem for candidate in ranklist))
def getNDCG(ranklist, targetItem):
    """Return the NDCG of a single relevant item within ``ranklist``.

    The value is ``log(2) / log(position + 2)`` for the first 0-based
    position whose entry equals ``targetItem``, and 0 if there is none.
    """
    position = next(
        (idx for idx, candidate in enumerate(ranklist) if candidate == targetItem),
        None,
    )
    if position is None:
        return 0
    return math.log(2) / math.log(position + 2)


if __name__ == "__main__":
    import itertools  # only needed by the per-user evaluation below

    logging.basicConfig(level=logging.INFO,format='[%(asctime)s]: %(message)s')

    # Read data and build the modified BPMF model.
    train = getTraindata()
    bpmf = build_mod_bpmf_model(train, dim=8)#dim is the number of latent factors

    with bpmf:# sample with BPMF
        tstart = time.time()
        logging.info('Starting BPMF training')
        step = pm.NUTS()
        trace = pm.sample(100, step)
        elapsed = time.time() - tstart
        logging.info('Completed BPMF in %d seconds' % int(elapsed))

    with bpmf:#evaluation
        testset = getTestdata()
        ppc = pm.sample_posterior_predictive(trace, progressbar=True)

    # Average the posterior predictive draws (first axis) per matrix cell.
    nR = np.mean(ppc['R'],0)

    topn = 10  # length of the ranked list used for HR / NDCG
    hits = []
    ndcgs = []
    # The test set lists each user's candidates consecutively, positive item
    # first. Grouping by consecutive user id scores every user, including
    # the last one, which a "flush on user change" loop silently drops.
    for u, pairs in itertools.groupby(testset, key=lambda pair: pair[0]):
        items = [item for _, item in pairs]
        pos_i = items[0]
        map_item_score = {item: nR[u, item] for item in items}
        ranklist = heapq.nlargest(topn, map_item_score, key=map_item_score.get)
        hits.append(getHitRatio(ranklist, pos_i))
        ndcgs.append(getNDCG(ranklist, pos_i))

    hitratio,ndcg = np.array(hits).mean(), np.array(ndcgs).mean()
    # Report the actual cut-off rather than the latent dimension.
    print("hr: {}, NDCG: {}, At K {}".format(hitratio, ndcg, topn))

训练一直卡在:

Loading Success!
Data Info:
	User Num: 6039
	Item Num: 3705
	Data Size: 994169
[2019-07-23 07:26:00,509]: building the BPMF model
[2019-07-23 07:26:21,704]: done building the BPMF model
[2019-07-23 07:26:21,709]: finding PMF MAP using Powell optimization
Only 100 samples in chain.
[2019-07-23 07:26:40,130]: Only 100 samples in chain.
Multiprocess sampling (4 chains in 4 jobs)
[2019-07-23 07:26:40,147]: Multiprocess sampling (4 chains in 4 jobs)
NUTS: [V, mu_v, corr_v, sigma_v, U, mu_u, corr_u, sigma_u]
[2019-07-23 07:26:40,153]: NUTS: [V, mu_v, corr_v, sigma_v, U, mu_u, corr_u, sigma_u]
Sampling 4 chains:   0%|          | 12/2400 [01:47<10:26:46, 15.75s/draws]

BPMF是用贝叶斯MCMC推断方法求解MF概率模型,和笔者下一篇BMF模型思路一致。

你可能感兴趣的:(Algorithm)