FM(Factorization Machine,因子分解机)代码实现汇总

1. https://www.jianshu.com/p/610dff83f709

import numpy as np
from random import normalvariate
# from sklearn_pandas import DataFrameMapper
from sklearn.preprocessing import MinMaxScaler as MM
import pandas as pd
# Load the Pima diabetes train/test splits: comma-separated, no header row;
# the last column is the 0/1 diabetes label, the rest are numeric features.
data_train = pd.read_csv('diabetes_train.txt', header=None)
data_test = pd.read_csv('diabetes_test.txt', header=None)


def preprocessing(data_input):
    """Min-max scale the features and remap labels {0, 1} -> {-1, +1}.

    Args:
        data_input: DataFrame whose last column is the 0/1 class label and
            whose remaining columns are numeric features. Modified in place
            (the label column is rewritten).

    Returns:
        (feature, label): feature is an np.matrix of features scaled to
        [0, 1]; label is a 1-D np.array of -1/+1 labels.
    """
    standardopt = MM()
    # BUG FIX: the original called .replace(0, -1, inplace=True) on the
    # .iloc column slice; an in-place mutation of a slice may act on a
    # temporary copy (SettingWithCopy) and is deprecated -- pandas 3.0
    # removes that behavior entirely. Assign the result back instead.
    data_input.iloc[:, -1] = data_input.iloc[:, -1].replace(0, -1)
    feature = data_input.iloc[:, :-1]
    # fit_transform returns an ndarray; convert to np.matrix so the FM
    # training code can use `*` as matrix multiplication.
    feature = standardopt.fit_transform(feature)
    feature = np.mat(feature)
    label = np.array(data_input.iloc[:, -1])
    return feature, label


def sigmoid(x):
    """Logistic function: map a real-valued score into (0, 1)."""
    neg_exp = np.exp(-x)
    return 1.0 / (1.0 + neg_exp)


def sgd_fm(datamatrix, label, k, iter, alpha):
    """Train a Factorization Machine with stochastic gradient descent.

    Args:
        datamatrix: np.matrix of shape (m, n) with min-max scaled features.
        label: array of -1/+1 labels, length m.
        k: dimensionality of the factorization (latent factors per feature).
        iter: number of passes over the training set.
        alpha: learning rate.

    Returns:
        (w0, w, v): bias scalar, linear weights (n, 1), factor matrix (n, k).
    """
    m, n = np.shape(datamatrix)
    w0 = 0.0
    w = np.zeros((n, 1))
    # NOTE(review): normalvariate() is drawn once, so every entry of v starts
    # at the same value (kept from the original initialisation scheme).
    v = normalvariate(0, 0.2) * np.ones((n, k))
    loss = 0.0  # keeps the per-epoch print safe even if m == 0
    for it in range(iter):
        for i in range(m):
            inner1 = datamatrix[i] * v  # (1, k) row of <x, v_f> terms
            inner2 = np.multiply(datamatrix[i], datamatrix[i]) * np.multiply(v, v)
            # Pairwise interaction term: 0.5 * sum((x.v)^2 - x^2.v^2).
            jiaocha = np.sum((np.multiply(inner1, inner1) - inner2), axis=1) / 2.0
            ypredict = w0 + datamatrix[i] * w + jiaocha
            yp = sigmoid(label[i] * ypredict[0, 0])
            # 1 - logloss of this sample; used only for the progress print.
            loss = 1 - (-(np.log(yp)))
            w0 = w0 - alpha * (yp - 1) * label[i] * 1
            for j in range(n):
                if datamatrix[i, j] != 0:
                    w[j] = w[j] - alpha * (yp - 1) * label[i] * datamatrix[i, j]
                    # BUG FIX: the original wrote `for k in range(k)`, reusing
                    # the factor-dimension parameter as the loop variable --
                    # after the first feature, k was clobbered to k-1 and every
                    # subsequent factor loop shrank. Use a separate index.
                    for f in range(k):
                        v[j, f] = v[j, f] - alpha * ((yp - 1) * label[i] * \
                                  (datamatrix[i, j] * inner1[0, f] - v[j, f] * \
                                  datamatrix[i, j] * datamatrix[i, j]))
        print('第%s次训练的误差为:%f' % (it, loss))
    return w0, w, v


def predict(w0, w, v, x, thold):
    """Classify one sample x as +1/-1 using FM parameters and threshold thold."""
    factor_part = x * v
    square_part = np.multiply(x, x) * np.multiply(v, v)
    # Pairwise interaction term: 0.5 * sum((x.v)^2 - x^2.v^2).
    cross = np.sum((np.multiply(factor_part, factor_part) - square_part), axis=1) / 2.0
    raw_score = w0 + x * w + cross
    prob = sigmoid(raw_score[0, 0])
    return 1 if prob > thold else -1


def calaccuracy(datamatrix, label, w0, w, v, thold):
    """Fraction of rows in datamatrix whose FM prediction matches label."""
    total = np.shape(datamatrix)[0]
    wrong = sum(
        1 for i in range(total)
        if predict(w0, w, v, datamatrix[i], thold) != label[i]
    )
    return 1.0 - wrong / total

# --- Train the FM on the diabetes data and sweep the decision threshold. ---
datamattrain, labeltrain = preprocessing(data_train)
datamattest, labeltest = preprocessing(data_test)
# k=20 latent factors, 300 epochs, learning rate 0.01.
w0, w, v = sgd_fm(datamattrain, labeltrain, 20, 300, 0.01)
maxaccuracy = 0.0
tmpthold = 0.0
# Grid-search thresholds in [0.4, 0.6] for the best test accuracy.
for i in np.linspace(0.4, 0.6, 201):
    print(i)
    accuracy_test = calaccuracy(datamattest, labeltest, w0, w, v, i)
    if accuracy_test > maxaccuracy:
        maxaccuracy = accuracy_test
        tmpthold = i
# BUG FIX: report the best accuracy that was tracked, not accuracy_test
# (the original printed the accuracy of the LAST threshold tried, which
# does not correspond to the tmpthold printed beside it).
print(maxaccuracy, tmpthold)

2. https://www.pianshen.com/article/4561285296/

 

3. https://github.com/jizhihui/fm_python

https://www.cnblogs.com/AndyJee/p/8032553.html

from __future__ import division
from math import exp
import numpy as np
from numpy import *
from random import normalvariate#正态分布
from datetime import datetime

trainData = 'diabetes_train.txt'  # path to the training split
testData = 'diabetes_test.txt'    # path to the test split
featureNum = 8                    # number of feature columns per row
# Per-column max/min of the TRAINING data, filled in by loadTrainDataSet()
# and reused by loadTestDataSet() so both splits share one scaling.
max_list = []
min_list = []

def normalize(x_list, max_list, min_list):
    """Min-max scale each entry of x_list with its column's max/min.

    A constant column (max == min) maps to 1.0; every other value is
    (x - min) / (max - min), rounded to 4 decimal places.
    """
    scalar_list = []
    for idx, x in enumerate(x_list):
        lo = min_list[idx]
        hi = max_list[idx]
        if hi == lo:
            scaled = 1.0
        else:
            scaled = round((x - lo) / (hi - lo), 4)
        scalar_list.append(scaled)
    return scalar_list

def loadTrainDataSet(data):
    """Load the comma-separated training split and min-max scale it.

    Python 2 code (uses xrange). The last column of each row is the 0/1
    label, remapped to -1/+1. Side effect: stores the per-column max/min
    in the module globals so loadTestDataSet() can reuse the same scaling.

    Returns:
        (scalar_dataMat, labelMat): scaled feature rows and -1/+1 labels.
    """
    global max_list
    global min_list
    dataMat = []
    labelMat = []
    
    fr = open(data)  # open the data file (NOTE(review): never closed)
    
    for line in fr.readlines():
        currLine = line.strip().split(',')
        lineArr = []
        
        # First featureNum columns are the features.
        for i in xrange(featureNum):
            lineArr.append(float(currLine[i]))

        dataMat.append(lineArr)
        
        # Map labels {0, 1} -> {-1, +1}.
        labelMat.append(float(currLine[-1]) * 2 - 1)
    
    # Record training max/min per column for train/test-consistent scaling.
    data_array = np.array(dataMat)
    max_list = np.max(data_array,axis=0)
    min_list = np.min(data_array,axis=0)

    scalar_dataMat = []
    for row in dataMat:
        scalar_row = normalize(row,max_list,min_list)
        scalar_dataMat.append(scalar_row)
    return scalar_dataMat, labelMat

def loadTestDataSet(data):
    """Load the test split and scale it with the TRAINING max/min.

    Python 2 code (uses xrange). Relies on loadTrainDataSet() having run
    first to populate the module-level max_list / min_list, so train and
    test share one scaling.

    Returns:
        (scalar_dataMat, labelMat): scaled feature rows and -1/+1 labels.
    """
    global max_list
    global min_list
    dataMat = []
    labelMat = []
    
    fr = open(data)  # open the data file (NOTE(review): never closed)
    
    for line in fr.readlines():
        currLine = line.strip().split(',')
        lineArr = []
        
        for i in xrange(featureNum):
            lineArr.append(float(currLine[i]))

        dataMat.append(lineArr)
        
        # Map labels {0, 1} -> {-1, +1}.
        labelMat.append(float(currLine[-1]) * 2 - 1)
    
    # NOTE(review): data_array is unused here (left over from the train loader).
    data_array = np.array(dataMat)

    scalar_dataMat = []
    for row in dataMat:
        scalar_row = normalize(row,max_list,min_list)
        scalar_dataMat.append(scalar_row)
    return scalar_dataMat, labelMat

def sigmoid(inx):
    """Numerically stable logistic: inputs are clamped to [-15, 15] first,
    so exp() never overflows and the output stays strictly inside (0, 1)."""
    if inx > 15.:
        inx = 15.
    elif inx < -15.:
        inx = -15.
    return 1. / (1. + exp(-inx))

def stocGradAscent(dataMatrix, classLabels, k, iter):
    """Train FM parameters by stochastic gradient descent.

    Python 2 code (xrange, print statements). dataMatrix is an np.mat of
    shape (m, n); classLabels is a list of -1/+1 labels; k is the
    latent-factor dimension; iter the number of epochs.

    Returns:
        (w_0, w, v): bias, linear weights (n, 1), factor matrix (n, k).
    """
    m, n = shape(dataMatrix)
    alpha = 0.01
    # Parameter initialisation.
    # NOTE(review): normalvariate() is drawn once, so every entry of v
    # starts at the same value -- per-entry noise was presumably intended.
    w = zeros((n, 1))  # n is the number of features
    w_0 = 0.
    v = normalvariate(0, 0.2) * ones((n, k))
    
    for it in xrange(iter):
        print it
        for x in xrange(m):  # stochastic update, one sample at a time
            inter_1 = dataMatrix[x] * v
            inter_2 = multiply(dataMatrix[x], dataMatrix[x]) * multiply(v, v)  # element-wise products
            # Pairwise interaction term: 0.5 * sum((x.v)^2 - x^2.v^2).
            interaction = sum(multiply(inter_1, inter_1) - inter_2) / 2.
            
            p = w_0 + dataMatrix[x] * w + interaction  # raw model output
            # Gradient factor of the logistic loss for a -1/+1 label.
            loss = sigmoid(classLabels[x] * p[0, 0]) - 1
        
            w_0 = w_0 - alpha * loss * classLabels[x]
            
            for i in xrange(n):
                if dataMatrix[x, i] != 0:
                    w[i, 0] = w[i, 0] - alpha * loss * classLabels[x] * dataMatrix[x, i]
                    for j in xrange(k):
                        v[i, j] = v[i, j] - alpha * loss * classLabels[x] * (dataMatrix[x, i] * inter_1[0, j] - v[i, j] * dataMatrix[x, i] * dataMatrix[x, i])
        
    
    return w_0, w, v

def getAccuracy(dataMatrix, classLabels, w_0, w, v):
    """Return the ERROR RATE (not accuracy) of the FM on dataMatrix.

    Python 2 code. Callers compute accuracy as 1 - getAccuracy(...).
    A sigmoid score >= 0.5 is predicted as class +1, otherwise -1.
    """
    m, n = shape(dataMatrix)
    allItem = 0
    error = 0
    result = []
    for x in xrange(m):
        allItem += 1
        inter_1 = dataMatrix[x] * v
        inter_2 = multiply(dataMatrix[x], dataMatrix[x]) * multiply(v, v)  # element-wise products
        # Pairwise interaction term, same formula as in training.
        interaction = sum(multiply(inter_1, inter_1) - inter_2) / 2.
        p = w_0 + dataMatrix[x] * w + interaction  # raw model output
        
        pre = sigmoid(p[0, 0])  # predicted probability of class +1
        
        result.append(pre)
        
        # Count a misclassification when the thresholded score disagrees
        # with the -1/+1 label.
        if pre < 0.5 and classLabels[x] == 1.0:
            error += 1
        elif pre >= 0.5 and classLabels[x] == -1.0:
            error += 1
        else:
            continue
        
    
    print result
    
    return float(error) / allItem
        
   
if __name__ == '__main__':
    # Load and scale both splits (test reuses the training max/min).
    dataTrain, labelTrain = loadTrainDataSet(trainData)
    dataTest, labelTest = loadTestDataSet(testData)
    date_startTrain = datetime.now()
    print "开始训练"
    # k=20 latent factors, 200 SGD epochs.
    w_0, w, v = stocGradAscent(mat(dataTrain), labelTrain, 20, 200)
    # getAccuracy returns the error rate, so 1 - error = accuracy.
    print "训练准确性为:%f" % (1 - getAccuracy(mat(dataTrain), labelTrain, w_0, w, v))
    date_endTrain = datetime.now()
    print "训练时间为:%s" % (date_endTrain - date_startTrain)
    print "开始测试"
    print "测试准确性为:%f" % (1 - getAccuracy(mat(dataTest), labelTest, w_0, w, v))

4. https://blog.csdn.net/lieyingkub99/article/details/80897743/

5. https://blog.csdn.net/weixin_45459911/article/details/105953742

# coding:UTF-8

from __future__ import division
from math import exp
from numpy import *
from random import normalvariate  # 正态分布
from datetime import datetime

# Hard-coded Windows paths to the diabetes train/test splits;
# featureNum is the number of feature columns per row.
trainData = 'E://data//diabetes_train.txt'
testData = 'E://data//diabetes_test.txt'
featureNum = 8


def loadDataSet(data):
    """Load a whitespace-separated data file into features and labels.

    Column 0 of each row is the 0/1 class label (remapped to -1/+1);
    columns 1..featureNum are the features.

    Args:
        data: path to the data file.

    Returns:
        (dataMat, labelMat): list of feature lists and list of -1/+1 labels.
    """
    dataMat = []
    labelMat = []

    # BUG FIX: the original opened the file and never closed it; a context
    # manager guarantees the handle is released even on error.
    with open(data) as fr:
        for line in fr:
            # split() with no argument splits on any whitespace run.
            currLine = line.strip().split()

            # Features start at column 1; column 0 is the class label.
            lineArr = [float(currLine[i + 1]) for i in range(featureNum)]
            dataMat.append(lineArr)

            # Map labels {0, 1} -> {-1, +1}.
            labelMat.append(float(currLine[0]) * 2 - 1)
    return dataMat, labelMat


def sigmoid(inx):
    """Logistic activation, clamped so exp() never overflows.

    BUG FIX: math.exp raises OverflowError for arguments beyond ~709, so
    the original crashed on large negative scores. Clamping to [-15, 15]
    (the same guard used by the other FM implementation in this write-up)
    keeps the output indistinguishable from 0/1 without ever raising.
    """
    inx = max(min(inx, 15.0), -15.0)
    return 1.0 / (1 + exp(-inx))


def stocGradAscent(dataMatrix, classLabels, k, iter):
    """Train FM parameters by stochastic gradient descent.

    Args:
        dataMatrix: np.mat of shape (m, n) with scaled features.
        classLabels: list of -1/+1 labels, length m.
        k: latent-factor dimension.
        iter: number of epochs.

    Returns:
        (w_0, w, v): bias, linear weights (n, 1), factor matrix (n, k).
    """
    m, n = shape(dataMatrix)
    alpha = 0.01
    # Parameter initialisation; n is the number of features.
    w = zeros((n, 1))
    w_0 = 0.
    # NOTE(review): normalvariate() is drawn once, so every entry of v
    # starts at the same value.
    v = normalvariate(0, 0.2) * ones((n, k))

    for it in range(iter):
        # BUG FIX: a botched 2-to-3 conversion left a bare `print` statement
        # with `it` stranded on the next line, so nothing was printed.
        print(it)
        for x in range(m):  # stochastic update, one sample at a time
            inter_1 = dataMatrix[x] * v
            inter_2 = multiply(dataMatrix[x], dataMatrix[x]) * multiply(v, v)  # element-wise products
            # Pairwise interaction term: 0.5 * sum((x.v)^2 - x^2.v^2).
            interaction = sum(multiply(inter_1, inter_1) - inter_2) / 2.

            p = w_0 + dataMatrix[x] * w + interaction  # raw model output

            # Gradient factor of the logistic loss for a -1/+1 label.
            loss = sigmoid(classLabels[x] * p[0, 0]) - 1
            # BUG FIX: same stranded-print repair as above.
            print(loss)

            w_0 = w_0 - alpha * loss * classLabels[x]

            for i in range(n):
                if dataMatrix[x, i] != 0:
                    w[i, 0] = w[i, 0] - alpha * loss * classLabels[x] * dataMatrix[x, i]
                    for j in range(k):
                        v[i, j] = v[i, j] - alpha * loss * classLabels[x] * (
                                    dataMatrix[x, i] * inter_1[0, j] - v[i, j] * dataMatrix[x, i] * dataMatrix[x, i])

    return w_0, w, v


def getAccuracy(dataMatrix, classLabels, w_0, w, v):
    """Return the ERROR RATE (not accuracy) of the FM on dataMatrix.

    Callers compute accuracy as 1 - getAccuracy(...). A sigmoid score
    >= 0.5 is predicted as class +1, otherwise -1.
    """
    m, n = shape(dataMatrix)
    allItem = 0
    error = 0
    result = []
    for x in range(m):
        allItem += 1
        inter_1 = dataMatrix[x] * v
        inter_2 = multiply(dataMatrix[x], dataMatrix[x]) * multiply(v, v)  # element-wise products
        # Pairwise interaction term, same formula as in training.
        interaction = sum(multiply(inter_1, inter_1) - inter_2) / 2.
        p = w_0 + dataMatrix[x] * w + interaction  # raw model output

        pre = sigmoid(p[0, 0])  # predicted probability of class +1

        result.append(pre)

        # Count a misclassification when the thresholded score disagrees
        # with the -1/+1 label.
        if pre < 0.5 and classLabels[x] == 1.0:
            error += 1
        elif pre >= 0.5 and classLabels[x] == -1.0:
            error += 1
        else:
            continue

    # BUG FIX: a botched 2-to-3 conversion left a bare `print` with
    # `result` stranded on the next line, printing an empty line instead
    # of the scores.
    print(result)

    return float(error) / allItem


if __name__ == '__main__':
    # Load both splits (column 0 is the label in this file format).
    dataTrain, labelTrain = loadDataSet(trainData)
    dataTest, labelTest = loadDataSet(testData)
    date_startTrain = datetime.now()
    # BUG FIX: every `print` below was a bare statement with its argument
    # stranded on the following line (botched 2-to-3 conversion), so the
    # script produced no output; restore proper print() calls.
    print("开始训练")
    # k=20 latent factors, 200 SGD epochs.
    w_0, w, v = stocGradAscent(mat(dataTrain), labelTrain, 20, 200)
    # getAccuracy returns the error rate, so 1 - error = accuracy.
    print("训练准确性为:%f" % (1 - getAccuracy(mat(dataTrain), labelTrain, w_0, w, v)))
    date_endTrain = datetime.now()
    print("训练时间为:%s" % (date_endTrain - date_startTrain))
    print("开始测试")
    print("测试准确性为:%f" % (1 - getAccuracy(mat(dataTest), labelTest, w_0, w, v)))

 

你可能感兴趣的:(推荐算法)