# 1. https://www.jianshu.com/p/610dff83f709
import numpy as np
from random import normalvariate
# from sklearn_pandas import DataFrameMapper
from sklearn.preprocessing import MinMaxScaler as MM
import pandas as pd
# Load the train/test splits of the diabetes data set.
# Comma-separated, no header row; presumably the last column is the 0/1
# label and the preceding columns are features — verify against the files.
data_train = pd.read_csv('diabetes_train.txt', header=None)
data_test = pd.read_csv('diabetes_test.txt', header=None)
def preprocessing(data_input):
    """Min-max scale the feature columns and remap labels 0 -> -1.

    Parameters:
        data_input: DataFrame whose last column is a 0/1 label and whose
            remaining columns are numeric features.

    Returns:
        (feature, label): feature is an np.matrix of scaled features,
        label an np.array of -1/+1 labels.
    """
    standardopt = MM()
    # Fix: `data_input.iloc[:, -1].replace(..., inplace=True)` mutates a
    # temporary object under modern pandas (chained assignment), so the
    # labels could silently stay 0/1.  Assign the result back instead.
    data_input.iloc[:, -1] = data_input.iloc[:, -1].replace(0, -1)
    feature = data_input.iloc[:, :-1]
    # fit_transform returns an ndarray; wrap as np.matrix because the FM
    # training code relies on matrix `*` semantics.
    feature = np.mat(standardopt.fit_transform(feature))
    label = np.array(data_input.iloc[:, -1])
    return feature, label
def sigmoid(x):
    """Numerically safe logistic function 1 / (1 + e^-x).

    The input is clamped to [-35, 35] so np.exp never overflows (the
    original emitted RuntimeWarnings for large negative x); within that
    range the result is unchanged to double precision.
    """
    return 1.0 / (1.0 + np.exp(-np.clip(x, -35.0, 35.0)))
def sgd_fm(datamatrix, label, k, iter, alpha):
    """Train a factorization machine with per-sample SGD.

    Parameters:
        datamatrix: np.matrix of shape (m, n), scaled features.
        label: array of -1/+1 labels, length m.
        k: dimension of the latent factor vectors.
        iter: number of epochs over the data.
        alpha: learning rate.

    Returns:
        (w0, w, v): bias scalar, linear weights (n, 1), factor matrix (n, k).
    """
    m, n = np.shape(datamatrix)
    w0 = 0.0
    w = np.zeros((n, 1))
    # NOTE(review): one normal draw scales a constant matrix, so every entry
    # of v starts identical — kept from the original; a per-entry random
    # init would likely train better.
    v = normalvariate(0, 0.2) * np.ones((n, k))
    for it in range(iter):
        for i in range(m):
            inner1 = datamatrix[i] * v                                   # (1, k)
            inner2 = np.multiply(datamatrix[i], datamatrix[i]) * np.multiply(v, v)
            # Pairwise-interaction term: (sum of squares trick) / 2.
            jiaocha = np.sum((np.multiply(inner1, inner1) - inner2), axis=1) / 2.0
            ypredict = w0 + datamatrix[i] * w + jiaocha
            yp = sigmoid(label[i] * ypredict[0, 0])
            # NOTE(review): kept from the original; this equals 1 + log(yp),
            # not the usual -log(yp) log-loss. It is only printed, never
            # used for updates.
            loss = 1 - (-(np.log(yp)))
            w0 = w0 - alpha * (yp - 1) * label[i] * 1
            for j in range(n):
                if datamatrix[i, j] != 0:
                    w[j] = w[j] - alpha * (yp - 1) * label[i] * datamatrix[i, j]
                    # Fix: the original wrote `for k in range(k)`, shadowing
                    # the factor dimension with the loop index so later
                    # features updated ever-fewer factors.
                    for f in range(k):
                        v[j, f] = v[j, f] - alpha * ((yp - 1) * label[i] *
                                                     (datamatrix[i, j] * inner1[0, f] -
                                                      v[j, f] * datamatrix[i, j] * datamatrix[i, j]))
        print('第%s次训练的误差为:%f' % (it, loss))
    return w0, w, v
def predict(w0, w, v, x, thold):
    """Classify one sample with the trained FM.

    Returns +1 when the predicted probability exceeds `thold`, else -1.
    """
    projected = x * v                                    # (1, k)
    squared = np.multiply(x, x) * np.multiply(v, v)
    cross_term = np.sum(np.multiply(projected, projected) - squared, axis=1) / 2.0
    raw_score = w0 + x * w + cross_term
    probability = sigmoid(raw_score[0, 0])
    return 1 if probability > thold else -1
def calaccuracy(datamatrix, label, w0, w, v, thold):
    """Fraction of samples whose FM prediction matches `label`."""
    total = np.shape(datamatrix)[0]
    wrong = 0
    for row in range(total):
        if predict(w0, w, v, datamatrix[row], thold) != label[row]:
            wrong += 1
    return 1.0 - wrong / total
# Driver: train the FM on the train split, then sweep the decision
# threshold on the test split keeping the most accurate one.
datamattrain, labeltrain = preprocessing(data_train)
datamattest, labeltest = preprocessing(data_test)
# k=20 latent factors, 300 epochs, learning rate 0.01.
w0, w, v = sgd_fm(datamattrain, labeltrain, 20, 300, 0.01)
maxaccuracy = 0.0
tmpthold = 0.0
# Thresholds 0.40 .. 0.60 in steps of 0.001.
for i in np.linspace(0.4, 0.6, 201):
    print(i)
    accuracy_test = calaccuracy(datamattest, labeltest, w0, w, v, i)
    if accuracy_test > maxaccuracy:
        maxaccuracy = accuracy_test
        tmpthold = i
    # Accuracy at this threshold and the best threshold found so far.
    print(accuracy_test, tmpthold)
# 2. https://www.pianshen.com/article/4561285296/
# 3. https://github.com/jizhihui/fm_python
#    (related writeup: https://www.cnblogs.com/AndyJee/p/8032553.html)
from __future__ import division
from math import exp
import numpy as np
from numpy import *
from random import normalvariate#正态分布
from datetime import datetime
trainData = 'diabetes_train.txt'  # comma-separated; 8 feature columns then a 0/1 label
testData = 'diabetes_test.txt'
featureNum = 8  # number of feature columns read from each line
# Per-feature max/min computed on the TRAIN set by loadTrainDataSet and
# reused by loadTestDataSet, so the test set is scaled with train statistics.
max_list = []
min_list = []
def normalize(x_list, max_list, min_list):
    """Min-max scale each value of x_list with its column's max/min.

    A constant column (max == min) is mapped to 1.0; other values are
    scaled into [0, 1] and rounded to 4 decimal places.
    """
    scaled = []
    for value, hi, lo in zip(x_list, max_list, min_list):
        if hi == lo:
            scaled.append(1.0)
        else:
            scaled.append(round((value - lo) / (hi - lo), 4))
    return scaled
def loadTrainDataSet(data):
    """Load and min-max scale the training file.

    Each line: featureNum comma-separated feature values followed by a 0/1
    label, remapped to -1/+1.

    Side effect: stores the per-feature max/min in the module globals
    max_list / min_list so loadTestDataSet can reuse the train statistics.

    Returns:
        (scalar_dataMat, labelMat): list of scaled feature rows, list of labels.
    """
    global max_list
    global min_list
    dataMat = []
    labelMat = []
    # `with` closes the handle (the original leaked it); iterating the file
    # object replaces the Python-2 readlines()/xrange idioms.
    with open(data) as fr:
        for line in fr:
            currLine = line.strip().split(',')
            lineArr = [float(currLine[i]) for i in range(featureNum)]
            dataMat.append(lineArr)
            labelMat.append(float(currLine[-1]) * 2 - 1)  # 0/1 -> -1/+1
    data_array = np.array(dataMat)
    max_list = np.max(data_array, axis=0)
    min_list = np.min(data_array, axis=0)
    scalar_dataMat = [normalize(row, max_list, min_list) for row in dataMat]
    return scalar_dataMat, labelMat
def loadTestDataSet(data):
    """Load the test file and scale it with the TRAIN set's max/min.

    Must be called after loadTrainDataSet, which fills the module globals
    max_list / min_list (only read here, so no `global` statement needed).

    Returns:
        (scalar_dataMat, labelMat): list of scaled feature rows, list of
        labels remapped from 0/1 to -1/+1.
    """
    dataMat = []
    labelMat = []
    # `with` closes the handle (the original leaked it); plain iteration
    # replaces the Python-2 readlines()/xrange idioms.
    with open(data) as fr:
        for line in fr:
            currLine = line.strip().split(',')
            lineArr = [float(currLine[i]) for i in range(featureNum)]
            dataMat.append(lineArr)
            labelMat.append(float(currLine[-1]) * 2 - 1)  # 0/1 -> -1/+1
    # (The original also built an unused np.array copy here; removed.)
    scalar_dataMat = [normalize(row, max_list, min_list) for row in dataMat]
    return scalar_dataMat, labelMat
def sigmoid(inx):
    """Logistic function with the input clamped to [-15, 15] so math.exp
    cannot overflow on extreme scores."""
    clamped = max(min(inx, 15.), -15.)
    return 1. / (1. + exp(-clamped))
def stocGradAscent(dataMatrix, classLabels, k, iter, alpha=0.01):
    """Train a factorization machine with per-sample SGD (Python-3 port of
    the original Python-2 code: xrange -> range, print statement -> call).

    Parameters:
        dataMatrix: np.matrix of shape (m, n), scaled features.
        classLabels: list of -1/+1 labels, length m.
        k: latent factor dimension.
        iter: number of epochs.
        alpha: learning rate (new keyword parameter; defaults to the 0.01
            previously hard-coded, so existing callers are unaffected).

    Returns:
        (w_0, w, v): bias scalar, linear weights (n, 1), factor matrix (n, k).
    """
    m, n = shape(dataMatrix)
    w = zeros((n, 1))
    w_0 = 0.
    # NOTE(review): a single normal draw times ones((n, k)) makes every
    # entry of v identical at start — kept from the original.
    v = normalvariate(0, 0.2) * ones((n, k))
    for it in range(iter):
        print(it)
        for x in range(m):  # one SGD step per sample
            inter_1 = dataMatrix[x] * v                                  # (1, k)
            inter_2 = multiply(dataMatrix[x], dataMatrix[x]) * multiply(v, v)
            # Pairwise-interaction term via the sum-of-squares trick.
            interaction = sum(multiply(inter_1, inter_1) - inter_2) / 2.
            p = w_0 + dataMatrix[x] * w + interaction
            # Gradient multiplier of the logistic loss, sigmoid(y*p) - 1.
            loss = sigmoid(classLabels[x] * p[0, 0]) - 1
            w_0 = w_0 - alpha * loss * classLabels[x]
            for i in range(n):
                if dataMatrix[x, i] != 0:
                    w[i, 0] = w[i, 0] - alpha * loss * classLabels[x] * dataMatrix[x, i]
                    for j in range(k):
                        v[i, j] = v[i, j] - alpha * loss * classLabels[x] * (
                            dataMatrix[x, i] * inter_1[0, j] - v[i, j] * dataMatrix[x, i] * dataMatrix[x, i])
    return w_0, w, v
def getAccuracy(dataMatrix, classLabels, w_0, w, v):
    """Return the ERROR RATE of the trained FM at threshold 0.5
    (Python-3 port: xrange -> range, print statement -> call).

    Parameters:
        dataMatrix: np.matrix of samples; classLabels: -1/+1 labels.
        w_0, w, v: FM parameters as returned by stocGradAscent.
    """
    m, n = shape(dataMatrix)
    allItem = 0
    error = 0
    result = []
    for x in range(m):
        allItem += 1
        inter_1 = dataMatrix[x] * v
        inter_2 = multiply(dataMatrix[x], dataMatrix[x]) * multiply(v, v)
        # Pairwise-interaction term via the sum-of-squares trick.
        interaction = sum(multiply(inter_1, inter_1) - inter_2) / 2.
        p = w_0 + dataMatrix[x] * w + interaction
        pre = sigmoid(p[0, 0])
        result.append(pre)
        # Count a miss when probability and label disagree at 0.5.
        if pre < 0.5 and classLabels[x] == 1.0:
            error += 1
        elif pre >= 0.5 and classLabels[x] == -1.0:
            error += 1
    print(result)
    return float(error) / allItem
if __name__ == '__main__':
    # Python-3 port of the driver: print statements -> print() calls.
    dataTrain, labelTrain = loadTrainDataSet(trainData)
    dataTest, labelTest = loadTestDataSet(testData)
    date_startTrain = datetime.now()
    print("开始训练")
    # 20 latent factors, 200 epochs.
    w_0, w, v = stocGradAscent(mat(dataTrain), labelTrain, 20, 200)
    print("训练准确性为:%f" % (1 - getAccuracy(mat(dataTrain), labelTrain, w_0, w, v)))
    date_endTrain = datetime.now()
    print("训练时间为:%s" % (date_endTrain - date_startTrain))
    print("开始测试")
    print("测试准确性为:%f" % (1 - getAccuracy(mat(dataTest), labelTest, w_0, w, v)))
# 4. https://blog.csdn.net/lieyingkub99/article/details/80897743/
# 5. https://blog.csdn.net/weixin_45459911/article/details/105953742
# coding:UTF-8
from __future__ import division
from math import exp
from numpy import *
from random import normalvariate # 正态分布
from datetime import datetime
trainData = 'E://data//diabetes_train.txt'  # whitespace-separated; column 0 is the 0/1 label
testData = 'E://data//diabetes_test.txt'
featureNum = 8  # number of feature columns (read from columns 1..featureNum)
def loadDataSet(data):
    """Parse a whitespace-separated data file.

    Each line: column 0 is a 0/1 label (remapped to -1/+1), columns
    1..featureNum are the feature values.

    Returns:
        (dataMat, labelMat): list of feature rows, list of labels.
    """
    dataMat = []
    labelMat = []
    # `with` guarantees the handle is closed (the original leaked it).
    with open(data) as fr:
        for line in fr:
            # strip()+split() drops surrounding whitespace and splits on runs
            # of spaces/tabs.
            currLine = line.strip().split()
            lineArr = [float(currLine[i + 1]) for i in range(featureNum)]
            dataMat.append(lineArr)
            labelMat.append(float(currLine[0]) * 2 - 1)  # 0/1 -> -1/+1
    return dataMat, labelMat
def sigmoid(inx):
    """Logistic activation 1 / (1 + e^-x).

    The input is clamped to [-15, 15] (matching the other implementation in
    this file) because math.exp raises OverflowError for inputs below about
    -709; within the clamp range results are unchanged to ~3e-7.
    """
    return 1.0 / (1 + exp(-max(min(inx, 15.0), -15.0)))
def stocGradAscent(dataMatrix, classLabels, k, iter):
    """Train a factorization machine with per-sample SGD.

    Fix: a mechanical 2->3 conversion had split `print it` / `print loss`
    into a bare `print` plus a bare expression, so neither printed anything;
    the intended print() calls are restored.

    Parameters:
        dataMatrix: np.matrix of shape (m, n); classLabels: -1/+1 labels.
        k: latent factor dimension; iter: number of epochs.

    Returns:
        (w_0, w, v): bias scalar, linear weights (n, 1), factor matrix (n, k).
    """
    m, n = shape(dataMatrix)
    alpha = 0.01
    w = zeros((n, 1))
    w_0 = 0.
    # NOTE(review): one normal draw times ones((n, k)) makes every entry of
    # v identical at start — kept from the original.
    v = normalvariate(0, 0.2) * ones((n, k))
    for it in range(iter):
        print(it)
        for x in range(m):  # one SGD step per sample
            inter_1 = dataMatrix[x] * v                                  # (1, k)
            inter_2 = multiply(dataMatrix[x], dataMatrix[x]) * multiply(v, v)
            # Pairwise-interaction term via the sum-of-squares trick.
            interaction = sum(multiply(inter_1, inter_1) - inter_2) / 2.
            p = w_0 + dataMatrix[x] * w + interaction
            # Gradient multiplier of the logistic loss, sigmoid(y*p) - 1.
            loss = sigmoid(classLabels[x] * p[0, 0]) - 1
            print(loss)
            w_0 = w_0 - alpha * loss * classLabels[x]
            for i in range(n):
                if dataMatrix[x, i] != 0:
                    w[i, 0] = w[i, 0] - alpha * loss * classLabels[x] * dataMatrix[x, i]
                    for j in range(k):
                        v[i, j] = v[i, j] - alpha * loss * classLabels[x] * (
                            dataMatrix[x, i] * inter_1[0, j] - v[i, j] * dataMatrix[x, i] * dataMatrix[x, i])
    return w_0, w, v
def getAccuracy(dataMatrix, classLabels, w_0, w, v):
    """Return the ERROR RATE of the trained FM at threshold 0.5.

    Fix: the bare `print` / `result` pair left by a mechanical 2->3
    conversion printed nothing; the intended print(result) is restored.
    """
    m, n = shape(dataMatrix)
    allItem = 0
    error = 0
    result = []
    for x in range(m):
        allItem += 1
        inter_1 = dataMatrix[x] * v
        inter_2 = multiply(dataMatrix[x], dataMatrix[x]) * multiply(v, v)
        # Pairwise-interaction term via the sum-of-squares trick.
        interaction = sum(multiply(inter_1, inter_1) - inter_2) / 2.
        p = w_0 + dataMatrix[x] * w + interaction
        pre = sigmoid(p[0, 0])
        result.append(pre)
        # Count a miss when probability and label disagree at 0.5.
        if pre < 0.5 and classLabels[x] == 1.0:
            error += 1
        elif pre >= 0.5 and classLabels[x] == -1.0:
            error += 1
    print(result)
    return float(error) / allItem
if __name__ == '__main__':
    # Fix: every status message below was a dead bare-`print` plus a bare
    # string expression (broken 2->3 conversion); the print() calls are
    # restored with the original message strings.
    dataTrain, labelTrain = loadDataSet(trainData)
    dataTest, labelTest = loadDataSet(testData)
    date_startTrain = datetime.now()
    print("开始训练")
    # 20 latent factors, 200 epochs.
    w_0, w, v = stocGradAscent(mat(dataTrain), labelTrain, 20, 200)
    print("训练准确性为:%f" % (1 - getAccuracy(mat(dataTrain), labelTrain, w_0, w, v)))
    date_endTrain = datetime.now()
    print("训练时间为:%s" % (date_endTrain - date_startTrain))
    print("开始测试")
    print("测试准确性为:%f" % (1 - getAccuracy(mat(dataTest), labelTest, w_0, w, v)))