用代码(因子分解机,FM)来解决一个二分类问题。
数据集来自 GitHub 上的糖尿病患者数据集,按约 7:3 的比例随机切分为训练集和测试集(每一行以 0.7 的概率划入训练集,其余划入测试集)。
对数据集进行切分的代码如下:
# 引入所需包
import numpy as np
import random
#读取数据函数,输入为数据文件名和训练、测试切分比率,返回为list类型的训练数据集和测试数据集
def loadData(fileName, ratio, trainFile='train.txt', testFile='test.txt', seed=None):
    """Randomly split a comma-separated text file into training and test rows.

    Each non-blank line is stripped, split on ',' and assigned to the
    training set with probability ``ratio`` (otherwise to the test set).
    Both subsets are also written to disk as comma-separated text.

    Args:
        fileName: Path of the input file, one sample per line.
        ratio: Probability in [0, 1] that a row goes to the training set.
        trainFile: Output path for the training rows (default 'train.txt').
        testFile: Output path for the test rows (default 'test.txt').
        seed: Optional seed for a private random generator so the split is
            reproducible. When None the module-level ``random`` state is used.

    Returns:
        (trainingData, testData): two lists of rows; every row is a list of
        the string fields of one input line.

    Note:
        Rows are written with ``np.savetxt``, which converts them to an array
        first; rows with differing field counts are therefore expected to be
        rejected by NumPy -- TODO confirm against the real data file.
    """
    draw = random.random if seed is None else random.Random(seed).random
    trainingData = []
    testData = []
    with open(fileName) as txtData:
        # Iterate the file lazily instead of materialising it with readlines().
        for line in txtData:
            stripped = line.strip()
            if not stripped:
                # A blank line would become the bogus row [''] and make the
                # saved table ragged, so it is skipped.
                continue
            lineData = stripped.split(',')
            if draw() < ratio:
                trainingData.append(lineData)
            else:
                testData.append(lineData)
    np.savetxt(trainFile, trainingData, delimiter=',', fmt='%s')
    np.savetxt(testFile, testData, delimiter=',', fmt='%s')
    return trainingData, testData
# Raw string: '\d' is not a valid escape sequence, so the plain literal
# triggers an invalid-escape warning on modern Python. The value is unchanged.
# NOTE(review): despite its name, this variable points at the diabetes CSV.
iris_file = r'E:\diabetes.csv'
# Probability that a row is assigned to the training set.
ratio = 0.7
# Load the file and split it into training and test samples (also writes
# train.txt / test.txt as a side effect of loadData).
trainingData, testData = loadData(iris_file, ratio)
下面是模型训练与测试的代码,加上了很多注释,应该不难理解:
# coding:UTF-8
from __future__ import division
from math import exp
from numpy import *
from random import normalvariate # 正态分布
from datetime import datetime
# Paths of the training and test files passed to loadDataSet() in the
# __main__ block.
# NOTE(review): presumably these are the train.txt / test.txt files written by
# the splitting script, moved under E://data -- confirm.
trainData = 'E://data//train.txt'
testData = 'E://data//test.txt'
# Number of feature columns loadDataSet() reads from each row (the columns
# following the label in column 0).
featureNum = 8
def loadDataSet(data):
    """Parse a sample file into a feature list and a label list.

    Every non-blank line must hold at least ``featureNum + 1`` numeric fields
    separated by commas and/or whitespace. Field 0 is the label and fields
    1..featureNum are the features.

    Args:
        data: Path of the text file to read.

    Returns:
        (dataMat, labelMat): ``dataMat`` is a list of rows, each a list of
        ``featureNum`` floats; ``labelMat`` is a list of floats computed as
        ``label * 2 - 1`` (so a 0/1 label becomes -1/+1).

    Raises:
        IndexError: a line has fewer than ``featureNum + 1`` fields.
        ValueError: a field cannot be converted to float.

    NOTE(review): the label is taken from column 0 -- confirm that the data
    file really stores the label first.
    """
    dataMat = []
    labelMat = []
    # The context manager closes the file even if parsing raises.
    with open(data) as fr:
        for line in fr:
            # Accept comma-separated rows as well as whitespace-separated ones
            # by normalising commas to spaces before splitting.
            currLine = line.replace(',', ' ').split()
            if not currLine:
                continue  # skip blank lines instead of failing on them
            dataMat.append([float(currLine[i + 1]) for i in range(featureNum)])
            labelMat.append(float(currLine[0]) * 2 - 1)
    return dataMat, labelMat
def sigmoid(inx):
    """Return the logistic function 1 / (1 + e**(-inx)).

    Computed as ``exp(-log(1 + exp(-inx)))`` through ``logaddexp`` so that a
    large negative ``inx`` does not overflow inside ``exp(-inx)``.

    Args:
        inx: A real number (NumPy arrays are handled element-wise).

    Returns:
        The sigmoid of ``inx``, a value in [0, 1].
    """
    # logaddexp(0, -inx) == log(e**0 + e**(-inx)) == log(1 + e**(-inx)),
    # evaluated without forming e**(-inx) directly.
    return exp(-logaddexp(0, -inx))
def stocGradAscent(dataMatrix, classLabels, k, iter):
    """Fit a factorization machine with per-sample stochastic gradient steps.

    The model score for a sample x is
        w_0 + x . w + 0.5 * sum_f ((x . v[:, f])**2 - (x*x) . (v[:, f]**2))
    and each sample updates w_0, w and v using the factor
    ``sigmoid(y * score) - 1`` with a fixed learning rate of 0.01.

    Args:
        dataMatrix: 2-D samples, one row per sample (numpy matrix, ndarray or
            nested list).
        classLabels: Sequence of labels, indexed per sample row.
            NOTE(review): presumably -1/+1 as produced by loadDataSet.
        k: Number of latent factors per feature.
        iter: Number of passes over the data. (The name shadows the builtin;
            it is kept so existing callers keep working.)

    Returns:
        (w_0, w, v): the bias, the linear weights as an (n, 1) array and the
        latent factors as an (n, k) array, n being the number of columns.

    Side effects:
        Prints the pass index once per pass and the loss factor once per
        sample, as the original ``print it`` / ``print loss`` statements did.
    """
    samples = asarray(dataMatrix, dtype=float)
    m, n = samples.shape
    alpha = 0.01  # learning rate

    # Parameter initialisation.
    w = zeros((n, 1))
    w_0 = 0.
    # Draw every latent factor independently. A single draw broadcast over the
    # whole matrix keeps all k columns identical for the entire training run,
    # which makes k > 1 pointless.
    v = array(
        [[normalvariate(0, 0.2) for _ in range(k)] for _ in range(n)],
        dtype=float,
    ).reshape(n, k)

    for it in range(iter):
        print(it)
        for x in range(m):  # stochastic optimisation: one update per sample
            xi = samples[x]
            label = classLabels[x]

            # Pairwise interaction term.
            inter_1 = xi.dot(v)
            inter_2 = (xi * xi).dot(v * v)
            interaction = (inter_1 * inter_1 - inter_2).sum() / 2.

            p = w_0 + xi.dot(w[:, 0]) + interaction  # predicted score
            loss = sigmoid(label * p) - 1
            print(loss)

            step = alpha * loss * label
            w_0 = w_0 - step

            # Only features that are non-zero in this sample are updated.
            active = xi != 0
            xa = xi[active]
            w[active, 0] -= step * xa
            # inter_1 was computed before any update, so every entry of v is
            # updated from its own previous value, exactly as an element-wise
            # double loop over (feature, factor) would do.
            v[active] -= step * (
                xa[:, None] * inter_1 - v[active] * xa[:, None] * xa[:, None])
    return w_0, w, v
def getAccuracy(dataMatrix, classLabels, w_0, w, v):
    """Return the error rate of a factorization machine on a data set.

    Despite its name the function returns the fraction of misclassified
    samples; callers compute the accuracy as ``1 - getAccuracy(...)``.

    A sample counts as an error when the predicted probability is below 0.5
    and its label equals 1.0, or when it is at least 0.5 and its label equals
    -1.0. Any other label value never counts as an error.

    Args:
        dataMatrix: 2-D samples, one row per sample (numpy matrix, ndarray or
            nested list).
        classLabels: Sequence of labels, indexed per sample row.
        w_0: Bias term.
        w: Linear weights, one per feature (an (n, 1) or flat array).
        v: Latent factors, shape (n, k).

    Returns:
        float: errors / number of samples, or 0.0 when there are no samples.

    Side effects:
        Prints the list of predicted probabilities.
    """
    samples = asarray(dataMatrix, dtype=float)
    weights = asarray(w, dtype=float).reshape(-1)
    factors = asarray(v, dtype=float)
    squaredFactors = factors * factors  # loop-invariant

    total = samples.shape[0]
    error = 0
    result = []
    for x in range(total):
        xi = samples[x]
        # Pairwise interaction term.
        inter_1 = xi.dot(factors)
        inter_2 = (xi * xi).dot(squaredFactors)
        interaction = (inter_1 * inter_1 - inter_2).sum() / 2.
        p = w_0 + xi.dot(weights) + interaction  # predicted score
        pre = sigmoid(p)
        result.append(pre)
        label = classLabels[x]
        if (pre < 0.5 and label == 1.0) or (pre >= 0.5 and label == -1.0):
            error += 1
    print(result)
    if total == 0:
        # Avoid dividing by zero on an empty data set.
        return 0.0
    return float(error) / total
if __name__ == '__main__':
    # Train the factorization machine on the training file, then report the
    # accuracy on the training and test files and the training duration.
    dataTrain, labelTrain = loadDataSet(trainData)
    dataTest, labelTest = loadDataSet(testData)
    date_startTrain = datetime.now()  # local system time when training starts
    print("开始训练")
    # asmatrix replaces the `mat` alias, which is no longer available in
    # NumPy 2.0; both produce a numpy matrix.
    w_0, w, v = stocGradAscent(asmatrix(dataTrain), labelTrain, 20, 200)
    # Stop the clock right after training so that the reported time does not
    # include the evaluation on the training set.
    date_endTrain = datetime.now()
    print("训练准确性为:%f" % (1 - getAccuracy(asmatrix(dataTrain), labelTrain, w_0, w, v)))
    print("训练时间为:%s" % (date_endTrain - date_startTrain))
    print("开始测试")
    print("测试准确性为:%f" % (1 - getAccuracy(asmatrix(dataTest), labelTest, w_0, w, v)))