Class,Alcohol,Malic_acid,Ash,Alcalinity_of_ash,Magnesium,Total_phenols,Flavanoids,Nonflavanoid_phenols,Proanthocyanins,Color_intensity,Hue,OD280/OD315_of_diluted_wines,Proline
1,14.23,1.71,2.43,15.6,127,2.8,3.06,.28,2.29,5.64,1.04,3.92,1065
1,13.2,1.78,2.14,11.2,100,2.65,2.76,.26,1.28,4.38,1.05,3.4,1050
1,13.16,2.36,2.67,18.6,101,2.8,3.24,.3,2.81,5.68,1.03,3.17,1185
1,14.37,1.95,2.5,16.8,113,3.85,3.49,.24,2.18,7.8,.86,3.45,1480
1,13.24,2.59,2.87,21,118,2.8,2.69,.39,1.82,4.32,1.04,2.93,735
1,14.2,1.76,2.45,15.2,112,3.27,3.39,.34,1.97,6.75,1.05,2.85,1450
1,14.39,1.87,2.45,14.6,96,2.5,2.52,.3,1.98,5.25,1.02,3.58,1290
Instead of reading from a file, you could build the DataFrame directly with pandas, or construct the data with numpy via np.array([[1,2,3,…],[…]…]); here I chose to read the dataset from a CSV file.
wine.py
import numpy as np
import pandas as pd
from sklearn.cluster import KMeans
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC

# `sklearn.externals.joblib` was removed in scikit-learn 0.23; prefer the
# standalone `joblib` package and fall back only for very old installs.
try:
    import joblib
except ImportError:
    from sklearn.externals import joblib
'''
@Author :王磊
@Date :2018/12/15
@Description:基于酒数据集K-means聚类模型、K临近算法模型和SVM分类模型进行建模、评估与数据预测
'''
class wine:
    """Modelling on the wine dataset.

    Trains, persists (via joblib), evaluates and predicts with three models:
    K-Means clustering, K-nearest neighbours and a linear-kernel SVM.

    NOTE(review): the lowercase class name is kept for backward compatibility
    with existing callers; PEP 8 would prefer `Wine`.
    """

    # Directory where trained models (and the fitted scaler) are persisted.
    MODEL_DIR = "c:/Users/asus/Desktop/data/python/model/"

    def getCsv(self, path):
        """Read a CSV file from *path*.

        :param path: path of the CSV file
        :return: pandas DataFrame with the file's contents
        """
        return pd.read_csv(path)

    def getTrainData(self):
        """Split the dataset and standardise the feature columns.

        :return: tuple (x_train, x_test, y_train, y_test); feature matrices
            are standardised, targets are the raw class labels.
        """
        wine_csv = self.getCsv("../data/wine.csv")
        # Target values: the wine class (1, 2 or 3).
        y_data = wine_csv['Class']
        # Feature values: drop the class column (axis=1 drops a column).
        x_data = wine_csv.drop("Class", axis=1)
        # 75% of the rows for training, 25% for testing.
        x_train, x_test, y_train, y_test = train_test_split(x_data, y_data, test_size=0.25)
        # Standardise the features so large-valued columns (e.g. Proline)
        # do not dominate the distance-based models.
        sd = StandardScaler()
        x_train = sd.fit_transform(x_train)
        # BUG FIX: the test set must be scaled with the statistics learned on
        # the training set (transform), not re-fitted on itself
        # (fit_transform), otherwise the two sets are scaled inconsistently.
        x_test = sd.transform(x_test)
        # Persist the fitted scaler so prediction-time data can be scaled the
        # same way as the training data (see testByData).
        joblib.dump(sd, self.MODEL_DIR + "scaler.model")
        return x_train, x_test, y_train, y_test

    def k_Means(self):
        """Build, persist and evaluate a K-Means clustering model.

        :return: None
        """
        x_train, x_test, y_train, y_test = self.getTrainData()
        # Three wine classes -> three clusters.
        kmean = KMeans(n_clusters=3)
        # K-Means is unsupervised: fit() ignores any target argument.
        km_model = kmean.fit(x_train)
        # Persist the trained model.
        joblib.dump(km_model, self.MODEL_DIR + "km.model")
        y_predict = kmean.predict(x_test)
        # Cluster ids are 0-based while the class labels are 1-based.
        # NOTE(review): cluster ids are assigned arbitrarily, so cluster i is
        # not guaranteed to correspond to class i+1 — confirm the mapping
        # before comparing these predictions with the true labels.
        y_predict += 1
        # KMeans.score() returns the negative inertia of the samples, NOT a
        # classification accuracy; the printed label below therefore shows an
        # inertia-based score.
        score = kmean.score(x_test)
        print("以下为K-Means算法结果:")
        print("测试集合的预测结果为:" + str(y_predict))
        print("准确率为:" + str(score))

    def KNeighbors(self):
        """Build, persist and evaluate a K-nearest-neighbours classifier.

        :return: None
        """
        x_train, x_test, y_train, y_test = self.getTrainData()
        # Classify each sample by majority vote of its 3 nearest neighbours.
        kn = KNeighborsClassifier(n_neighbors=3)
        kn_model = kn.fit(x_train, y_train)
        # Persist the trained model.
        joblib.dump(kn_model, self.MODEL_DIR + "knn.model")
        y_predict_kn = kn.predict(x_test)
        # Mean accuracy on the held-out test set.
        score_kn = kn.score(x_test, y_test)
        print("以下为K近邻算法结果:")
        print("测试集合的预测结果为:" + str(y_predict_kn))
        print("准确率为:" + str(score_kn))

    def svc_linear(self):
        """Build, persist and evaluate a linear-kernel SVM classifier.

        :return: None
        """
        x_train, x_test, y_train, y_test = self.getTrainData()
        svc = SVC(kernel='linear')
        svc_model = svc.fit(x_train, y_train)
        # Persist the trained model.
        joblib.dump(svc_model, self.MODEL_DIR + "svc.model")
        y_predict_svc = svc.predict(x_test)
        # Mean accuracy on the held-out test set.
        score_svc = svc.score(x_test, y_test)
        print("以下为SVC部分结果:")
        print("测试集合的预测结果为:" + str(y_predict_svc))
        print("准确率为:" + str(score_svc))

    def testByData(self, data, mtype=1):
        """Predict the class of raw feature rows with a persisted model.

        :param data: np.array([[...], ...]) of raw (unscaled) feature rows
        :param mtype: 1 (K-Means model), 2 (KNN model), anything else (SVC)
        :return: None
        """
        try:
            # BUG FIX: reuse the scaler fitted on the training data so the
            # prediction rows are scaled exactly like the training rows.
            sd = joblib.load(self.MODEL_DIR + "scaler.model")
            test_data = sd.transform(data)
        except FileNotFoundError:
            # Fallback (original behaviour) when no scaler was persisted:
            # fit a fresh scaler on the batch itself. This does NOT reproduce
            # the training-time scaling and degrades prediction quality.
            test_data = StandardScaler().fit_transform(data)
        # Select the persisted model file by model type.
        if mtype == 1:
            model_file = "km.model"
        elif mtype == 2:
            model_file = "knn.model"
        else:
            model_file = "svc.model"
        model = joblib.load(self.MODEL_DIR + model_file)
        predict = model.predict(test_data)
        if mtype == 1:
            # K-Means cluster ids are 0-based; class labels are 1-based.
            predict += 1
        print("预测结果为:" + str(predict))
if __name__ == '__main__':
    model = wine()
    # Train, persist and evaluate each of the three models.
    model.k_Means()
    model.KNeighbors()
    model.svc_linear()
    # Hand-built raw feature rows; the trailing comment on each row is the
    # true class from the original dataset.
    samples = np.array([
        [14.23, 1.71, 2.43, 15.6, 127, 2.8, 3.06, .28, 2.29, 5.64, 1.04, 3.92, 1065],  # 1
        [13.2, 1.78, 2.14, 11.2, 100, 2.65, 2.76, .26, 1.28, 4.38, 1.05, 3.4, 1050],  # 1
        [13.16, 2.36, 2.67, 18.6, 101, 2.8, 3.24, .3, 2.81, 5.68, 1.03, 3.17, 1185],  # 1
        [13.49, 1.66, 2.24, 24, 87, 1.88, 1.84, .27, 1.03, 3.74, .98, 2.78, 472],  # 2
        [12.84, 2.96, 2.61, 24, 101, 2.32, .6, .53, .81, 4.92, .89, 2.15, 590],  # 3
        [13.73, 4.36, 2.26, 22.5, 88, 1.28, .47, .52, 1.15, 6.62, .78, 1.75, 520],  # 3
        [12.42, 1.61, 2.19, 22.5, 108, 2, 2.09, .34, 1.61, 2.06, 1.06, 2.96, 345],  # 2
    ])
    # Run every persisted model (1=K-Means, 2=KNN, 3=SVC) on the samples.
    for model_type in (1, 2, 3):
        model.testByData(samples, model_type)