@数据分析-分类-案例-糖尿病数据集
决策树分类: tree.DecisionTreeClassifier
K近邻分类: neighbors.KNeighborsClassifier
Bernoulli 贝叶斯: naive_bayes.BernoulliNB
Gaussian 贝叶斯: naive_bayes.GaussianNB
多项式贝叶斯: naive_bayes.MultinomialNB
支持向量分类器: svm.SVC
线性支持向量分类器: svm.LinearSVC
逻辑回归: linear_model.LogisticRegression
糖尿病数据集,来源UCI
特征:怀孕次数,血糖,血压,皮脂厚度,胰岛素,BMI身体质量指数,糖尿病遗传函数,年龄。
目标:根据(怀孕次数,血糖,血压,皮脂厚度,胰岛素,BMI身体质量指数,糖尿病遗传函数,年龄),预测是否患糖尿病。
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import pydotplus
import seaborn as sns
from IPython.display import Image
from sklearn import preprocessing
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import BernoulliNB
from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import LinearSVC
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import export_graphviz

try:
    from sklearn.externals.six import StringIO
except ImportError:
    # sklearn.externals.six is gone from recent scikit-learn releases;
    # fall back to the standard-library StringIO so the imports still load.
    from io import StringIO
%matplotlib inline
# Load the diabetes dataset from a local CSV file.
data = pd.read_csv("d:/datasets/diabetes.csv")
# Split by COLUMN, not by row: a single-argument `iloc[...]` indexes rows, so
# `iloc[:-1]` would keep the label column inside X and `iloc[-1]` would return
# one sample row instead of the label vector.
X = data.iloc[:, :-1]  # features: every column except the last
Y = data.iloc[:, -1]  # label: the last column
# Hold out 30% of the samples as the test set.
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.3)
# Build the logistic-regression model and train it in one step
# (`fit` returns the estimator itself).
LR = LogisticRegression(random_state=10).fit(X_train, y_train)
# Predict labels for the held-out test set.
y_pred = LR.predict(X_test)
# Report the test-set accuracy, rounded to four decimal places.
print(
    "输出正确率:",
    round(LR.score(X_test, y_test), 4),
)
# Print the classification report for the test-set predictions.
print(classification_report(y_test, y_pred))
# Use the SimHei font so that Chinese text renders correctly in figures.
sns.set(font='SimHei')
# Draw the confusion matrix of the test-set predictions as an annotated
# heatmap with integer cell counts.
# The tick labels name the diabetes classes; the earlier "acc"/"unacc" labels
# did not describe this dataset's target.
# NOTE(review): assumes label 1 means "diabetic" -- confirm against the dataset.
ax = sns.heatmap(
    confusion_matrix(y_test, y_pred),
    annot=True,
    fmt='d',
    xticklabels=["non-diabetic(0)", "diabetic(1)"],
    yticklabels=["non-diabetic(0)", "diabetic(1)"],
)
ax.set_ylabel('True')
ax.set_xlabel('predict')
ax.set_title('confusion_matrix')
# K-nearest-neighbours: build, train and predict.
# Construct the classifier with default settings and fit it on the training
# split (`fit` returns the estimator itself).
KNC = KNeighborsClassifier().fit(X_train, y_train)
# Predict on the test split; the result is not stored.
KNC.predict(X_test)
# Build and train a DecisionTreeClassifier with default settings.
DTC = DecisionTreeClassifier()
DTC.fit(X_train, y_train)
# Predict on the test split; the result is not stored.
DTC.predict(X_test)
# Export the fitted tree to a Graphviz .dot file.
out_file = 'out_tree.dot'
export_graphviz(
    DTC,
    out_file=out_file,
    # Take the feature names from the training frame; `train_x` was never
    # defined anywhere and raised NameError.
    feature_names=list(X_train.columns),
    rounded=True,
    filled=True,
    # Class names are listed in ascending order of the class labels.
    # NOTE(review): assumes label 1 means "diabetic" -- confirm against the dataset.
    class_names=['non-diabetic', 'diabetic'],
)
# Build the graph from the .dot file and display it as a PNG image.
graph = pydotplus.graph_from_dot_file(out_file)
Image(graph.create_png())
# Build and train the Gaussian naive Bayes model.
GNB = GaussianNB()
GNB.fit(X_train, y_train)
...
# Build and train the Bernoulli naive Bayes model.
# BernoulliNB comes from sklearn.naive_bayes and must be imported at the top
# of the file alongside GaussianNB.
BNB = BernoulliNB()
BNB.fit(X_train, y_train)
...
# Build and train the multinomial naive Bayes model.
# MultinomialNB also comes from sklearn.naive_bayes.
# NOTE(review): MultinomialNB expects non-negative feature values -- confirm
# the dataset contains none before relying on this model.
MNB = MultinomialNB()
MNB.fit(X_train, y_train)
...
# Linear support-vector classifier: construct with default settings and fit
# on the training split (`fit` returns the estimator itself).
LSVC = LinearSVC().fit(X_train, y_train)
...
# Build and train the support-vector classifier.
# The instance is bound to `SVC_model` rather than `SVC`: assigning it to
# `SVC` would overwrite the imported class, so re-running this cell (or any
# later `SVC()` call) would fail with "object is not callable".
SVC_model = SVC()
SVC_model.fit(X_train, y_train)
# Predict on the test split; the result is not stored.
SVC_model.predict(X_test)
...