肿瘤预测(决策树)
【实验内容】
基于威斯康星乳腺癌数据集,采用决策树的方法进行肿瘤预测。
【实验要求】
1.加载sklearn自带的威斯康星乳腺癌数据集,探索数据。
2.进行数据集分割。
3.配置决策树模型。
4.训练决策树模型。
5.模型预测。
6.模型评估。
7.参数调优。根据评估结果,为模型设置或调整更优的参数,使预测结果更准确。
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import numpy as np
from sklearn import tree # 导入决策树包
## Load the Wisconsin breast cancer dataset bundled with sklearn and explore it
from sklearn.metrics import accuracy_score  # accuracy evaluation metric

# One fixed seed shared by the split and by every tree, so that accuracy
# differences between parameter settings are repeatable from run to run
# instead of being random noise.
RANDOM_STATE = 42

cancers = load_breast_cancer()
# A bare `cancers` expression shows nothing when run as a script, so print
# the key facts about the dataset explicitly.
print("data.shape:", cancers.data.shape)
print("target.shape:", cancers.target.shape)
print("target_names:", cancers.target_names)

## Split the dataset (30% held out for testing)
x_train, x_test, y_train, y_test = train_test_split(
    cancers.data, cancers.target, test_size=0.30, random_state=RANDOM_STATE)
print("x_train.shape:", x_train.shape)
print("y_train.shape:", y_train.shape)
print("x_test.shape:", x_test.shape)
print("y_test.shape:", y_test.shape)

## Configure the decision tree model (default hyperparameters)
clf = tree.DecisionTreeClassifier(random_state=RANDOM_STATE)

## Train the decision tree model
clf.fit(x_train, y_train)

## Predict on the test set
predictions = clf.predict(x_test)

## Evaluate the model
print('Accuracy:%s' % accuracy_score(y_test, predictions))

## Parameter tuning
### criterion: switch the split criterion to 'entropy'
clf = tree.DecisionTreeClassifier(criterion='entropy',
                                  random_state=RANDOM_STATE)
clf.fit(x_train, y_train)
predictions = clf.predict(x_test)
print('Accuracy:%s' % accuracy_score(y_test, predictions))

### max_depth: additionally cap the tree depth at 2
clf = tree.DecisionTreeClassifier(criterion='entropy', max_depth=2,
                                  random_state=RANDOM_STATE)
clf.fit(x_train, y_train)
predictions = clf.predict(x_test)
print('Accuracy:%s' % accuracy_score(y_test, predictions))
x_train.shape: (398, 30)
y_train.shape: (398,)
x_test.shape: (171, 30)
y_test.shape: (171,)
Accuracy:0.9181286549707602
Accuracy:0.9005847953216374
Accuracy:0.9239766081871345
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import numpy as np
from sklearn import tree
from jupyterthemes import jtplot
# Select the 'monokai' jupyterthemes plotting style
# (presumably so matplotlib figures match the notebook theme — confirm).
jtplot.style(theme='monokai')
加载sklearn自带的威斯康星乳腺癌数据集,探索数据
# Load the breast cancer dataset bundled with sklearn.
cancers = load_breast_cancer()
# Bare expression: as the last line of a notebook cell it displays the
# loaded object (data, target, target_names, ...).
cancers
{'data': array([[1.799e+01, 1.038e+01, 1.228e+02, ..., 2.654e-01, 4.601e-01,
1.189e-01],
[2.057e+01, 1.777e+01, 1.329e+02, ..., 1.860e-01, 2.750e-01,
8.902e-02],
[1.969e+01, 2.125e+01, 1.300e+02, ..., 2.430e-01, 3.613e-01,
8.758e-02],
...,
[1.660e+01, 2.808e+01, 1.083e+02, ..., 1.418e-01, 2.218e-01,
7.820e-02],
[2.060e+01, 2.933e+01, 1.401e+02, ..., 2.650e-01, 4.087e-01,
1.240e-01],
[7.760e+00, 2.454e+01, 4.792e+01, ..., 0.000e+00, 2.871e-01,
7.039e-02]]),
'target': array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0,
0, 0, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0,
1, 1, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 0,
1, 1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 0, 1,
1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 1, 0,
0, 1, 0, 0, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1,
1, 1, 0, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 1,
1, 0, 1, 1, 0, 0, 0, 1, 0, 1, 0, 1, 1, 1, 0, 1, 1, 0, 0, 1, 0, 0,
0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0,
1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 1, 1, 0, 0, 1, 0, 1, 1,
1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 0, 0,
0, 1, 1, 1, 1, 0, 1, 0, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0,
0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0,
1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 0, 0, 1, 1,
1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 0,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 0, 1, 0, 1, 1, 1, 1,
1, 0, 1, 1, 0, 1, 0, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0,
1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1,
1, 1, 1, 0, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 0, 1, 1,
1, 1, 1, 0, 1, 1, 0, 1, 0, 1, 0, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 0, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1]),
'frame': None,
'target_names': array(['malignant', 'benign'], dtype='
进行数据集分割
# Split features and labels into training and test sets, holding out 30%
# of the samples for testing. No random_state is passed, so the split
# differs on every run.
x_train, x_test, y_train, y_test = train_test_split(
cancers.data, cancers.target, test_size=0.30)
# Report the shape of each split as a sanity check.
print("x_train.shape:", x_train.shape)
print("y_train.shape:", y_train.shape)
print("x_test.shape:", x_test.shape)
print("y_test.shape:", y_test.shape)
x_train.shape: (398, 30)
y_train.shape: (398,)
x_test.shape: (171, 30)
y_test.shape: (171,)
配置决策树模型
# Create a decision tree classifier with no hyperparameters overridden.
clf = tree.DecisionTreeClassifier()
训练决策树模型
# Fit the classifier on the training split; the call's return value is
# what the notebook echoes as this cell's output.
clf.fit(x_train, y_train)
DecisionTreeClassifier()
模型预测
# Predict class labels for the held-out test samples.
predictions = clf.predict(x_test)
模型评估
# Import the accuracy evaluation metric.
from sklearn.metrics import accuracy_score
# Compare the predictions against the true test labels and print the score.
print('Accuracy:%s'% accuracy_score(y_test, predictions))
Accuracy:0.9122807017543859
参数调优
criterion
# Tuning step 1: use 'entropy' as the split criterion, then refit on the
# same training split and score on the same test split.
clf = tree.DecisionTreeClassifier(criterion='entropy').fit(x_train, y_train)
predictions = clf.predict(x_test)
print('Accuracy:%s' % accuracy_score(y_test, predictions))
Accuracy:0.9239766081871345
max_depth
# Tuning step 2: keep the 'entropy' criterion and cap the tree depth at 2,
# then refit on the same training split and score on the same test split.
clf = tree.DecisionTreeClassifier(
    criterion='entropy',
    max_depth=2,
).fit(x_train, y_train)
predictions = clf.predict(x_test)
print('Accuracy:%s' % accuracy_score(y_test, predictions))
Accuracy:0.9415204678362573