x_train、y_train:
特征和标签在同一张表里面。确定 x、y 的时候会用到 drop 函数:
对整张表删去 y(target)的那一列,剩下的列都是 x。
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import export_graphviz
from sklearn.model_selection import train_test_split #训练集 ,测试集分类
#特征工程重要三个插件
import eli5
from eli5.sklearn import PermutationImportance
import shap #对比多个/所有特征对模型起到抑制和促进
from pdpbox import pdp, info_plots
# Seed NumPy's global random number generator so that repeated runs draw the
# same random numbers (similar in purpose to passing random_state).
np.random.seed(123)
```python
# Load the heart-disease table; the `target` column is the label.
dt = pd.read_csv('C:/Users/lb/Desktop/test/heart.csv')
dt.head()
dt.info()  # the author observed no missing values here

# Rename the columns to readable names (positional: must match the CSV order).
dt.columns = [
    'age', 'sex', 'chest_pain_type', 'resting_blood_pressure', 'cholesterol',
    'fasting_blood_sugar', 'rest_ecg', 'max_heart_rate_achieved',
    'exercise_induced_angina', 'st_depression', 'st_slope',
    'num_major_vessels', 'thalassemia', 'target',
]

# Integer code -> readable label for every categorical feature.
# Codes that are not listed are left unchanged.
# NOTE(review): chest_pain_type / st_slope / thalassemia are assumed to be
# 1-based here; confirm against the codes actually present in the CSV.
category_labels = {
    'sex': {0: 'female', 1: 'male'},
    # chest pain experienced
    'chest_pain_type': {
        1: 'typical angina',
        2: 'atypical angina',
        3: 'non-anginal pain',
        4: 'asymptomatic',
    },
    # fasting blood sugar threshold
    'fasting_blood_sugar': {
        0: 'lower than 120mg/ml',
        1: 'greater than 120mg/ml',
    },
    # resting electrocardiogram result
    'rest_ecg': {
        0: 'normal',
        1: 'ST-T wave abnormality',
        2: 'left ventricular hypertrophy',
    },
    # exercise-induced angina
    'exercise_induced_angina': {0: 'no', 1: 'yes'},
    # slope of the peak-exercise ST segment
    'st_slope': {1: 'upsloping', 2: 'flat', 3: 'downsloping'},
    # thalassemia (a blood disorder)
    'thalassemia': {
        1: 'normal',
        2: 'fixed defect',
        3: 'reversable defect',
    },
}

# Recode each column in one assignment. The previous chained form
# `dt[col][mask] = value` writes through a temporary object: it triggers
# SettingWithCopyWarning and has no effect under pandas copy-on-write.
# The columns are stored as `object` so get_dummies treats them as categorical.
for column, labels in category_labels.items():
    dt[column] = dt[column].replace(labels).astype('object')

# One-hot encode the object columns. Pass drop_first=True to drop the first
# level of each feature; by default every level gets its own column.
dt = pd.get_dummies(dt)
dt.head()
# Hold out 20% of the rows for testing. `target` is the label; every other
# column is a feature. random_state=10 fixes the shuffle, so the same rows end
# up in the train and test sets on every run.
x_train, x_test, y_train, y_test = train_test_split(dt.drop('target',axis = 1), dt['target'], test_size=0.2,random_state =10)

# Train a random forest whose trees are at most 5 levels deep.
model = RandomForestClassifier(max_depth=5)
model.fit(x_train, y_train)

# Notes on the RandomForestClassifier parameters shown by print(model):
#   bootstrap             - whether trees are built on bootstrap samples
#                           (sampling with replacement); defaults to True.
#   class_weight          - "balanced" sets each class weight to
#                           n_samples / (n_classes * np.bincount(y)).
#   criterion             - split quality measure: "gini" (default) or "entropy".
#   max_depth             - maximum depth of each tree (5 here).
#   max_features          - number of features considered per split:
#                           int   -> that many features,
#                           float -> that fraction of the features,
#                           "sqrt" -> sqrt(n_features),
#                           "log2" -> log2(n_features),
#                           None  -> all features.
#                           (The old "auto" alias for "sqrt" is no longer
#                           accepted by recent scikit-learn releases.)
#   max_leaf_nodes        - maximum number of leaf nodes (None = unlimited).
#   min_impurity_decrease - minimum impurity decrease required to split.
#   n_estimators          - number of trees in the forest.
#   n_jobs                - number of parallel jobs; -1 uses all processors.
#   oob_score             - whether to estimate accuracy on out-of-bag samples.
#   random_state          - seed that makes the fit repeatable across runs.
#   verbose               - verbosity of fitting and predicting.
#   warm_start            - if True, reuse the previous fit and add more trees;
#                           otherwise fit a whole new forest.
print(model)  # show the forest's parameters
# Draw one tree of the fitted forest. Graphviz must be installed separately so
# that the `dot` command is available on PATH.

# A forest has no single tree to draw, so pick one fitted member tree.
estimator = model.estimators_[0]
# Column names of the training matrix, in the order the model was fitted on.
feature_names = x_train.columns.tolist()
# One name per class in ascending order of the target value:
# 0 = no heart disease, 1 = heart disease.
class_names = ['no disease', 'disease']

export_graphviz(estimator, out_file='tree.dot',
                feature_names=feature_names,
                class_names=class_names,
                rounded=True,      # draw nodes as rounded boxes
                proportion=True,   # show values/samples as proportions
                label='root',      # only label the values on the root node
                precision=2,       # decimals shown for impurity/threshold/value
                filled=True)       # colour nodes by their majority class

# Convert the .dot file to PNG with the Graphviz command line tool:
#   dot            - the Graphviz renderer
#   -Tpng          - output format is PNG
#   tree.dot       - input file written by export_graphviz above
#   -o tree.png    - output file name
#   -Gdpi=600      - render at 600 dots per inch
from subprocess import call, check_call
# check_call raises CalledProcessError if `dot` fails, instead of silently
# continuing and failing later when tree.png cannot be opened.
check_call(['dot', '-Tpng', 'tree.dot', '-o', 'tree.png', '-Gdpi=600'])

# Display the image in a Jupyter notebook.
from IPython.display import Image
Image(filename = 'tree.png')
训练完模型之后,可以使用下面三个模型解释工具(eli5、pdpbox、shap)。
# Permutation importance of the fitted forest, computed on the held-out split.
# random_state=1 fixes the shuffling so the result is the same on every run.
# (The split above names the test matrix `x_test`, not `X_test`.)
perm = PermutationImportance(model, random_state=1).fit(x_test, y_test)
# Show the importance weights, labelled with every feature name.
eli5.show_weights(perm, feature_names = x_test.columns.tolist())
# Author's reading of their run: the chest-pain feature in the first row has a
# high weight and matters a lot for the prediction, while non-anginal pain and
# normal thalassemia (middle of the table) show essentially no relationship.
# Partial dependence of the model's prediction on a single feature.
feat_name = 'age'
# Names of all features the model was trained on (the target is already
# excluded from the test matrix).
base_features = x_test.columns.tolist()
pdp_dist = pdp.pdp_isolate(model=model, dataset=x_test, model_features=base_features, feature=feat_name)
pdp.pdp_plot(pdp_dist, feat_name)
plt.show()
# Author's reading of their run: predicted heart-disease risk falls as age
# increases, except for ages 37 to 42, where the risk is higher.
# SHAP summary plot over the held-out rows.
# Author's reading of their run: for chest_pain_type the points go from blue
# to red as the impact grows, i.e. larger values mean a stronger effect.
explainer = shap.TreeExplainer(model)
shap_values = explainer.shap_values(x_test)
# Index 1 selects the SHAP values for class 1 (heart disease).
# NOTE(review): assumes shap_values is indexable per class - confirm for the
# installed shap version.
shap.summary_plot(shap_values[1], x_test)
def heart_disease_risk_factors(model, patient):
    """Return a SHAP force plot explaining the prediction for one patient.

    Args:
        model: Fitted tree-based model, passed to ``shap.TreeExplainer``.
        patient: Feature values of the single patient to explain.

    Returns:
        The object produced by ``shap.force_plot`` for class index 1.
    """
    explainer = shap.TreeExplainer(model)
    # SHAP values of every feature for this patient.
    shap_values = explainer.shap_values(patient)
    # Load the JavaScript needed to render the plot in a notebook.
    shap.initjs()
    return shap.force_plot(explainer.expected_value[1], shap_values[1], patient)
# Take one held-out patient and cast every feature to float.
# Note that iloc[1] is the second row of the test matrix (positions start at 0).
# (The split above names the test matrix `x_test`, not `X_test`.)
data_for_prediction = x_test.iloc[1,:].astype(float)
heart_disease_risk_factors(model, data_for_prediction)
# Author's reading of their run: in red, chest_pain_type = 2 (atypical angina)
# pushes strongly towards "no heart disease"; in blue, num_major_vessels = 1 -
# the fewer major vessels, the stronger the push towards heart disease.