# Add the BMI feature: BMI = weight (kg) / height (m)^2; height is stored in cm.
# NOTE(review): df3 is used here before any visible import or definition, and
# these statements duplicate the BMI step of the scripts below - confirm whether
# this fragment is a stray paste.
df3['BMI'] = df3['体重kg'] / ((df3['身高cm']/100)**2)
# print(3**2)
# Pop the label column '是否得病' and re-insert it at position 14.
d = df3.pop('是否得病')
df3.insert(14,'是否得病', d)
import numpy
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.utils import shuffle
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import plot_confusion_matrix
from sklearn.metrics import confusion_matrix
from sklearn import metrics
# step1: data preprocessing
# Read the two sheets of the workbook: '阴性' (negative) and '阳性' (positive).
path = 'xiaochuan.xlsx'
# Parse inside a `with` block so the workbook's file handle is closed once both
# sheets are loaded; a bare pd.ExcelFile(path) keeps it open for the whole run.
with pd.ExcelFile(path) as xl:
    # print(xl.sheet_names) # ['阴性', '阳性']
    df1 = xl.parse('阴性', index_col=0)
    df2 = xl.parse('阳性', index_col=0)
# Keep only index labels 0..1600 of the negative sheet (label-based, inclusive).
# The author recorded the sheets as 4923 (negative) and 1597 (positive) rows x
# 13 columns; presumably the cut is meant to balance the classes - confirm.
df1 = df1.loc[0:1600, :]
# Append the label column '是否得病' (has disease) as the last column instead of
# at a hard-coded position: 0 for the negative sheet, 1 for the positive sheet.
df1.insert(loc=len(df1.columns), column='是否得病', value=0)
df2.insert(loc=len(df2.columns), column='是否得病', value=1)
# Stack both groups into one frame with a fresh 0..n-1 index.
df3 = pd.concat([df1, df2], axis=0).reset_index(drop=True)
# print(df3)
性别 年龄 身高cm 体重kg ... 外周气道参数B(%) 外周气道参数C(%) 外周气道参数D(%) 是否得病
0 男 40 176.0 67.0 ... 64.661654 105.070423 76.190476 0
1 男 55 151.0 49.0 ... 59.870550 101.844262 83.653846 0
2 男 30 181.0 69.0 ... 64.519906 103.146067 107.100592 0
3 男 25 179.0 75.0 ... 62.237762 104.092072 92.814371 0
4 男 23 171.0 59.0 ... 54.024390 74.943567 48.076923 0
... .. .. ... ... ... ... ... ... ...
6515 女 36 167.0 60.0 ... 96.875000 102.512563 100.000000 1
6516 男 28 183.0 68.0 ... 102.870264 63.942308 82.269504 1
6517 女 36 160.0 55.0 ... 64.957265 58.203125 54.585153 1
6518 女 60 159.0 74.0 ... 100.957854 72.000000 48.529412 1
6519 女 63 156.0 46.0 ... 66.336634 92.553191 87.368421 1
[6520 rows x 14 columns]
# Add the BMI feature: BMI = weight (kg) / height (m)^2; height is stored in cm.
df3['BMI'] = df3['体重kg'] / ((df3['身高cm'] / 100) ** 2)
# Move the label column '是否得病' back to the last position, after BMI.
d = df3.pop('是否得病')
df3.insert(len(df3.columns), '是否得病', d)
# print(df3.head(5))
# Encode gender numerically (男 -> 1, 女 -> 0). The mapping is applied to the
# '性别' column only, in one pass, instead of scanning every column twice.
df3['性别'] = df3['性别'].replace({'男': 1, '女': 0})
# Seed the shuffle: the train/test split further down is seeded, but an
# unseeded shuffle still made the reported accuracies change on every run.
df3 = shuffle(df3, random_state=0)
# print(df3)
# Question 1 only analyses the basic subject attributes plus the label.
# .copy() makes df an independent frame rather than a selection of df3.
df = df3[['性别', '年龄', '身高cm', '体重kg', 'BMI','是否得病']].copy()
# step3: model building (random forest on the question-1 features)
# One-hot encode the categorical columns; the label becomes the two indicator
# columns '是否得病_0' / '是否得病_1'.
dataset = pd.get_dummies(df, columns=['性别', '是否得病'])
# print(dataset.head(5))
label_columns = ['是否得病_0', '是否得病_1']
columns_to_scale = ['年龄', '身高cm', '体重kg', 'BMI']
y = dataset[label_columns]  # target
X = dataset.drop(label_columns, axis=1)  # input features
# Split BEFORE scaling so the scaler statistics come from the training rows
# only; fitting it on the full table leaks test-set information.
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=1)
x_train = x_train.copy()
x_test = x_test.copy()
standardScaler = StandardScaler()
x_train[columns_to_scale] = standardScaler.fit_transform(x_train[columns_to_scale])
x_test[columns_to_scale] = standardScaler.transform(x_test[columns_to_scale])
# Create and train the random forest classifier.
randomforest = RandomForestClassifier(random_state=42, n_estimators=100)
model = randomforest.fit(x_train, y_train)
# Training-set evaluation. Predict once and reuse the result; argmax turns the
# two indicator columns back into a single 0/1 class id for the matrix.
y_train = numpy.array(y_train)
y_train_pred = model.predict(x_train)
xm = confusion_matrix(y_train.argmax(axis=1), y_train_pred.argmax(axis=1))
print(xm)  # the recorded output shows the matrix above the accuracy line
print('训练集准确率:', metrics.accuracy_score(y_train, y_train_pred))
# Test-set evaluation.
y_test = numpy.array(y_test)
y_pred = model.predict(x_test)
cm = confusion_matrix(y_test.argmax(axis=1), y_pred.argmax(axis=1))
print(cm)
print('测试集准确率:', metrics.accuracy_score(y_test, y_pred))
[[1102 16]
[ 17 1102]]
训练集准确率: 0.9852481001341081
[[261 221]
[239 239]]
测试集准确率: 0.5166666666666667
注意:SVM 的数据集的类别分布不能相差过大
import pandas as pd
import numpy as np
from sklearn.utils import shuffle
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn import svm
from sklearn import metrics
# Read the two sheets of the workbook: '阴性' (negative) and '阳性' (positive).
path = 'xiaochuan.xlsx'
# Parse inside a `with` block so the workbook's file handle is closed once both
# sheets are loaded; a bare pd.ExcelFile(path) keeps it open for the whole run.
with pd.ExcelFile(path) as xl:
    # print(xl.sheet_names) # ['阴性', '阳性']
    df1 = xl.parse('阴性', index_col=0)
    df2 = xl.parse('阳性', index_col=0)
# Keep only index labels 0..1600 of the negative sheet (label-based, inclusive).
# Per the author's note the full negative sheet (4923 rows x 13 columns) is so
# much larger than the positive one (1597 rows) that the data is imbalanced.
df1 = df1.loc[0:1600, :]
# print(df1)
# Append the label column '是否得病' (has disease) as the last column instead of
# at a hard-coded position: 0 for the negative sheet, 1 for the positive sheet.
df1.insert(loc=len(df1.columns), column='是否得病', value=0)
df2.insert(loc=len(df2.columns), column='是否得病', value=1)
# Stack both groups into one frame with a fresh 0..n-1 index.
df3 = pd.concat([df1, df2], axis=0).reset_index(drop=True)
# print(df3)
性别 年龄 身高cm 体重kg ... 外周气道参数B(%) 外周气道参数C(%) 外周气道参数D(%) 是否得病
0 男 40 176.0 67.0 ... 64.661654 105.070423 76.190476 0
1 男 55 151.0 49.0 ... 59.870550 101.844262 83.653846 0
2 男 30 181.0 69.0 ... 64.519906 103.146067 107.100592 0
3 男 25 179.0 75.0 ... 62.237762 104.092072 92.814371 0
4 男 23 171.0 59.0 ... 54.024390 74.943567 48.076923 0
... .. .. ... ... ... ... ... ... ...
6515 女 36 167.0 60.0 ... 96.875000 102.512563 100.000000 1
6516 男 28 183.0 68.0 ... 102.870264 63.942308 82.269504 1
6517 女 36 160.0 55.0 ... 64.957265 58.203125 54.585153 1
6518 女 60 159.0 74.0 ... 100.957854 72.000000 48.529412 1
6519 女 63 156.0 46.0 ... 66.336634 92.553191 87.368421 1
[6520 rows x 14 columns]
# Add the BMI feature: BMI = weight (kg) / height (m)^2; height is stored in cm.
df3['BMI'] = df3['体重kg'] / ((df3['身高cm'] / 100) ** 2)
# Move the label column '是否得病' back to the last position, after BMI.
d = df3.pop('是否得病')
df3.insert(len(df3.columns), '是否得病', d)
# print(df3.head(5))
# Encode gender numerically (男 -> 1, 女 -> 0). The mapping is applied to the
# '性别' column only, in one pass, instead of scanning every column twice.
df3['性别'] = df3['性别'].replace({'男': 1, '女': 0})
# Seed the shuffle: the train/test split further down is seeded, but an
# unseeded shuffle still made the reported accuracies change on every run.
df3 = shuffle(df3, random_state=0)
# print(df3)
# Question 1 only analyses the basic subject attributes plus the label.
# .copy() gives an independent frame, so the in-place scaling done later does
# not write into a selection of df3 (SettingWithCopyWarning).
df = df3[['身高cm','年龄', '体重kg', 'BMI','是否得病']].copy()
# step3: model building (SVM on the question-1 features)
# The label stays a single 0/1 column here, so no one-hot encoding is needed.
# Work on a copy so the scaling below never writes into a selection of df3.
dataset = df.copy()
# print(dataset.head(5))
columns_to_scale = ['身高cm','年龄', '体重kg', 'BMI']
# Use a 1-D Series as the target; SVC.fit expects a 1-D label vector and warns
# when handed a one-column DataFrame.
y = dataset['是否得病']  # target
X = dataset.drop(['是否得病'], axis=1)  # input features
# Split BEFORE scaling so the scaler statistics come from the training rows
# only; fitting it on the full table leaks test-set information.
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=1)
x_train = x_train.copy()
x_test = x_test.copy()
standardScaler = StandardScaler()
x_train[columns_to_scale] = standardScaler.fit_transform(x_train[columns_to_scale])
x_test[columns_to_scale] = standardScaler.transform(x_test[columns_to_scale])
# Create and train the SVM classifier.
model = svm.SVC()
model = model.fit(x_train, y_train)
# Accuracy on the training set.
prediction = model.predict(x_train)
print('训练集准确率:', metrics.accuracy_score(y_train, prediction))
# Accuracy on the test set; the test predictions are also saved to CSV.
prediction = model.predict(x_test)
p_svm = pd.DataFrame(prediction)
p_svm.to_csv("test_svm.csv", index=False, sep=',')
print('测试集准确率:', metrics.accuracy_score(y_test, prediction))
import numpy
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.utils import shuffle
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import plot_confusion_matrix
from sklearn.metrics import confusion_matrix
from sklearn import metrics
from sklearn.neural_network import MLPClassifier
# step1: data preprocessing
# Read the two sheets of the workbook: '阴性' (negative) and '阳性' (positive).
path = 'xiaochuan.xlsx'
# Parse inside a `with` block so the workbook's file handle is closed once both
# sheets are loaded; a bare pd.ExcelFile(path) keeps it open for the whole run.
with pd.ExcelFile(path) as xl:
    # print(xl.sheet_names) # ['阴性', '阳性']
    df1 = xl.parse('阴性', index_col=0)
    df2 = xl.parse('阳性', index_col=0)
# Keep only index labels 0..1600 of the negative sheet (label-based, inclusive).
# The author recorded the sheets as 4923 (negative) and 1597 (positive) rows x
# 13 columns; presumably the cut is meant to balance the classes - confirm.
df1 = df1.loc[0:1600, :]
# Append the label column '是否得病' (has disease) as the last column instead of
# at a hard-coded position: 0 for the negative sheet, 1 for the positive sheet.
df1.insert(loc=len(df1.columns), column='是否得病', value=0)
df2.insert(loc=len(df2.columns), column='是否得病', value=1)
# Stack both groups into one frame with a fresh 0..n-1 index.
df3 = pd.concat([df1, df2], axis=0).reset_index(drop=True)
# print(df3)
性别 年龄 身高cm 体重kg ... 外周气道参数B(%) 外周气道参数C(%) 外周气道参数D(%) 是否得病
0 男 40 176.0 67.0 ... 64.661654 105.070423 76.190476 0
1 男 55 151.0 49.0 ... 59.870550 101.844262 83.653846 0
2 男 30 181.0 69.0 ... 64.519906 103.146067 107.100592 0
3 男 25 179.0 75.0 ... 62.237762 104.092072 92.814371 0
4 男 23 171.0 59.0 ... 54.024390 74.943567 48.076923 0
... .. .. ... ... ... ... ... ... ...
6515 女 36 167.0 60.0 ... 96.875000 102.512563 100.000000 1
6516 男 28 183.0 68.0 ... 102.870264 63.942308 82.269504 1
6517 女 36 160.0 55.0 ... 64.957265 58.203125 54.585153 1
6518 女 60 159.0 74.0 ... 100.957854 72.000000 48.529412 1
6519 女 63 156.0 46.0 ... 66.336634 92.553191 87.368421 1
[6520 rows x 14 columns]
# Add the BMI feature: BMI = weight (kg) / height (m)^2; height is stored in cm.
df3['BMI'] = df3['体重kg'] / ((df3['身高cm'] / 100) ** 2)
# Move the label column '是否得病' back to the last position, after BMI.
d = df3.pop('是否得病')
df3.insert(len(df3.columns), '是否得病', d)
# print(df3.head(5))
# Encode gender numerically (男 -> 1, 女 -> 0). The mapping is applied to the
# '性别' column only, in one pass, instead of scanning every column twice.
df3['性别'] = df3['性别'].replace({'男': 1, '女': 0})
# Seed the shuffle: the train/test split further down is seeded, but an
# unseeded shuffle still made the row order differ on every run.
df3 = shuffle(df3, random_state=0)
# print(df3)
# Question 1 only analyses the basic subject attributes plus the label.
# .copy() makes df an independent frame rather than a selection of df3.
df = df3[['性别', '年龄', '身高cm', '体重kg', 'BMI','是否得病']].copy()
# step3: model building (multi-layer perceptron on the question-1 features)
# One-hot encode the categorical columns; the label becomes the two indicator
# columns '是否得病_0' / '是否得病_1'.
dataset = pd.get_dummies(df, columns=['性别', '是否得病'])
# print(dataset.head(5))
label_columns = ['是否得病_0', '是否得病_1']
columns_to_scale = ['年龄', '身高cm', '体重kg', 'BMI']
y = dataset[label_columns]  # target
X = dataset.drop(label_columns, axis=1)  # input features
# Split BEFORE scaling so the scaler statistics come from the training rows
# only; fitting it on the full table leaks test-set information.
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=1)
x_train = x_train.copy()
x_test = x_test.copy()
standardScaler = StandardScaler()
x_train[columns_to_scale] = standardScaler.fit_transform(x_train[columns_to_scale])
x_test[columns_to_scale] = standardScaler.transform(x_test[columns_to_scale])
# Two hidden layers (5 and 4 units), logistic activation, L-BFGS solver.
# NOTE(review): no random_state is passed, so the weight initialisation and
# therefore the printed accuracies can differ between runs.
mlp = MLPClassifier(solver='lbfgs',hidden_layer_sizes=(5,4),activation='logistic',max_iter=5000)
mlp.fit(x_train, y_train)
# Accuracy on the training set, then on the held-out test set.
y_t = mlp.predict(x_train)
print('训练集准确率:', metrics.accuracy_score(y_train, y_t))
y_pred = mlp.predict(x_test)
print('测试集准确率:', metrics.accuracy_score(y_test, y_pred))
import numpy
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.utils import shuffle
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import plot_confusion_matrix
from sklearn.metrics import confusion_matrix
# step1: data preprocessing
# Read the two sheets of the workbook: '阴性' (negative) and '阳性' (positive).
path = 'xiaochuan.xlsx'
# Parse inside a `with` block so the workbook's file handle is closed once both
# sheets are loaded; a bare pd.ExcelFile(path) keeps it open for the whole run.
with pd.ExcelFile(path) as xl:
    # print(xl.sheet_names) # ['阴性', '阳性']
    df1 = xl.parse('阴性', index_col=0)
    df2 = xl.parse('阳性', index_col=0)
# Keep only index labels 0..1600 of the negative sheet (label-based, inclusive).
# The author recorded the sheets as 4923 (negative) and 1597 (positive) rows x
# 13 columns; presumably the cut is meant to balance the classes - confirm.
df1 = df1.loc[0:1600, :]
# Append the label column '是否得病' (has disease) as the last column instead of
# at a hard-coded position: 0 for the negative sheet, 1 for the positive sheet.
df1.insert(loc=len(df1.columns), column='是否得病', value=0)
df2.insert(loc=len(df2.columns), column='是否得病', value=1)
# Stack both groups into one frame with a fresh 0..n-1 index.
df3 = pd.concat([df1, df2], axis=0).reset_index(drop=True)
ColNames_List = df3.columns.tolist()
# print(ColNames_List,type(ColNames_List))
# Column names as recorded by the author (note the trailing space in '潮气量(L) '):
# ['性别', '年龄', '身高cm', '体重kg', '潮气量(L) ',
# '用力肺活量(%)', '中心气道参数(%)', 'FEV1/FVC(%)', '最高呼气流速(%)', '外周气道参数A(%)', '外周气道参数B(%)', '外周气道参数C(%)', '外周气道参数D(%)', '是否得病']
# print(df3)
性别 年龄 身高cm 体重kg ... 外周气道参数B(%) 外周气道参数C(%) 外周气道参数D(%) 是否得病
0 男 40 176.0 67.0 ... 64.661654 105.070423 76.190476 0
1 男 55 151.0 49.0 ... 59.870550 101.844262 83.653846 0
2 男 30 181.0 69.0 ... 64.519906 103.146067 107.100592 0
3 男 25 179.0 75.0 ... 62.237762 104.092072 92.814371 0
4 男 23 171.0 59.0 ... 54.024390 74.943567 48.076923 0
... .. .. ... ... ... ... ... ... ...
6515 女 36 167.0 60.0 ... 96.875000 102.512563 100.000000 1
6516 男 28 183.0 68.0 ... 102.870264 63.942308 82.269504 1
6517 女 36 160.0 55.0 ... 64.957265 58.203125 54.585153 1
6518 女 60 159.0 74.0 ... 100.957854 72.000000 48.529412 1
6519 女 63 156.0 46.0 ... 66.336634 92.553191 87.368421 1
[6520 rows x 14 columns]
# Add the BMI feature: BMI = weight (kg) / height (m)^2; height is stored in cm.
df3['BMI'] = df3['体重kg'] / ((df3['身高cm'] / 100) ** 2)
# Move the label column '是否得病' back to the last position, after BMI.
d = df3.pop('是否得病')
df3.insert(len(df3.columns), '是否得病', d)
# print(df3.head(5))
# Encode gender numerically (男 -> 1, 女 -> 0). The mapping is applied to the
# '性别' column only, in one pass, instead of scanning every column twice.
df3['性别'] = df3['性别'].replace({'男': 1, '女': 0})
# Seed the shuffle: the train/test split further down is seeded, but an
# unseeded shuffle still made the row order differ on every run.
df3 = shuffle(df3, random_state=0)
# print(df3)
# Question 2: keep the nine respiratory measurements plus the label.
# .copy() gives an independent frame, so the missing-value fill done later
# does not act on a selection of df3.
df = df3[['潮气量(L) ','用力肺活量(%)', '中心气道参数(%)', 'FEV1/FVC(%)', '最高呼气流速(%)', '外周气道参数A(%)', '外周气道参数B(%)', '外周气道参数C(%)', '外周气道参数D(%)', '是否得病']].copy()
# Label classes are {0, 1}: negative = 0, positive = 1.
# Missing-value statistics follow: '潮气量(L) ' has a large number of nulls.
(6520, 10)
潮气量(L) 1974
用力肺活量(%) 0
中心气道参数(%) 0
FEV1/FVC(%) 0
最高呼气流速(%) 0
外周气道参数A(%) 0
外周气道参数B(%) 0
外周气道参数C(%) 0
外周气道参数D(%) 0
是否得病 0
dtype: int64
潮气量(L) 用力肺活量(%) ... 外周气道参数D(%) 是否得病
count 4546.000000 6520.000000 ... 6520.000000 6520.000000
mean 1.388097 98.463413 ... 83.906838 0.244939
std 0.558965 11.734124 ... 33.676596 0.430084
min 0.160000 68.711656 ... 0.000000 0.000000
25% 0.990000 89.880350 ... 61.896197 0.000000
50% 1.310000 97.291561 ... 79.493590 0.000000
75% 1.730000 105.834854 ... 101.187574 0.000000
max 4.070000 188.603989 ... 679.464286 1.000000
[8 rows x 10 columns]
Process finished with exit code 0
# Impute the missing tidal-volume values ('潮气量(L) ') with the column mean.
mean_val = df['潮气量(L) '].mean()
# print(mean_val) # 1.3880972283325999
# Rebind df to the filled frame. The former df[col].fillna(..., inplace=True)
# is chained assignment: it acts on a temporary Series and leaves df unchanged
# under pandas Copy-on-Write (and warns on pandas 2.x).
# NOTE(review): the mean is taken over all rows, including those that later
# end up in the test split.
df = df.fillna({'潮气量(L) ': mean_val})
1636 0.650000 106.319703 110.917031 ... 84.887460 62.626263 0
901 1.020000 108.368201 111.500000 ... 91.600000 61.160714 0
478 0.810000 100.000000 99.130435 ... 111.003861 93.534483 0
6145 1.388097 127.960526 124.809160 ... 69.047619 33.913043 1
4401 1.060000 83.266932 91.866029 ... 80.276134 73.660714 0
[5 rows x 10 columns]
潮气量(L) 0
用力肺活量(%) 0
中心气道参数(%) 0
FEV1/FVC(%) 0
最高呼气流速(%) 0
外周气道参数A(%) 0
外周气道参数B(%) 0
外周气道参数C(%) 0
外周气道参数D(%) 0
是否得病 0
dtype: int64
# step3: model building (random forest used to rank feature importance)
# One-hot encode the label into '是否得病_0' / '是否得病_1'.
dataset = pd.get_dummies(df, columns=['是否得病'])
label_columns = ['是否得病_0', '是否得病_1']
columns_to_scale = ['潮气量(L) ','用力肺活量(%)', '中心气道参数(%)', 'FEV1/FVC(%)', '最高呼气流速(%)', '外周气道参数A(%)', '外周气道参数B(%)', '外周气道参数C(%)', '外周气道参数D(%)']
y = dataset[label_columns]  # target
x = dataset.drop(label_columns, axis=1)  # input features
# Split BEFORE scaling so the scaler statistics come from the training rows
# only; fitting it on the full table leaks test-set information.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.3, random_state = 0)
x_train = x_train.copy()
x_test = x_test.copy()
standardScaler = StandardScaler()
x_train[columns_to_scale] = standardScaler.fit_transform(x_train[columns_to_scale])
x_test[columns_to_scale] = standardScaler.transform(x_test[columns_to_scale])
# Feature names taken from the training matrix itself, so they always line up
# with feature_importances_ (previously a hard-coded df.columns[0:9] slice).
feat_labels = x_train.columns
# print(feat_labels)
forest = RandomForestClassifier(n_estimators=10000, random_state=0, n_jobs=-1,max_depth=3)
forest.fit(x_train, y_train)
score = forest.score(x_test, y_test)  # the author recorded score=0.98148
importances = forest.feature_importances_  # importance assigned to each feature
indices = np.argsort(importances)[::-1]  # feature positions, most important first
# Print one ranked line per feature: rank, name padded to 30 chars, importance.
# (The loop body had lost its indentation and did not parse.)
for rank, feature_pos in enumerate(indices, start=1):
    print("%2d) %-*s %f" %
          (rank, 30, feat_labels[feature_pos], importances[feature_pos]))
1) FEV1/FVC(%) 0.573146
2) 外周气道参数A(%) 0.172901
3) 外周气道参数B(%) 0.079313
4) 中心气道参数(%) 0.068985
5) 潮气量(L) 0.037866
6) 用力肺活量(%) 0.033652
7) 最高呼气流速(%) 0.022079
8) 外周气道参数D(%) 0.006881
9) 外周气道参数C(%) 0.005177
import numpy
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.utils import shuffle
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import plot_confusion_matrix
from sklearn.metrics import confusion_matrix
from sklearn import metrics
# step1: data preprocessing
# Read the two sheets of the workbook: '阴性' (negative) and '阳性' (positive).
path = 'xiaochuan.xlsx'
# Parse inside a `with` block so the workbook's file handle is closed once both
# sheets are loaded; a bare pd.ExcelFile(path) keeps it open for the whole run.
with pd.ExcelFile(path) as xl:
    # print(xl.sheet_names) # ['阴性', '阳性']
    df1 = xl.parse('阴性', index_col=0)
    df2 = xl.parse('阳性', index_col=0)
# Keep only index labels 0..1600 of the negative sheet (label-based, inclusive).
# The author recorded the sheets as 4923 (negative) and 1597 (positive) rows x
# 13 columns; presumably the cut is meant to balance the classes - confirm.
df1 = df1.loc[0:1600, :]
# Append the label column '是否得病' (has disease) as the last column instead of
# at a hard-coded position: 0 for the negative sheet, 1 for the positive sheet.
df1.insert(loc=len(df1.columns), column='是否得病', value=0)
df2.insert(loc=len(df2.columns), column='是否得病', value=1)
# Stack both groups into one frame with a fresh 0..n-1 index.
df3 = pd.concat([df1, df2], axis=0).reset_index(drop=True)
ColNames_List = df3.columns.tolist()
# print(ColNames_List,type(ColNames_List))
# Column names as recorded by the author (note the trailing space in '潮气量(L) '):
# ['性别', '年龄', '身高cm', '体重kg', '潮气量(L) ',
# '用力肺活量(%)', '中心气道参数(%)', 'FEV1/FVC(%)', '最高呼气流速(%)', '外周气道参数A(%)', '外周气道参数B(%)', '外周气道参数C(%)', '外周气道参数D(%)', '是否得病']
# print(df3)
性别 年龄 身高cm 体重kg ... 外周气道参数B(%) 外周气道参数C(%) 外周气道参数D(%) 是否得病
0 男 40 176.0 67.0 ... 64.661654 105.070423 76.190476 0
1 男 55 151.0 49.0 ... 59.870550 101.844262 83.653846 0
2 男 30 181.0 69.0 ... 64.519906 103.146067 107.100592 0
3 男 25 179.0 75.0 ... 62.237762 104.092072 92.814371 0
4 男 23 171.0 59.0 ... 54.024390 74.943567 48.076923 0
... .. .. ... ... ... ... ... ... ...
6515 女 36 167.0 60.0 ... 96.875000 102.512563 100.000000 1
6516 男 28 183.0 68.0 ... 102.870264 63.942308 82.269504 1
6517 女 36 160.0 55.0 ... 64.957265 58.203125 54.585153 1
6518 女 60 159.0 74.0 ... 100.957854 72.000000 48.529412 1
6519 女 63 156.0 46.0 ... 66.336634 92.553191 87.368421 1
[6520 rows x 14 columns]
# Add the BMI feature: BMI = weight (kg) / height (m)^2; height is stored in cm.
df3['BMI'] = df3['体重kg'] / ((df3['身高cm'] / 100) ** 2)
# Move the label column '是否得病' back to the last position, after BMI.
d = df3.pop('是否得病')
df3.insert(len(df3.columns), '是否得病', d)
# print(df3.head(5))
# Encode gender numerically (男 -> 1, 女 -> 0). The mapping is applied to the
# '性别' column only, in one pass, instead of scanning every column twice.
df3['性别'] = df3['性别'].replace({'男': 1, '女': 0})
# Seed the shuffle: the train/test split further down is seeded, but an
# unseeded shuffle still made the reported accuracies change on every run.
df3 = shuffle(df3, random_state=0)
# print(df3)
# Question 2: keep the nine respiratory measurements plus the label.
# .copy() gives an independent frame, so the missing-value fill done later
# does not act on a selection of df3.
df = df3[['潮气量(L) ','用力肺活量(%)', '中心气道参数(%)', 'FEV1/FVC(%)', '最高呼气流速(%)', '外周气道参数A(%)', '外周气道参数B(%)', '外周气道参数C(%)', '外周气道参数D(%)', '是否得病']].copy()
# Label classes are {0, 1}: negative = 0, positive = 1.
# Missing-value statistics follow: '潮气量(L) ' has a large number of nulls.
(6520, 10)
潮气量(L) 1974
用力肺活量(%) 0
中心气道参数(%) 0
FEV1/FVC(%) 0
最高呼气流速(%) 0
外周气道参数A(%) 0
外周气道参数B(%) 0
外周气道参数C(%) 0
外周气道参数D(%) 0
是否得病 0
dtype: int64
潮气量(L) 用力肺活量(%) ... 外周气道参数D(%) 是否得病
count 4546.000000 6520.000000 ... 6520.000000 6520.000000
mean 1.388097 98.463413 ... 83.906838 0.244939
std 0.558965 11.734124 ... 33.676596 0.430084
min 0.160000 68.711656 ... 0.000000 0.000000
25% 0.990000 89.880350 ... 61.896197 0.000000
50% 1.310000 97.291561 ... 79.493590 0.000000
75% 1.730000 105.834854 ... 101.187574 0.000000
max 4.070000 188.603989 ... 679.464286 1.000000
[8 rows x 10 columns]
Process finished with exit code 0
# Impute the missing tidal-volume values ('潮气量(L) ') with the column mean.
mean_val = df['潮气量(L) '].mean()
# print(mean_val) # 1.3880972283325999
# Rebind df to the filled frame. The former df[col].fillna(..., inplace=True)
# is chained assignment: it acts on a temporary Series and leaves df unchanged
# under pandas Copy-on-Write (and warns on pandas 2.x).
# NOTE(review): the mean is taken over all rows, including those that later
# end up in the test split.
df = df.fillna({'潮气量(L) ': mean_val})
1636 0.650000 106.319703 110.917031 ... 84.887460 62.626263 0
901 1.020000 108.368201 111.500000 ... 91.600000 61.160714 0
478 0.810000 100.000000 99.130435 ... 111.003861 93.534483 0
6145 1.388097 127.960526 124.809160 ... 69.047619 33.913043 1
4401 1.060000 83.266932 91.866029 ... 80.276134 73.660714 0
[5 rows x 10 columns]
潮气量(L) 0
用力肺活量(%) 0
中心气道参数(%) 0
FEV1/FVC(%) 0
最高呼气流速(%) 0
外周气道参数A(%) 0
外周气道参数B(%) 0
外周气道参数C(%) 0
外周气道参数D(%) 0
是否得病 0
dtype: int64
# step3: model building (random forest on all nine question-2 features)
# One-hot encode the label into '是否得病_0' / '是否得病_1'.
dataset = pd.get_dummies(df, columns=['是否得病'])
label_columns = ['是否得病_0', '是否得病_1']
columns_to_scale = ['潮气量(L) ','用力肺活量(%)', '中心气道参数(%)', 'FEV1/FVC(%)', '最高呼气流速(%)', '外周气道参数A(%)', '外周气道参数B(%)', '外周气道参数C(%)', '外周气道参数D(%)']
y = dataset[label_columns]  # target
x = dataset.drop(label_columns, axis=1)  # input features
# Split BEFORE scaling so the scaler statistics come from the training rows
# only; fitting it on the full table leaks test-set information.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.3, random_state = 0)
x_train = x_train.copy()
x_test = x_test.copy()
standardScaler = StandardScaler()
x_train[columns_to_scale] = standardScaler.fit_transform(x_train[columns_to_scale])
x_test[columns_to_scale] = standardScaler.transform(x_test[columns_to_scale])
# Create and train the random forest classifier.
randomforest = RandomForestClassifier(random_state=42, n_estimators=500)
model = randomforest.fit(x_train, y_train)
# Training-set evaluation. Predict once and reuse the result; argmax turns the
# two indicator columns back into a single 0/1 class id for the matrix.
y_train = numpy.array(y_train)
y_train_pred = model.predict(x_train)
xm = confusion_matrix(y_train.argmax(axis=1), y_train_pred.argmax(axis=1))
print(xm)  # previously computed but never shown
print('训练集准确率:', metrics.accuracy_score(y_train, y_train_pred))
# Test-set evaluation.
y_test = numpy.array(y_test)
y_pred = model.predict(x_test)
cm = confusion_matrix(y_test.argmax(axis=1), y_pred.argmax(axis=1))
print(cm)
print('测试集准确率:', metrics.accuracy_score(y_test, y_pred))
import numpy
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.utils import shuffle
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import plot_confusion_matrix
from sklearn.metrics import confusion_matrix
from sklearn import metrics
# step1: data preprocessing
# Read the two sheets of the workbook: '阴性' (negative) and '阳性' (positive).
path = 'xiaochuan.xlsx'
# Parse inside a `with` block so the workbook's file handle is closed once both
# sheets are loaded; a bare pd.ExcelFile(path) keeps it open for the whole run.
with pd.ExcelFile(path) as xl:
    # print(xl.sheet_names) # ['阴性', '阳性']
    df1 = xl.parse('阴性', index_col=0)
    df2 = xl.parse('阳性', index_col=0)
# Keep only index labels 0..1600 of the negative sheet (label-based, inclusive).
# The author recorded the sheets as 4923 (negative) and 1597 (positive) rows x
# 13 columns; presumably the cut is meant to balance the classes - confirm.
df1 = df1.loc[0:1600, :]
# Append the label column '是否得病' (has disease) as the last column instead of
# at a hard-coded position: 0 for the negative sheet, 1 for the positive sheet.
df1.insert(loc=len(df1.columns), column='是否得病', value=0)
df2.insert(loc=len(df2.columns), column='是否得病', value=1)
# Stack both groups into one frame with a fresh 0..n-1 index.
df3 = pd.concat([df1, df2], axis=0).reset_index(drop=True)
ColNames_List = df3.columns.tolist()
# print(ColNames_List,type(ColNames_List))
# Column names as recorded by the author (note the trailing space in '潮气量(L) '):
# ['性别', '年龄', '身高cm', '体重kg', '潮气量(L) ',
# '用力肺活量(%)', '中心气道参数(%)', 'FEV1/FVC(%)', '最高呼气流速(%)', '外周气道参数A(%)', '外周气道参数B(%)', '外周气道参数C(%)', '外周气道参数D(%)', '是否得病']
# print(df3)
性别 年龄 身高cm 体重kg ... 外周气道参数B(%) 外周气道参数C(%) 外周气道参数D(%) 是否得病
0 男 40 176.0 67.0 ... 64.661654 105.070423 76.190476 0
1 男 55 151.0 49.0 ... 59.870550 101.844262 83.653846 0
2 男 30 181.0 69.0 ... 64.519906 103.146067 107.100592 0
3 男 25 179.0 75.0 ... 62.237762 104.092072 92.814371 0
4 男 23 171.0 59.0 ... 54.024390 74.943567 48.076923 0
... .. .. ... ... ... ... ... ... ...
6515 女 36 167.0 60.0 ... 96.875000 102.512563 100.000000 1
6516 男 28 183.0 68.0 ... 102.870264 63.942308 82.269504 1
6517 女 36 160.0 55.0 ... 64.957265 58.203125 54.585153 1
6518 女 60 159.0 74.0 ... 100.957854 72.000000 48.529412 1
6519 女 63 156.0 46.0 ... 66.336634 92.553191 87.368421 1
[6520 rows x 14 columns]
# Add the BMI feature: BMI = weight (kg) / height (m)^2; height is stored in cm.
df3['BMI'] = df3['体重kg'] / ((df3['身高cm'] / 100) ** 2)
# Move the label column '是否得病' back to the last position, after BMI.
d = df3.pop('是否得病')
df3.insert(len(df3.columns), '是否得病', d)
# print(df3.head(5))
# Encode gender numerically (男 -> 1, 女 -> 0). The mapping is applied to the
# '性别' column only, in one pass, instead of scanning every column twice.
df3['性别'] = df3['性别'].replace({'男': 1, '女': 0})
# Seed the shuffle: the train/test split further down is seeded, but an
# unseeded shuffle still made the reported accuracies change on every run.
df3 = shuffle(df3, random_state=0)
# print(df3)
# Question 2: keep the nine respiratory measurements plus the label.
# .copy() gives an independent frame, so the missing-value fill done later
# does not act on a selection of df3.
df = df3[['潮气量(L) ','用力肺活量(%)', '中心气道参数(%)', 'FEV1/FVC(%)', '最高呼气流速(%)', '外周气道参数A(%)', '外周气道参数B(%)', '外周气道参数C(%)', '外周气道参数D(%)', '是否得病']].copy()
# Label classes are {0, 1}: negative = 0, positive = 1.
# Missing-value statistics follow: '潮气量(L) ' has a large number of nulls.
(6520, 10)
潮气量(L) 1974
用力肺活量(%) 0
中心气道参数(%) 0
FEV1/FVC(%) 0
最高呼气流速(%) 0
外周气道参数A(%) 0
外周气道参数B(%) 0
外周气道参数C(%) 0
外周气道参数D(%) 0
是否得病 0
dtype: int64
潮气量(L) 用力肺活量(%) ... 外周气道参数D(%) 是否得病
count 4546.000000 6520.000000 ... 6520.000000 6520.000000
mean 1.388097 98.463413 ... 83.906838 0.244939
std 0.558965 11.734124 ... 33.676596 0.430084
min 0.160000 68.711656 ... 0.000000 0.000000
25% 0.990000 89.880350 ... 61.896197 0.000000
50% 1.310000 97.291561 ... 79.493590 0.000000
75% 1.730000 105.834854 ... 101.187574 0.000000
max 4.070000 188.603989 ... 679.464286 1.000000
[8 rows x 10 columns]
Process finished with exit code 0
# Impute the missing tidal-volume values ('潮气量(L) ') with the column mean.
mean_val = df['潮气量(L) '].mean()
# print(mean_val) # 1.3880972283325999
# Rebind df to the filled frame. The former df[col].fillna(..., inplace=True)
# is chained assignment: it acts on a temporary Series and leaves df unchanged
# under pandas Copy-on-Write (and warns on pandas 2.x).
# NOTE(review): the mean is taken over all rows, including those that later
# end up in the test split.
df = df.fillna({'潮气量(L) ': mean_val})
1636 0.650000 106.319703 110.917031 ... 84.887460 62.626263 0
901 1.020000 108.368201 111.500000 ... 91.600000 61.160714 0
478 0.810000 100.000000 99.130435 ... 111.003861 93.534483 0
6145 1.388097 127.960526 124.809160 ... 69.047619 33.913043 1
4401 1.060000 83.266932 91.866029 ... 80.276134 73.660714 0
[5 rows x 10 columns]
潮气量(L) 0
用力肺活量(%) 0
中心气道参数(%) 0
FEV1/FVC(%) 0
最高呼气流速(%) 0
外周气道参数A(%) 0
外周气道参数B(%) 0
外周气道参数C(%) 0
外周气道参数D(%) 0
是否得病 0
dtype: int64
# step3: model building
# Feature selection: keep seven predictors plus the label. The list is in the
# same order as the top seven entries of the importance ranking printed by the
# feature-importance script earlier in this file; '外周气道参数C(%)' and
# '外周气道参数D(%)' are dropped.
selected_columns = [
    'FEV1/FVC(%)',
    '外周气道参数A(%)',
    '外周气道参数B(%)',
    '中心气道参数(%)',
    '潮气量(L) ',
    '用力肺活量(%)',
    '最高呼气流速(%)',
    '是否得病',
]
df = df[selected_columns]
FEV1/FVC(%) 外周气道参数A(%) 外周气道参数B(%) ... 用力肺活量(%) 最高呼气流速(%) 是否得病
1491 88.235294 56.373938 91.962617 ... 82.156134 89.898990 0
80 79.794521 45.555556 91.546763 ... 97.333333 92.721519 0
296 82.951654 84.552846 94.329897 ... 85.249458 116.040956 0
2187 87.401575 66.954023 67.620751 ... 82.200647 76.863354 1
1071 86.764706 68.318966 99.506579 ... 97.142857 101.744186 0
# Random forest on the seven selected question-2 features.
# One-hot encode the label into '是否得病_0' / '是否得病_1'.
dataset = pd.get_dummies(df, columns=['是否得病'])
label_columns = ['是否得病_0', '是否得病_1']
columns_to_scale = ['FEV1/FVC(%)','外周气道参数A(%)','外周气道参数B(%)','中心气道参数(%)','潮气量(L) ','用力肺活量(%)','最高呼气流速(%)']
y = dataset[label_columns]  # target
x = dataset.drop(label_columns, axis=1)  # input features
# Split BEFORE scaling so the scaler statistics come from the training rows
# only; fitting it on the full table leaks test-set information.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.3, random_state = 0)
x_train = x_train.copy()
x_test = x_test.copy()
standardScaler = StandardScaler()
x_train[columns_to_scale] = standardScaler.fit_transform(x_train[columns_to_scale])
x_test[columns_to_scale] = standardScaler.transform(x_test[columns_to_scale])
# Create and train the random forest classifier.
randomforest = RandomForestClassifier(random_state=42, n_estimators=500)
model = randomforest.fit(x_train, y_train)
# Training-set evaluation. Predict once and reuse the result; argmax turns the
# two indicator columns back into a single 0/1 class id for the matrix.
y_train = numpy.array(y_train)
y_train_pred = model.predict(x_train)
xm = confusion_matrix(y_train.argmax(axis=1), y_train_pred.argmax(axis=1))
print(xm)  # the recorded output shows the matrix above the accuracy line
print('训练集准确率:', metrics.accuracy_score(y_train, y_train_pred))
# Test-set evaluation.
y_test = numpy.array(y_test)
y_pred = model.predict(x_test)
cm = confusion_matrix(y_test.argmax(axis=1), y_pred.argmax(axis=1))
print(cm)
print('测试集准确率:', metrics.accuracy_score(y_test, y_pred))
[[1119 0]
[ 0 1118]]
训练集准确率: 1.0
[[372 109]
[156 323]]
测试集准确率: 0.7166666666666667
import numpy
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.utils import shuffle
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn import svm
# step1: data preprocessing
# Read the two sheets of the workbook: '阴性' (negative) and '阳性' (positive).
path = 'xiaochuan.xlsx'
# Parse inside a `with` block so the workbook's file handle is closed once both
# sheets are loaded; a bare pd.ExcelFile(path) keeps it open for the whole run.
with pd.ExcelFile(path) as xl:
    # print(xl.sheet_names) # ['阴性', '阳性']
    df1 = xl.parse('阴性', index_col=0)
    df2 = xl.parse('阳性', index_col=0)
# Keep only index labels 0..1600 of the negative sheet (label-based, inclusive).
# The author recorded the sheets as 4923 (negative) and 1597 (positive) rows x
# 13 columns; presumably the cut is meant to balance the classes - confirm.
df1 = df1.loc[0:1600, :]
# Append the label column '是否得病' (has disease) as the last column instead of
# at a hard-coded position: 0 for the negative sheet, 1 for the positive sheet.
df1.insert(loc=len(df1.columns), column='是否得病', value=0)
df2.insert(loc=len(df2.columns), column='是否得病', value=1)
# Stack both groups into one frame with a fresh 0..n-1 index.
df3 = pd.concat([df1, df2], axis=0).reset_index(drop=True)
ColNames_List = df3.columns.tolist()
# print(ColNames_List,type(ColNames_List))
# Column names as recorded by the author (note the trailing space in '潮气量(L) '):
# ['性别', '年龄', '身高cm', '体重kg', '潮气量(L) ',
# '用力肺活量(%)', '中心气道参数(%)', 'FEV1/FVC(%)', '最高呼气流速(%)', '外周气道参数A(%)', '外周气道参数B(%)', '外周气道参数C(%)', '外周气道参数D(%)', '是否得病']
# print(df3)
性别 年龄 身高cm 体重kg ... 外周气道参数B(%) 外周气道参数C(%) 外周气道参数D(%) 是否得病
0 男 40 176.0 67.0 ... 64.661654 105.070423 76.190476 0
1 男 55 151.0 49.0 ... 59.870550 101.844262 83.653846 0
2 男 30 181.0 69.0 ... 64.519906 103.146067 107.100592 0
3 男 25 179.0 75.0 ... 62.237762 104.092072 92.814371 0
4 男 23 171.0 59.0 ... 54.024390 74.943567 48.076923 0
... .. .. ... ... ... ... ... ... ...
6515 女 36 167.0 60.0 ... 96.875000 102.512563 100.000000 1
6516 男 28 183.0 68.0 ... 102.870264 63.942308 82.269504 1
6517 女 36 160.0 55.0 ... 64.957265 58.203125 54.585153 1
6518 女 60 159.0 74.0 ... 100.957854 72.000000 48.529412 1
6519 女 63 156.0 46.0 ... 66.336634 92.553191 87.368421 1
[6520 rows x 14 columns]
# Add the BMI feature: BMI = weight (kg) / height (m)^2; height is stored in cm.
df3['BMI'] = df3['体重kg'] / ((df3['身高cm'] / 100) ** 2)
# Move the label column '是否得病' back to the last position, after BMI.
d = df3.pop('是否得病')
df3.insert(len(df3.columns), '是否得病', d)
# print(df3.head(5))
# Encode gender numerically (男 -> 1, 女 -> 0). The mapping is applied to the
# '性别' column only, in one pass, instead of scanning every column twice.
df3['性别'] = df3['性别'].replace({'男': 1, '女': 0})
# Seed the shuffle: the train/test split further down is seeded, but an
# unseeded shuffle still made the reported accuracies change on every run.
df3 = shuffle(df3, random_state=0)
# print(df3)
# Question 2: keep the nine respiratory measurements plus the label.
# .copy() gives an independent frame, so the missing-value fill done later
# does not act on a selection of df3.
df = df3[['潮气量(L) ','用力肺活量(%)', '中心气道参数(%)', 'FEV1/FVC(%)', '最高呼气流速(%)', '外周气道参数A(%)', '外周气道参数B(%)', '外周气道参数C(%)', '外周气道参数D(%)', '是否得病']].copy()
# Label classes are {0, 1}: negative = 0, positive = 1.
# Missing-value statistics follow: '潮气量(L) ' has a large number of nulls.
(6520, 10)
潮气量(L) 1974
用力肺活量(%) 0
中心气道参数(%) 0
FEV1/FVC(%) 0
最高呼气流速(%) 0
外周气道参数A(%) 0
外周气道参数B(%) 0
外周气道参数C(%) 0
外周气道参数D(%) 0
是否得病 0
dtype: int64
潮气量(L) 用力肺活量(%) ... 外周气道参数D(%) 是否得病
count 4546.000000 6520.000000 ... 6520.000000 6520.000000
mean 1.388097 98.463413 ... 83.906838 0.244939
std 0.558965 11.734124 ... 33.676596 0.430084
min 0.160000 68.711656 ... 0.000000 0.000000
25% 0.990000 89.880350 ... 61.896197 0.000000
50% 1.310000 97.291561 ... 79.493590 0.000000
75% 1.730000 105.834854 ... 101.187574 0.000000
max 4.070000 188.603989 ... 679.464286 1.000000
[8 rows x 10 columns]
Process finished with exit code 0
# Impute the missing tidal-volume values ('潮气量(L) ') with the column mean.
mean_val = df['潮气量(L) '].mean()
# print(mean_val) # 1.3880972283325999
# Rebind df to the filled frame. The former df[col].fillna(..., inplace=True)
# is chained assignment: it acts on a temporary Series and leaves df unchanged
# under pandas Copy-on-Write (and warns on pandas 2.x).
# NOTE(review): the mean is taken over all rows, including those that later
# end up in the test split.
df = df.fillna({'潮气量(L) ': mean_val})
1636 0.650000 106.319703 110.917031 ... 84.887460 62.626263 0
901 1.020000 108.368201 111.500000 ... 91.600000 61.160714 0
478 0.810000 100.000000 99.130435 ... 111.003861 93.534483 0
6145 1.388097 127.960526 124.809160 ... 69.047619 33.913043 1
4401 1.060000 83.266932 91.866029 ... 80.276134 73.660714 0
[5 rows x 10 columns]
潮气量(L) 0
用力肺活量(%) 0
中心气道参数(%) 0
FEV1/FVC(%) 0
最高呼气流速(%) 0
外周气道参数A(%) 0
外周气道参数B(%) 0
外周气道参数C(%) 0
外周气道参数D(%) 0
是否得病 0
dtype: int64
# step3: model building
# Feature selection: keep seven predictors plus the label. The list is in the
# same order as the top seven entries of the importance ranking printed by the
# feature-importance script earlier in this file; '外周气道参数C(%)' and
# '外周气道参数D(%)' are dropped.
selected_columns = [
    'FEV1/FVC(%)',
    '外周气道参数A(%)',
    '外周气道参数B(%)',
    '中心气道参数(%)',
    '潮气量(L) ',
    '用力肺活量(%)',
    '最高呼气流速(%)',
    '是否得病',
]
df = df[selected_columns]
FEV1/FVC(%) 外周气道参数A(%) 外周气道参数B(%) ... 用力肺活量(%) 最高呼气流速(%) 是否得病
1491 88.235294 56.373938 91.962617 ... 82.156134 89.898990 0
80 79.794521 45.555556 91.546763 ... 97.333333 92.721519 0
296 82.951654 84.552846 94.329897 ... 85.249458 116.040956 0
2187 87.401575 66.954023 67.620751 ... 82.200647 76.863354 1
1071 86.764706 68.318966 99.506579 ... 97.142857 101.744186 0
# SVM on the seven selected question-2 features.
# The label stays a single 0/1 column here, so no one-hot encoding is needed.
# Work on a copy so the scaling below never writes into a selection of another
# frame (the former `dataset = df` alias triggered SettingWithCopyWarning).
dataset = df.copy()
columns_to_scale = ['FEV1/FVC(%)','外周气道参数A(%)','外周气道参数B(%)','中心气道参数(%)','潮气量(L) ','用力肺活量(%)','最高呼气流速(%)']
# Use a 1-D Series as the target; SVC.fit expects a 1-D label vector and warns
# when handed a one-column DataFrame.
y = dataset['是否得病']  # target
x = dataset.drop(['是否得病'], axis=1)  # input features
# Split BEFORE scaling so the scaler statistics come from the training rows
# only; fitting it on the full table leaks test-set information.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.3, random_state = 0)
x_train = x_train.copy()
x_test = x_test.copy()
standardScaler = StandardScaler()
x_train[columns_to_scale] = standardScaler.fit_transform(x_train[columns_to_scale])
x_test[columns_to_scale] = standardScaler.transform(x_test[columns_to_scale])
# Create and train the SVM classifier.
model = svm.SVC()
model = model.fit(x_train, y_train)
# Accuracy on the training set.
prediction = model.predict(x_train)
print('训练集准确率:', metrics.accuracy_score(y_train, prediction))
# Accuracy on the test set; the test predictions are also saved to CSV.
prediction = model.predict(x_test)
p_svm = pd.DataFrame(prediction)
p_svm.to_csv("test_svm.csv", index=False, sep=',')
print('测试集准确率:', metrics.accuracy_score(y_test, prediction))
import numpy
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.utils import shuffle
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import plot_confusion_matrix
from sklearn.metrics import confusion_matrix
from sklearn import metrics
from sklearn.neural_network import MLPClassifier
# Step 1: data preprocessing
# Open the workbook; per the original notes its sheets are ['阴性', '阳性']
# (negative / positive cases).
path = 'xiaochuan.xlsx'
xl = pd.ExcelFile(path)
# Negative sheet (noted as 4923 rows x 13 columns in full): keep only rows whose
# index label lies in 0..1600 (.loc slicing is label-based and inclusive).
df1 = xl.parse('阴性', index_col=0).loc[0:1600, :]
# Positive sheet (noted as 1597 rows x 13 columns) is used in full.
df2 = xl.parse('阳性', index_col=0)
# Insert the label at column position 13: 0 = negative, 1 = positive.
df1.insert(loc=13, column='是否得病', value=0)
df2.insert(loc=13, column='是否得病', value=1)
# Stack negatives on top of positives and renumber the rows from 0.
df3 = pd.concat([df1, df2], axis=0, ignore_index=True)
ColNames_List = df3.columns.tolist()
# Column order noted by the author:
# ['性别', '年龄', '身高cm', '体重kg', '潮气量(L) ',
#  '用力肺活量(%)', '中心气道参数(%)', 'FEV1/FVC(%)', '最高呼气流速(%)', '外周气道参数A(%)',
#  '外周气道参数B(%)', '外周气道参数C(%)', '外周气道参数D(%)', '是否得病']
# print(df3)
性别 年龄 身高cm 体重kg ... 外周气道参数B(%) 外周气道参数C(%) 外周气道参数D(%) 是否得病
0 男 40 176.0 67.0 ... 64.661654 105.070423 76.190476 0
1 男 55 151.0 49.0 ... 59.870550 101.844262 83.653846 0
2 男 30 181.0 69.0 ... 64.519906 103.146067 107.100592 0
3 男 25 179.0 75.0 ... 62.237762 104.092072 92.814371 0
4 男 23 171.0 59.0 ... 54.024390 74.943567 48.076923 0
... .. .. ... ... ... ... ... ... ...
6515 女 36 167.0 60.0 ... 96.875000 102.512563 100.000000 1
6516 男 28 183.0 68.0 ... 102.870264 63.942308 82.269504 1
6517 女 36 160.0 55.0 ... 64.957265 58.203125 54.585153 1
6518 女 60 159.0 74.0 ... 100.957854 72.000000 48.529412 1
6519 女 63 156.0 46.0 ... 66.336634 92.553191 87.368421 1
[6520 rows x 14 columns]
# Add BMI = weight (kg) / height (m)^2; height is stored in centimetres.
height_m = df3['身高cm'] / 100
df3['BMI'] = df3['体重kg'] / (height_m ** 2)
# Move the label to position 14 so it follows the new BMI column.
d = df3.pop('是否得病')
df3.insert(14, '是否得病', d)
# print(df3.head(5))
# Encode gender numerically (applied frame-wide): male -> 1, female -> 0.
for gender_text, gender_code in (('男', 1), ('女', 0)):
    df3.replace(gender_text, gender_code, inplace=True)
# Randomise the row order.
df3 = shuffle(df3)
# print(df3)
# Question 2: keep the nine examination parameters plus the label.
question2_columns = [
    '潮气量(L) ', '用力肺活量(%)', '中心气道参数(%)', 'FEV1/FVC(%)', '最高呼气流速(%)',
    '外周气道参数A(%)', '外周气道参数B(%)', '外周气道参数C(%)', '外周气道参数D(%)', '是否得病',
]
df = df3[question2_columns]
# Label classes are {0, 1}: 0 = negative, 1 = positive.
set(df['是否得病'])
# Missing-value count:
# the tidal-volume column has a large number of nulls.
(6520, 10)
潮气量(L) 1974
用力肺活量(%) 0
中心气道参数(%) 0
FEV1/FVC(%) 0
最高呼气流速(%) 0
外周气道参数A(%) 0
外周气道参数B(%) 0
外周气道参数C(%) 0
外周气道参数D(%) 0
是否得病 0
dtype: int64
潮气量(L) 用力肺活量(%) ... 外周气道参数D(%) 是否得病
count 4546.000000 6520.000000 ... 6520.000000 6520.000000
mean 1.388097 98.463413 ... 83.906838 0.244939
std 0.558965 11.734124 ... 33.676596 0.430084
min 0.160000 68.711656 ... 0.000000 0.000000
25% 0.990000 89.880350 ... 61.896197 0.000000
50% 1.310000 97.291561 ... 79.493590 0.000000
75% 1.730000 105.834854 ... 101.187574 0.000000
max 4.070000 188.603989 ... 679.464286 1.000000
[8 rows x 10 columns]
Process finished with exit code 0
# Impute missing tidal-volume values with the column mean.
# `df` is a column selection of df3, so take an explicit copy first and assign
# the filled column back: the chained `df[col].fillna(..., inplace=True)` form
# acts on a temporary and is a no-op under pandas copy-on-write.
df = df.copy()
mean_val = df['潮气量(L) '].mean()
# print(mean_val)  # recorded value: 1.3880972283325999
df['潮气量(L) '] = df['潮气量(L) '].fillna(mean_val)
1636 0.650000 106.319703 110.917031 ... 84.887460 62.626263 0
901 1.020000 108.368201 111.500000 ... 91.600000 61.160714 0
478 0.810000 100.000000 99.130435 ... 111.003861 93.534483 0
6145 1.388097 127.960526 124.809160 ... 69.047619 33.913043 1
4401 1.060000 83.266932 91.866029 ... 80.276134 73.660714 0
[5 rows x 10 columns]
潮气量(L) 0
用力肺活量(%) 0
中心气道参数(%) 0
FEV1/FVC(%) 0
最高呼气流速(%) 0
外周气道参数A(%) 0
外周气道参数B(%) 0
外周气道参数C(%) 0
外周气道参数D(%) 0
是否得病 0
dtype: int64
# Step 3: model building
# Feature selection: keep seven examination features plus the label column.
selected_columns = [
    'FEV1/FVC(%)',
    '外周气道参数A(%)',
    '外周气道参数B(%)',
    '中心气道参数(%)',
    '潮气量(L) ',
    '用力肺活量(%)',
    '最高呼气流速(%)',
    '是否得病',
]
df = df[selected_columns]
FEV1/FVC(%) 外周气道参数A(%) 外周气道参数B(%) ... 用力肺活量(%) 最高呼气流速(%) 是否得病
1491 88.235294 56.373938 91.962617 ... 82.156134 89.898990 0
80 79.794521 45.555556 91.546763 ... 97.333333 92.721519 0
296 82.951654 84.552846 94.329897 ... 85.249458 116.040956 0
2187 87.401575 66.954023 67.620751 ... 82.200647 76.863354 1
1071 86.764706 68.318966 99.506579 ... 97.142857 101.744186 0
# Train and evaluate an MLP classifier on the selected features.
# One-hot encode the label into the indicator columns 是否得病_0 / 是否得病_1.
dataset = pd.get_dummies(df, columns=['是否得病'])
columns_to_scale = ['FEV1/FVC(%)','外周气道参数A(%)','外周气道参数B(%)','中心气道参数(%)','潮气量(L) ','用力肺活量(%)','最高呼气流速(%)']
# Target (two indicator columns) and input features.
y = dataset[['是否得病_0','是否得病_1']]
x = dataset.drop(['是否得病_0','是否得病_1'], axis=1)
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=0)
# Fit the scaler on the training split only and reuse its statistics for the
# test split; fitting on all rows leaks test-set mean/variance into training.
standardScaler = StandardScaler()
x_train = x_train.copy()
x_test = x_test.copy()
x_train[columns_to_scale] = standardScaler.fit_transform(x_train[columns_to_scale])
x_test[columns_to_scale] = standardScaler.transform(x_test[columns_to_scale])
mlp = MLPClassifier(solver='lbfgs', hidden_layer_sizes=(5, 4), activation='logistic', max_iter=5000)
mlp.fit(x_train, y_train)
# accuracy_score takes (y_true, y_pred).
y_t = mlp.predict(x_train)
print('训练集准确率:', metrics.accuracy_score(y_train, y_t))
y_pred = mlp.predict(x_test)
print('测试集准确率:', metrics.accuracy_score(y_test, y_pred))
import numpy
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.utils import shuffle
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import plot_confusion_matrix
from sklearn.metrics import confusion_matrix
from sklearn import metrics
from sklearn import svm
# Step 1: data preprocessing
# Open the workbook; per the original notes its sheets are ['阴性', '阳性']
# (negative / positive cases).
path = 'xiaochuan.xlsx'
xl = pd.ExcelFile(path)
# Negative sheet (noted as 4923 rows x 13 columns in full): keep only rows whose
# index label lies in 0..1600 (.loc slicing is label-based and inclusive).
df1 = xl.parse('阴性', index_col=0).loc[0:1600, :]
# Positive sheet (noted as 1597 rows x 13 columns) is used in full.
df2 = xl.parse('阳性', index_col=0)
# Insert the label at column position 13: 0 = negative, 1 = positive.
df1.insert(loc=13, column='是否得病', value=0)
df2.insert(loc=13, column='是否得病', value=1)
# Stack negatives on top of positives and renumber the rows from 0.
df3 = pd.concat([df1, df2], axis=0, ignore_index=True)
ColNames_List = df3.columns.tolist()
# Column order noted by the author:
# ['性别', '年龄', '身高cm', '体重kg', '潮气量(L) ',
#  '用力肺活量(%)', '中心气道参数(%)', 'FEV1/FVC(%)', '最高呼气流速(%)', '外周气道参数A(%)',
#  '外周气道参数B(%)', '外周气道参数C(%)', '外周气道参数D(%)', '是否得病']
# print(df3)
性别 年龄 身高cm 体重kg ... 外周气道参数B(%) 外周气道参数C(%) 外周气道参数D(%) 是否得病
0 男 40 176.0 67.0 ... 64.661654 105.070423 76.190476 0
1 男 55 151.0 49.0 ... 59.870550 101.844262 83.653846 0
2 男 30 181.0 69.0 ... 64.519906 103.146067 107.100592 0
3 男 25 179.0 75.0 ... 62.237762 104.092072 92.814371 0
4 男 23 171.0 59.0 ... 54.024390 74.943567 48.076923 0
... .. .. ... ... ... ... ... ... ...
6515 女 36 167.0 60.0 ... 96.875000 102.512563 100.000000 1
6516 男 28 183.0 68.0 ... 102.870264 63.942308 82.269504 1
6517 女 36 160.0 55.0 ... 64.957265 58.203125 54.585153 1
6518 女 60 159.0 74.0 ... 100.957854 72.000000 48.529412 1
6519 女 63 156.0 46.0 ... 66.336634 92.553191 87.368421 1
[6520 rows x 14 columns]
# Add BMI = weight (kg) / height (m)^2; height is stored in centimetres.
df3['BMI'] = df3['体重kg'] / ((df3['身高cm'] / 100) ** 2)
# Move the label to position 14 so it follows the new BMI column.
d = df3.pop('是否得病')
df3.insert(14, '是否得病', d)
# print(df3.head(5))
# Encode gender numerically (applied frame-wide): male -> 1, female -> 0.
df3.replace('男', 1, inplace=True)
df3.replace('女', 0, inplace=True)
# Randomise the row order.
df3 = shuffle(df3)
# print(df3)
# Question 3 (note: metadata = age, weight, height, BMI; first 7 examination
# parameters). This variant keeps every column; the drop is disabled:
# drop_features = ['性别','外周气道参数C(%)','外周气道参数C(%)']
# df = df3.drop(drop_features, axis=1)
df = df3
# Impute missing tidal-volume values with the column mean.
mean_val = df['潮气量(L) '].mean()
# print(mean_val)  # recorded value: 1.3880972283325999
# Assign the filled column back: the chained `df[col].fillna(..., inplace=True)`
# form acts on a temporary and is a no-op under pandas copy-on-write.
df['潮气量(L) '] = df['潮气量(L) '].fillna(mean_val)
Index(['性别', '年龄', '身高cm', '体重kg', '潮气量(L) ', '用力肺活量(%)', '中心气道参数(%)',
'FEV1/FVC(%)', '最高呼气流速(%)', '外周气道参数A(%)', '外周气道参数B(%)', '外周气道参数C(%)',
'外周气道参数D(%)', 'BMI', '是否得病'],
# Step 3: model building (random forest on all features)
# One-hot encode the label into the indicator columns 是否得病_0 / 是否得病_1.
dataset = pd.get_dummies(df, columns=['是否得病'])
columns_to_scale = ['性别', '年龄', '身高cm', '体重kg', '潮气量(L) ', '用力肺活量(%)', '中心气道参数(%)',
                    'FEV1/FVC(%)', '最高呼气流速(%)', '外周气道参数A(%)', '外周气道参数B(%)', '外周气道参数C(%)',
                    '外周气道参数D(%)', 'BMI']
# Target (two indicator columns) and input features.
y = dataset[['是否得病_0','是否得病_1']]
x = dataset.drop(['是否得病_0','是否得病_1'], axis=1)
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=0)
# Fit the scaler on the training split only and reuse its statistics for the
# test split; fitting on all rows leaks test-set mean/variance into training.
standardScaler = StandardScaler()
x_train = x_train.copy()
x_test = x_test.copy()
x_train[columns_to_scale] = standardScaler.fit_transform(x_train[columns_to_scale])
x_test[columns_to_scale] = standardScaler.transform(x_test[columns_to_scale])
# Create the random-forest classifier and train it on the training split.
randomforest = RandomForestClassifier(random_state=42, n_estimators=500)
model = randomforest.fit(x_train, y_train)
# Training-set metrics; predict once and reuse the result.
y_train = numpy.array(y_train)
train_pred = model.predict(x_train)
# argmax turns the one-hot rows back into class indices for the confusion matrix.
xm = confusion_matrix(y_train.argmax(axis=1), train_pred.argmax(axis=1))
# The recorded run output shows this matrix, so print it instead of discarding it.
print(xm)
print('训练集准确率:', metrics.accuracy_score(y_train, train_pred))
# Test-set metrics.
y_test = numpy.array(y_test)
y_pred = model.predict(x_test)
cm = confusion_matrix(y_test.argmax(axis=1), y_pred.argmax(axis=1))
print(cm)
print('测试集准确率:', metrics.accuracy_score(y_test, y_pred))
[5 rows x 16 columns]
[[1130 0]
[ 0 1107]]
训练集准确率: 1.0
[[380 90]
[154 336]]
测试集准确率: 0.74375
import numpy
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.utils import shuffle
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import plot_confusion_matrix
from sklearn.metrics import confusion_matrix
from sklearn import metrics
from sklearn import svm
# Step 1: data preprocessing
# Open the workbook; per the original notes its sheets are ['阴性', '阳性']
# (negative / positive cases).
path = 'xiaochuan.xlsx'
xl = pd.ExcelFile(path)
# Negative sheet (noted as 4923 rows x 13 columns in full): keep only rows whose
# index label lies in 0..1600 (.loc slicing is label-based and inclusive).
df1 = xl.parse('阴性', index_col=0).loc[0:1600, :]
# Positive sheet (noted as 1597 rows x 13 columns) is used in full.
df2 = xl.parse('阳性', index_col=0)
# Insert the label at column position 13: 0 = negative, 1 = positive.
df1.insert(loc=13, column='是否得病', value=0)
df2.insert(loc=13, column='是否得病', value=1)
# Stack negatives on top of positives and renumber the rows from 0.
df3 = pd.concat([df1, df2], axis=0, ignore_index=True)
ColNames_List = df3.columns.tolist()
# Column order noted by the author:
# ['性别', '年龄', '身高cm', '体重kg', '潮气量(L) ',
#  '用力肺活量(%)', '中心气道参数(%)', 'FEV1/FVC(%)', '最高呼气流速(%)', '外周气道参数A(%)',
#  '外周气道参数B(%)', '外周气道参数C(%)', '外周气道参数D(%)', '是否得病']
# print(df3)
性别 年龄 身高cm 体重kg ... 外周气道参数B(%) 外周气道参数C(%) 外周气道参数D(%) 是否得病
0 男 40 176.0 67.0 ... 64.661654 105.070423 76.190476 0
1 男 55 151.0 49.0 ... 59.870550 101.844262 83.653846 0
2 男 30 181.0 69.0 ... 64.519906 103.146067 107.100592 0
3 男 25 179.0 75.0 ... 62.237762 104.092072 92.814371 0
4 男 23 171.0 59.0 ... 54.024390 74.943567 48.076923 0
... .. .. ... ... ... ... ... ... ...
6515 女 36 167.0 60.0 ... 96.875000 102.512563 100.000000 1
6516 男 28 183.0 68.0 ... 102.870264 63.942308 82.269504 1
6517 女 36 160.0 55.0 ... 64.957265 58.203125 54.585153 1
6518 女 60 159.0 74.0 ... 100.957854 72.000000 48.529412 1
6519 女 63 156.0 46.0 ... 66.336634 92.553191 87.368421 1
[6520 rows x 14 columns]
# Add BMI = weight (kg) / height (m)^2; height is stored in centimetres.
df3['BMI'] = df3['体重kg'] / ((df3['身高cm'] / 100) ** 2)
# Move the label to position 14 so it follows the new BMI column.
d = df3.pop('是否得病')
df3.insert(14, '是否得病', d)
# print(df3.head(5))
# Encode gender numerically (applied frame-wide): male -> 1, female -> 0.
df3.replace('男', 1, inplace=True)
df3.replace('女', 0, inplace=True)
# Randomise the row order.
df3 = shuffle(df3)
# print(df3)
# Question 3 (note: metadata = age, weight, height, BMI; first 7 examination
# parameters). This variant keeps every column; the drop is disabled:
# drop_features = ['性别','外周气道参数C(%)','外周气道参数C(%)']
# df = df3.drop(drop_features, axis=1)
df = df3
# Impute missing tidal-volume values with the column mean.
mean_val = df['潮气量(L) '].mean()
# print(mean_val)  # recorded value: 1.3880972283325999
# Assign the filled column back: the chained `df[col].fillna(..., inplace=True)`
# form acts on a temporary and is a no-op under pandas copy-on-write.
df['潮气量(L) '] = df['潮气量(L) '].fillna(mean_val)
Index(['性别', '年龄', '身高cm', '体重kg', '潮气量(L) ', '用力肺活量(%)', '中心气道参数(%)',
'FEV1/FVC(%)', '最高呼气流速(%)', '外周气道参数A(%)', '外周气道参数B(%)', '外周气道参数C(%)',
'外周气道参数D(%)', 'BMI', '是否得病'],
# Step 3: model building (SVM on all features)
dataset = df
columns_to_scale = ['性别', '年龄', '身高cm', '体重kg', '潮气量(L) ', '用力肺活量(%)', '中心气道参数(%)',
                    'FEV1/FVC(%)', '最高呼气流速(%)', '外周气道参数A(%)', '外周气道参数B(%)', '外周气道参数C(%)',
                    '外周气道参数D(%)', 'BMI']
# Target as a 1-D Series: SVC expects shape (n_samples,), and a one-column
# DataFrame makes scikit-learn emit a DataConversionWarning.
y = dataset['是否得病']
# Input features.
x = dataset.drop(['是否得病'], axis=1)
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=0)
# Fit the scaler on the training split only and reuse its statistics for the
# test split; fitting on all rows leaks test-set mean/variance into training.
standardScaler = StandardScaler()
x_train = x_train.copy()
x_test = x_test.copy()
x_train[columns_to_scale] = standardScaler.fit_transform(x_train[columns_to_scale])
x_test[columns_to_scale] = standardScaler.transform(x_test[columns_to_scale])
model = svm.SVC()  # SVM classifier with default hyper-parameters
model = model.fit(x_train, y_train)  # train on the training split
prediction = model.predict(x_train)  # predictions on the TRAINING split
# p_svm = pd.DataFrame(prediction)
# p_svm.to_csv("train_svm.csv", index=False, sep=',')
# accuracy_score takes (y_true, y_pred).
print('训练集准确率:', metrics.accuracy_score(y_train, prediction))
prediction = model.predict(x_test)  # predictions on the held-out test split
p_svm = pd.DataFrame(prediction)
p_svm.to_csv("test_svm.csv", index=False, sep=',')
print('测试集准确率:', metrics.accuracy_score(y_test, prediction))
import numpy
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.utils import shuffle
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import plot_confusion_matrix
from sklearn.metrics import confusion_matrix
from sklearn import metrics
from sklearn import svm
# Step 1: data preprocessing
# Open the workbook; per the original notes its sheets are ['阴性', '阳性']
# (negative / positive cases).
path = 'xiaochuan.xlsx'
xl = pd.ExcelFile(path)
# Negative sheet (noted as 4923 rows x 13 columns in full): keep only rows whose
# index label lies in 0..1600 (.loc slicing is label-based and inclusive).
df1 = xl.parse('阴性', index_col=0).loc[0:1600, :]
# Positive sheet (noted as 1597 rows x 13 columns) is used in full.
df2 = xl.parse('阳性', index_col=0)
# Insert the label at column position 13: 0 = negative, 1 = positive.
df1.insert(loc=13, column='是否得病', value=0)
df2.insert(loc=13, column='是否得病', value=1)
# Stack negatives on top of positives and renumber the rows from 0.
df3 = pd.concat([df1, df2], axis=0, ignore_index=True)
ColNames_List = df3.columns.tolist()
# Column order noted by the author:
# ['性别', '年龄', '身高cm', '体重kg', '潮气量(L) ',
#  '用力肺活量(%)', '中心气道参数(%)', 'FEV1/FVC(%)', '最高呼气流速(%)', '外周气道参数A(%)',
#  '外周气道参数B(%)', '外周气道参数C(%)', '外周气道参数D(%)', '是否得病']
# print(df3)
性别 年龄 身高cm 体重kg ... 外周气道参数B(%) 外周气道参数C(%) 外周气道参数D(%) 是否得病
0 男 40 176.0 67.0 ... 64.661654 105.070423 76.190476 0
1 男 55 151.0 49.0 ... 59.870550 101.844262 83.653846 0
2 男 30 181.0 69.0 ... 64.519906 103.146067 107.100592 0
3 男 25 179.0 75.0 ... 62.237762 104.092072 92.814371 0
4 男 23 171.0 59.0 ... 54.024390 74.943567 48.076923 0
... .. .. ... ... ... ... ... ... ...
6515 女 36 167.0 60.0 ... 96.875000 102.512563 100.000000 1
6516 男 28 183.0 68.0 ... 102.870264 63.942308 82.269504 1
6517 女 36 160.0 55.0 ... 64.957265 58.203125 54.585153 1
6518 女 60 159.0 74.0 ... 100.957854 72.000000 48.529412 1
6519 女 63 156.0 46.0 ... 66.336634 92.553191 87.368421 1
[6520 rows x 14 columns]
# Add BMI = weight (kg) / height (m)^2; height is stored in centimetres.
height_m = df3['身高cm'] / 100
df3['BMI'] = df3['体重kg'] / (height_m ** 2)
# Move the label to position 14 so it follows the new BMI column.
d = df3.pop('是否得病')
df3.insert(14, '是否得病', d)
# print(df3.head(5))
# Encode gender numerically (applied frame-wide): male -> 1, female -> 0.
for gender_text, gender_code in (('男', 1), ('女', 0)):
    df3.replace(gender_text, gender_code, inplace=True)
# Randomise the row order.
df3 = shuffle(df3)
# print(df3)
# Question 3: metadata features = age, weight, height, BMI; examination
# features = the first seven parameters. That leaves gender and the peripheral
# airway parameters C and D to drop. The original list named C twice, so D was
# never removed.
drop_features = ['性别', '外周气道参数C(%)', '外周气道参数D(%)']
df = df3.drop(drop_features, axis=1)
# Impute missing tidal-volume values with the column mean.
mean_val = df['潮气量(L) '].mean()
# print(mean_val)  # recorded value: 1.3880972283325999
# Assign the filled column back: the chained `df[col].fillna(..., inplace=True)`
# form acts on a temporary and is a no-op under pandas copy-on-write.
df['潮气量(L) '] = df['潮气量(L) '].fillna(mean_val)
# Columns remaining after the drop:
# ['年龄', '身高cm', '体重kg', '潮气量(L) ', '用力肺活量(%)', '中心气道参数(%)', 'FEV1/FVC(%)',
#  '最高呼气流速(%)', '外周气道参数A(%)', '外周气道参数B(%)', 'BMI', '是否得病']
# Step 3: model building
dataset = df
columns_to_scale = ['年龄', '身高cm', '体重kg', '潮气量(L) ', '用力肺活量(%)', '中心气道参数(%)', 'FEV1/FVC(%)',
                    '最高呼气流速(%)', '外周气道参数A(%)', '外周气道参数B(%)', 'BMI']
# Target as a 1-D Series: SVC expects shape (n_samples,), and a one-column
# DataFrame makes scikit-learn emit a DataConversionWarning.
y = dataset['是否得病']
# Input features.
x = dataset.drop(['是否得病'], axis=1)
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=0)
# Fit the scaler on the training split only and reuse its statistics for the
# test split; fitting on all rows leaks test-set mean/variance into training.
standardScaler = StandardScaler()
x_train = x_train.copy()
x_test = x_test.copy()
x_train[columns_to_scale] = standardScaler.fit_transform(x_train[columns_to_scale])
x_test[columns_to_scale] = standardScaler.transform(x_test[columns_to_scale])
model = svm.SVC()  # SVM classifier with default hyper-parameters
model = model.fit(x_train, y_train)  # train on the training split
prediction = model.predict(x_train)  # predictions on the TRAINING split
# p_svm = pd.DataFrame(prediction)
# p_svm.to_csv("train_svm.csv", index=False, sep=',')
# accuracy_score takes (y_true, y_pred).
print('训练集准确率:', metrics.accuracy_score(y_train, prediction))
prediction = model.predict(x_test)  # predictions on the held-out test split
p_svm = pd.DataFrame(prediction)
p_svm.to_csv("test_svm.csv", index=False, sep=',')
print('测试集准确率:', metrics.accuracy_score(y_test, prediction))
import numpy
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.utils import shuffle
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.neural_network import MLPClassifier
# Step 1: data preprocessing
# Open the workbook; per the original notes its sheets are ['阴性', '阳性']
# (negative / positive cases).
path = 'xiaochuan.xlsx'
xl = pd.ExcelFile(path)
# Negative sheet (noted as 4923 rows x 13 columns in full): keep only rows whose
# index label lies in 0..1600 (.loc slicing is label-based and inclusive).
df1 = xl.parse('阴性', index_col=0).loc[0:1600, :]
# Positive sheet (noted as 1597 rows x 13 columns) is used in full.
df2 = xl.parse('阳性', index_col=0)
# Insert the label at column position 13: 0 = negative, 1 = positive.
df1.insert(loc=13, column='是否得病', value=0)
df2.insert(loc=13, column='是否得病', value=1)
# Stack negatives on top of positives and renumber the rows from 0.
df3 = pd.concat([df1, df2], axis=0, ignore_index=True)
ColNames_List = df3.columns.tolist()
# Column order noted by the author:
# ['性别', '年龄', '身高cm', '体重kg', '潮气量(L) ',
#  '用力肺活量(%)', '中心气道参数(%)', 'FEV1/FVC(%)', '最高呼气流速(%)', '外周气道参数A(%)',
#  '外周气道参数B(%)', '外周气道参数C(%)', '外周气道参数D(%)', '是否得病']
# print(df3)
性别 年龄 身高cm 体重kg ... 外周气道参数B(%) 外周气道参数C(%) 外周气道参数D(%) 是否得病
0 男 40 176.0 67.0 ... 64.661654 105.070423 76.190476 0
1 男 55 151.0 49.0 ... 59.870550 101.844262 83.653846 0
2 男 30 181.0 69.0 ... 64.519906 103.146067 107.100592 0
3 男 25 179.0 75.0 ... 62.237762 104.092072 92.814371 0
4 男 23 171.0 59.0 ... 54.024390 74.943567 48.076923 0
... .. .. ... ... ... ... ... ... ...
6515 女 36 167.0 60.0 ... 96.875000 102.512563 100.000000 1
6516 男 28 183.0 68.0 ... 102.870264 63.942308 82.269504 1
6517 女 36 160.0 55.0 ... 64.957265 58.203125 54.585153 1
6518 女 60 159.0 74.0 ... 100.957854 72.000000 48.529412 1
6519 女 63 156.0 46.0 ... 66.336634 92.553191 87.368421 1
[6520 rows x 14 columns]
# Add BMI = weight (kg) / height (m)^2; height is stored in centimetres.
height_m = df3['身高cm'] / 100
df3['BMI'] = df3['体重kg'] / (height_m ** 2)
# Move the label to position 14 so it follows the new BMI column.
d = df3.pop('是否得病')
df3.insert(14, '是否得病', d)
# print(df3.head(5))
# Encode gender numerically (applied frame-wide): male -> 1, female -> 0.
for gender_text, gender_code in (('男', 1), ('女', 0)):
    df3.replace(gender_text, gender_code, inplace=True)
# Randomise the row order.
df3 = shuffle(df3)
# print(df3)
# Question 3: metadata features = age, weight, height, BMI; examination
# features = the first seven parameters. That leaves gender and the peripheral
# airway parameters C and D to drop. The original list named C twice, so D was
# never removed.
drop_features = ['性别', '外周气道参数C(%)', '外周气道参数D(%)']
df = df3.drop(drop_features, axis=1)
# Impute missing tidal-volume values with the column mean.
mean_val = df['潮气量(L) '].mean()
# print(mean_val)  # recorded value: 1.3880972283325999
# Assign the filled column back: the chained `df[col].fillna(..., inplace=True)`
# form acts on a temporary and is a no-op under pandas copy-on-write.
df['潮气量(L) '] = df['潮气量(L) '].fillna(mean_val)
# Columns remaining after the drop:
# ['年龄', '身高cm', '体重kg', '潮气量(L) ', '用力肺活量(%)', '中心气道参数(%)', 'FEV1/FVC(%)',
#  '最高呼气流速(%)', '外周气道参数A(%)', '外周气道参数B(%)', 'BMI', '是否得病']
# Step 3: model building
# One-hot encode the label into the indicator columns 是否得病_0 / 是否得病_1.
dataset = pd.get_dummies(df, columns=['是否得病'])
columns_to_scale = ['年龄', '身高cm', '体重kg', '潮气量(L) ', '用力肺活量(%)', '中心气道参数(%)', 'FEV1/FVC(%)',
                    '最高呼气流速(%)', '外周气道参数A(%)', '外周气道参数B(%)', 'BMI']
# Target (two indicator columns) and input features.
y = dataset[['是否得病_0','是否得病_1']]
x = dataset.drop(['是否得病_0','是否得病_1'], axis=1)
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=0)
# Standardise: fit on the training split only and reuse its statistics for the
# test split; fitting on all rows leaks test-set mean/variance into training.
standardScaler = StandardScaler()
x_train = x_train.copy()
x_test = x_test.copy()
x_train[columns_to_scale] = standardScaler.fit_transform(x_train[columns_to_scale])
x_test[columns_to_scale] = standardScaler.transform(x_test[columns_to_scale])
mlp = MLPClassifier(solver='lbfgs', hidden_layer_sizes=(5, 4), activation='logistic', max_iter=5000)
mlp.fit(x_train, y_train)
# accuracy_score takes (y_true, y_pred).
y_t = mlp.predict(x_train)
print('训练集准确率:', metrics.accuracy_score(y_train, y_t))
y_pred = mlp.predict(x_test)
print('测试集准确率:', metrics.accuracy_score(y_test, y_pred))
机器学习—模型选择与优化7-1(k-fold交叉验证法) - 橘子橘子呀 - 博客园 (cnblogs.com)
import numpy
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.utils import shuffle
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import plot_confusion_matrix
from sklearn.metrics import confusion_matrix
from sklearn import metrics
from sklearn import svm
from sklearn.model_selection import cross_val_score
from sklearn.neural_network import MLPClassifier
# Step 1: data preprocessing
# Open the workbook; per the original notes its sheets are ['阴性', '阳性']
# (negative / positive cases).
path = 'xiaochuan.xlsx'
xl = pd.ExcelFile(path)
# Negative sheet (noted as 4923 rows x 13 columns); used in full here, the
# row-limiting slice is disabled:
# df1 = df1.loc[0:1600,:]
df1 = xl.parse('阴性', index_col=0)
# Positive sheet (noted as 1597 rows x 13 columns).
df2 = xl.parse('阳性', index_col=0)
# Insert the label at column position 13: 0 = negative, 1 = positive.
df1.insert(loc=13, column='是否得病', value=0)
df2.insert(loc=13, column='是否得病', value=1)
# Stack negatives on top of positives and renumber the rows from 0.
df3 = pd.concat([df1, df2], axis=0, ignore_index=True)
ColNames_List = df3.columns.tolist()
# Column order noted by the author:
# ['性别', '年龄', '身高cm', '体重kg', '潮气量(L) ',
#  '用力肺活量(%)', '中心气道参数(%)', 'FEV1/FVC(%)', '最高呼气流速(%)', '外周气道参数A(%)',
#  '外周气道参数B(%)', '外周气道参数C(%)', '外周气道参数D(%)', '是否得病']
# print(df3)
性别 年龄 身高cm 体重kg ... 外周气道参数B(%) 外周气道参数C(%) 外周气道参数D(%) 是否得病
0 男 40 176.0 67.0 ... 64.661654 105.070423 76.190476 0
1 男 55 151.0 49.0 ... 59.870550 101.844262 83.653846 0
2 男 30 181.0 69.0 ... 64.519906 103.146067 107.100592 0
3 男 25 179.0 75.0 ... 62.237762 104.092072 92.814371 0
4 男 23 171.0 59.0 ... 54.024390 74.943567 48.076923 0
... .. .. ... ... ... ... ... ... ...
6515 女 36 167.0 60.0 ... 96.875000 102.512563 100.000000 1
6516 男 28 183.0 68.0 ... 102.870264 63.942308 82.269504 1
6517 女 36 160.0 55.0 ... 64.957265 58.203125 54.585153 1
6518 女 60 159.0 74.0 ... 100.957854 72.000000 48.529412 1
6519 女 63 156.0 46.0 ... 66.336634 92.553191 87.368421 1
[6520 rows x 14 columns]
# Add BMI = weight (kg) / height (m)^2; height is stored in centimetres.
df3['BMI'] = df3['体重kg'] / ((df3['身高cm'] / 100) ** 2)
# Move the label to position 14 so it follows the new BMI column.
d = df3.pop('是否得病')
df3.insert(14, '是否得病', d)
# print(df3.head(5))
# Encode gender numerically (applied frame-wide): male -> 1, female -> 0.
df3.replace('男', 1, inplace=True)
df3.replace('女', 0, inplace=True)
# Randomise the row order.
df3 = shuffle(df3)
# print(df3)
# Question 3 (note: metadata = age, weight, height, BMI; first 7 examination
# parameters). This variant keeps every column; the drop is disabled:
# drop_features = ['性别','外周气道参数C(%)','外周气道参数C(%)']
# df = df3.drop(drop_features, axis=1)
df = df3
# Impute missing tidal-volume values with the column mean.
mean_val = df['潮气量(L) '].mean()
# print(mean_val)  # recorded value: 1.3880972283325999
# Assign the filled column back: the chained `df[col].fillna(..., inplace=True)`
# form acts on a temporary and is a no-op under pandas copy-on-write.
df['潮气量(L) '] = df['潮气量(L) '].fillna(mean_val)
Index(['性别', '年龄', '身高cm', '体重kg', '潮气量(L) ', '用力肺活量(%)', '中心气道参数(%)',
'FEV1/FVC(%)', '最高呼气流速(%)', '外周气道参数A(%)', '外周气道参数B(%)', '外周气道参数C(%)',
'外周气道参数D(%)', 'BMI', '是否得病'],
# Step 3: model building - compare random forest and MLP by cross-validation.
# One-hot encode the label into the indicator columns 是否得病_0 / 是否得病_1.
dataset = pd.get_dummies(df, columns=['是否得病'])
columns_to_scale = ['性别', '年龄', '身高cm', '体重kg', '潮气量(L) ', '用力肺活量(%)', '中心气道参数(%)',
                    'FEV1/FVC(%)', '最高呼气流速(%)', '外周气道参数A(%)', '外周气道参数B(%)', '外周气道参数C(%)',
                    '外周气道参数D(%)', 'BMI']
# Target (two indicator columns) and input features.
y = dataset[['是否得病_0','是否得病_1']]
x = dataset.drop(['是否得病_0','是否得病_1'], axis=1)
# Split into train and test. All cross-validation runs on the training split;
# the test split is only touched at the very end.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=0)
# Fit the scaler on the training split only and reuse its statistics for the
# test split; fitting on all rows leaks test-set mean/variance into training.
standardScaler = StandardScaler()
x_train = x_train.copy()
x_test = x_test.copy()
x_train[columns_to_scale] = standardScaler.fit_transform(x_train[columns_to_scale])
x_test[columns_to_scale] = standardScaler.transform(x_test[columns_to_scale])
# Score with classification accuracy. The original used scoring='r2', a
# regression metric that is not meaningful for class labels (hence the
# negative values recorded below). Variable names are kept from the original.
rf = RandomForestClassifier(random_state=42, n_estimators=500)
rf_mse = cross_val_score(estimator=rf, X=x_train, y=y_train, scoring='accuracy', cv=5, verbose=1, n_jobs=6)
print('随机森林模型中,准确率的平均数是 %.4f,标准差是 %.4f' % (rf_mse.mean(), rf_mse.std()))
mlp = MLPClassifier(solver='lbfgs', hidden_layer_sizes=(5, 4), activation='logistic', max_iter=5000)
mlp_mse = cross_val_score(estimator=mlp, X=x_train, y=y_train, scoring='accuracy', cv=5, verbose=1, n_jobs=6)
print('MLP模型中,准确率的平均数是 %.4f,标准差是 %.4f' % (mlp_mse.mean(), mlp_mse.std()))
# A good model has the highest mean (low bias) and the smallest standard
# deviation (low variance).
# Results recorded from the original R2-scored run (re-run to refresh):
#   random forest: mean R2 -0.2770, std 0.0515
#   MLP:           mean R2 -0.3461, std 0.0922
# The author therefore selected the random forest.
RandomForest 随机森林算法与模型参数的调优 - 码农充电站 - 博客园 (cnblogs.com)
(88条消息) gridsearchcv参数_随机森林算法参数解释及调优_weixin_39953578的博客-CSDN博客
# Hyper-parameter tuning: choose n_estimators by 10-fold grid search.
# GridSearchCV is not among this script's imports, so bring it in here.
from sklearn.model_selection import GridSearchCV
# Search over the seeded estimator (the original created it but then searched
# an unseeded RandomForestClassifier(), making the result non-reproducible).
randomforest = RandomForestClassifier(random_state=42)
param_test1 = {"n_estimators": range(1, 101, 10)}
gsearch1 = GridSearchCV(estimator=randomforest, param_grid=param_test1,
                        scoring='roc_auc', cv=10)
gsearch1.fit(x_train, y_train)
# The recorded output shows the winning parameters, so print them.
print(gsearch1.best_params_)
# NOTE: best_score_ is the mean cross-validated ROC AUC (scoring='roc_auc'),
# even though the message says "accuracy".
print("best accuracy:%f" % gsearch1.best_score_)
{'n_estimators': 81}
best accuracy:0.790081
# Hyper-parameter tuning: choose max_features with the other parameters held
# constant and n_estimators fixed at 81 (the value found by the previous search).
# GridSearchCV is not among this script's imports, so bring it in here.
from sklearn.model_selection import GridSearchCV
# The original call never closed RandomForestClassifier(...), a syntax error,
# and left the seeded estimator unused; search over the seeded forest.
randomforest = RandomForestClassifier(random_state=42, n_estimators=81)
param_test2 = {"max_features": range(1, 11, 1)}
gsearch1 = GridSearchCV(estimator=randomforest, param_grid=param_test2,
                        scoring='roc_auc', cv=10)
gsearch1.fit(x_train, y_train)
# The recorded output shows the winning parameters, so print them.
print(gsearch1.best_params_)
# NOTE: best_score_ is the mean cross-validated ROC AUC (scoring='roc_auc'),
# even though the message says "accuracy".
print("best accuracy:%f" % gsearch1.best_score_)
{'n_estimators': 81}
best accuracy:0.790081
{'max_features': 9}
best accuracy:0.797361
# Final model: random forest with the tuned n_estimators / max_features.
randomforest = RandomForestClassifier(random_state=42, n_estimators=81, max_features=9)
model = randomforest.fit(x_train, y_train)
# Training-set metrics; predict once and reuse the result.
y_train = numpy.array(y_train)
train_pred = model.predict(x_train)
# argmax turns the one-hot rows back into class indices for the confusion matrix.
xm = confusion_matrix(y_train.argmax(axis=1), train_pred.argmax(axis=1))
# The recorded run output shows this matrix, so print it instead of discarding it.
print(xm)
# accuracy_score takes (y_true, y_pred).
print('训练集准确率:', metrics.accuracy_score(y_train, train_pred))
# Test-set metrics.
y_test = numpy.array(y_test)
y_pred = model.predict(x_test)
cm = confusion_matrix(y_test.argmax(axis=1), y_pred.argmax(axis=1))
print(cm)
print('测试集准确率:', metrics.accuracy_score(y_test, y_pred))
[[1108 0]
[ 0 1129]]
训练集准确率: 1.0
[[393 99]
[147 321]]
测试集准确率: 0.74375
import numpy
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.utils import shuffle
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix
from sklearn import metrics
from sklearn.metrics import recall_score #召回率
# Step 1: data preprocessing
# Open the workbook; per the original notes its sheets are ['阴性', '阳性']
# (negative / positive cases).
path = 'xiaochuan.xlsx'
xl = pd.ExcelFile(path)
# Negative sheet (noted as 4923 rows x 13 columns); used in full here, the
# row-limiting slice is disabled:
# df1 = df1.loc[0:1600,:]
df1 = xl.parse('阴性', index_col=0)
# Positive sheet (noted as 1597 rows x 13 columns).
df2 = xl.parse('阳性', index_col=0)
# Insert the label at column position 13: 0 = negative, 1 = positive.
df1.insert(loc=13, column='是否得病', value=0)
df2.insert(loc=13, column='是否得病', value=1)
# Stack negatives on top of positives and renumber the rows from 0.
df3 = pd.concat([df1, df2], axis=0, ignore_index=True)
# Add BMI = weight (kg) / height (m)^2; height is stored in centimetres.
df3['BMI'] = df3['体重kg'] / ((df3['身高cm'] / 100) ** 2)
# Move the label to position 14 so it follows the new BMI column.
d = df3.pop('是否得病')
df3.insert(14, '是否得病', d)
# Encode gender numerically (applied frame-wide): male -> 1, female -> 0.
df3.replace('男', 1, inplace=True)
df3.replace('女', 0, inplace=True)
# Randomise the row order.
df3 = shuffle(df3)
# Drop the features the author judged unimportant in the earlier correlation analysis.
drop_features = ['性别', '外周气道参数C(%)', '外周气道参数D(%)']
df = df3.drop(drop_features, axis=1)
# Fill missing tidal-volume values with the column mean.
mean_val = df['潮气量(L) '].mean()
# print(mean_val)  # recorded value: 1.3880972283325999
# Assign the filled column back: the chained `df[col].fillna(..., inplace=True)`
# form acts on a temporary and is a no-op under pandas copy-on-write.
df['潮气量(L) '] = df['潮气量(L) '].fillna(mean_val)
# print(df.head(5))
# print(df.isna().sum())
dataset = df
x = dataset.iloc[:, :-1]  # features: every column except the last
y = dataset.iloc[:, -1]   # label: the last column
# Count the samples per label class.
groupby_data_o = dataset.groupby(['是否得病'])['是否得病'].count()
# 分析样本不平衡问题
0 4923
1 1597 存在样本不平衡问题
Name: 是否得病, dtype: int64
# Oversample with SMOTE so that both classes end up with the same number of
# samples.
# NOTE(review): the resampling is applied to the full data set, before the
# train/test split performed later in this script. Synthetic samples are
# interpolated from rows that may land on the other side of the split, so the
# reported test accuracy is likely optimistic. Consider splitting first and
# resampling only the training part.
from imblearn.over_sampling import SMOTE  # oversampling implementation

# Fixed seed: an unseeded SMOTE instance generates different synthetic samples
# (and therefore different metrics) on every run.
model_smote = SMOTE(random_state=42)
x_smote_resampled, y_smote_resampled = model_smote.fit_resample(x, y)
y_smote_resampled = pd.DataFrame(y_smote_resampled, columns=['是否得病'])
# Re-attach the label column to the resampled features.
smote_resampled = pd.concat([x_smote_resampled, y_smote_resampled], axis=1)
# Number of samples per class after resampling.
group_data_smote = smote_resampled.groupby(['是否得病'])['是否得病'].count()
# Recorded label counts after SMOTE (both classes now have the same size):
# 0 4923
# 1 4923
# Name: 是否得病, dtype: int64
# One-hot encode the label into the columns '是否得病_0' / '是否得病_1'.
# dtype=int keeps them as 0/1 integers on every pandas version (pandas >= 2.0
# would otherwise produce bool columns).
dataset = pd.get_dummies(smote_resampled, columns=['是否得病'], dtype=int)

# Standardise the numeric feature columns.
# NOTE(review): the scaler is fitted on all rows, i.e. before the train/test
# split below, so test-set statistics take part in the preprocessing. Fit it
# on the training part only if strict separation is required.
standardScaler = StandardScaler()
columns_to_scale = ['年龄', '身高cm', '体重kg', '潮气量(L) ', '用力肺活量(%)', '中心气道参数(%)',
                    'FEV1/FVC(%)', '最高呼气流速(%)', '外周气道参数A(%)', '外周气道参数B(%)', 'BMI']
dataset[columns_to_scale] = standardScaler.fit_transform(dataset[columns_to_scale])

label_columns = ['是否得病_0', '是否得病_1']
y = dataset[label_columns]  # target: the two one-hot label columns
x = dataset.drop(label_columns, axis=1)  # input features

# Random forest trained on the SMOTE-balanced data; 30% of the rows are held
# out for testing.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=0)
randomforest = RandomForestClassifier(random_state=42, n_estimators=81, max_features=9)
model = randomforest.fit(x_train, y_train)

# Training-set evaluation. The predictions are computed once and reused for
# both the confusion matrix and the accuracy. argmax(axis=1) turns each
# two-column one-hot row back into a class index (0 or 1).
y_train = np.array(y_train)
y_train_pred = model.predict(x_train)
xm = confusion_matrix(y_train.argmax(axis=1), y_train_pred.argmax(axis=1))
# The matrix is printed because the recorded output of this section shows it.
print(xm)
# accuracy_score expects (y_true, y_pred).
print('训练集准确率:', metrics.accuracy_score(y_train, y_train_pred))

# Test-set evaluation, same procedure.
y_test = np.array(y_test)
y_pred = model.predict(x_test)
cm = confusion_matrix(y_test.argmax(axis=1), y_pred.argmax(axis=1))
print(cm)
print('测试集准确率:', metrics.accuracy_score(y_test, y_pred))
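# Recorded console output from a previous run of the SMOTE section above
# (kept as comments so the file remains valid Python):
# [[3477 0]
# [ 0 3415]]
# 训练集准确率: 1.0
# [[1130 316]
# [ 226 1282]]
# 测试集准确率: 0.8165199729180772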
# # 使用RandomUnderSampler进行欠抽样处理
# from imblearn.under_sampling import RandomUnderSampler # 欠抽样处理库RandomUnderSampler
# model_RandomUnderSampler = RandomUnderSampler() # 实例化
# x_RandomUnderSampler_resampled, y_RandomUnderSampler_resampled = model_RandomUnderSampler.fit_resample(x,y) # 输入数据进行欠抽样处理
# y_RandomUnderSampler_resampled = pd.DataFrame(y_RandomUnderSampler_resampled, columns=['是否得病'])
# RandomUnderSampler_resampled = pd.concat([x_RandomUnderSampler_resampled, y_RandomUnderSampler_resampled],
# axis=1) # 将特征和标签重新拼接
# group_data_RandomUnderSampler = RandomUnderSampler_resampled.groupby(['是否得病'])['是否得病'].count() # 查看标签类别个数
# print(group_data_RandomUnderSampler)
# # RandomUnderSampler_resampled 采样后的数据
# dataset = pd.get_dummies(RandomUnderSampler_resampled, columns=['是否得病']) # 将分类变量转成独热变量
# standardScaler = StandardScaler() # 标准化
# columns_to_scale = ['年龄', '身高cm', '体重kg', '潮气量(L) ', '用力肺活量(%)', '中心气道参数(%)',
# 'FEV1/FVC(%)', '最高呼气流速(%)', '外周气道参数A(%)', '外周气道参数B(%)', 'BMI']
# dataset[columns_to_scale] = standardScaler.fit_transform(dataset[columns_to_scale])
# print(dataset.head(5))
# y = dataset[['是否得病_0','是否得病_1']]
# # print(y) # target目标
# x = dataset.drop(['是否得病_0','是否得病_1'],axis=1)
# # print(X) # 输入的特征
# # 随机森林 不管样本分类不平衡
# x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.3, random_state = 0)
# # 创建一个随机森林分类器的实例
# randomforest = RandomForestClassifier(random_state=42,n_estimators=81,max_features=9)
# # 利用训练集样本对分类器模型进行训练
# model = randomforest.fit(x_train, y_train)
# # print(model.predict(x_train))
# # print(y_train)
# # print(type(model.predict(x_train)))
# y_train = numpy.array(y_train)
# xm = confusion_matrix(y_train.argmax(axis=1),model.predict(x_train).argmax(axis=1)) # 混淆矩阵
# print("使用欠采样之后")
# print('训练集混淆矩阵为:\n',xm)
# print('训练集准确率:', metrics.accuracy_score(model.predict(x_train), y_train))
# y_test = numpy.array(y_test)
# y_pred = model.predict(x_test)
# cm = confusion_matrix(y_test.argmax(axis=1),y_pred.argmax(axis=1)) # 混淆矩阵
# print('测试集混淆矩阵为:\n',cm)
# print('测试集准确率:', metrics.accuracy_score(y_pred, y_test))
# '''
# 使用欠采样之后
# 训练集混淆矩阵为:
# [[1089 0]
# [ 0 1146]]
# 训练集准确率: 1.0
# 测试集混淆矩阵为:
# [[321 187]
# [145 306]]
# 测试集准确率: 0.6538060479666319
# '''
# dataset = pd.get_dummies(df, columns=['是否得病']) # 将分类变量转成独热变量
# standardScaler = StandardScaler() # 标准化
# columns_to_scale = ['年龄', '身高cm', '体重kg', '潮气量(L) ', '用力肺活量(%)', '中心气道参数(%)',
# 'FEV1/FVC(%)', '最高呼气流速(%)', '外周气道参数A(%)', '外周气道参数B(%)', 'BMI']
# dataset[columns_to_scale] = standardScaler.fit_transform(dataset[columns_to_scale])
# print(dataset.head(5))
# y = dataset[['是否得病_0','是否得病_1']]
# # print(y) # target目标
# x = dataset.drop(['是否得病_0','是否得病_1'],axis=1)
# # print(X) # 输入的特征
# # 随机森林 不管样本分类不平衡
# x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.3, random_state = 0)
# # 创建一个随机森林分类器的实例
# randomforest = RandomForestClassifier(random_state=42,n_estimators=81,max_features=9)
# # 利用训练集样本对分类器模型进行训练
# model = randomforest.fit(x_train, y_train)
# # print(model.predict(x_train))
# # print(y_train)
# # print(type(model.predict(x_train)))
# y_train = numpy.array(y_train)
# xm = confusion_matrix(y_train.argmax(axis=1),model.predict(x_train).argmax(axis=1)) # 混淆矩阵
# # print("特征选择之后")
# print('训练集混淆矩阵为:\n',xm)
# print('训练集准确率:', metrics.accuracy_score(model.predict(x_train), y_train))
# y_test = numpy.array(y_test)
# y_pred = model.predict(x_test)
# cm = confusion_matrix(y_test.argmax(axis=1),y_pred.argmax(axis=1)) # 混淆矩阵
# print('测试集混淆矩阵为:\n',cm)
# print('测试集准确率:', metrics.accuracy_score(y_pred, y_test))
# '''
# 训练集混淆矩阵为:
# [[3459 0]
# [ 0 1105]]
# 训练集准确率: 1.0
# 测试集混淆矩阵为:
# [[1362 102]
# [ 380 112]]
# 测试集准确率: 0.7535787321063395
# '''
# Draw the confusion matrix `cm` as a heat map with the counts written into
# the cells.
plt.imshow(cm, cmap=plt.cm.Blues)
indices = range(len(cm))
# Tick labels are the two class ids (0 = negative, 1 = positive).
plt.xticks(indices, [0, 1])
plt.yticks(indices, [0, 1])
# imshow places the column index on the x axis and the row index on the
# y axis, so the count stored at cm[row][col] is written at (x=col, y=row).
for (row, col), count in np.ndenumerate(cm):
    plt.text(col, row, count)