Pima Indians数据集为糖尿病患者医疗记录数据,是一个二分类问题。本代码采用80%数据训练,20%数据测试的方法。若数据不做归一化处理,最终模型的分类精度为 79.17%;而数据进行归一化以后,最终模型的分类精度为81.38%。
"""
数据集中各特征(feature)的含义:
Number of times pregnent: 怀孕次数
Plasma glucose concentration a 2 hours in an oral glucose tolerance test: 口服葡萄糖耐量试验中2小时的血浆葡萄糖浓度
Diastolic blood pressure(mm Hg): 舒张压(毫米汞柱)
Triceps skin fold thickness(mm): 三头肌皮褶厚度(毫米)
2-hour serum insulin(mu U/ml): 2小时血清胰岛素
Body mass index(weight in kg/(height in m)^2): 身体质量指数(体重kg / 身高m的平方)
Diabetes pedigree function: 糖尿病家系函数(反映家族遗传风险)
Age(years): 年龄(岁)
Class variable(0 or 1): 类别标签,是否患有糖尿病(1为是,0为否)
"""
from keras.models import Sequential
from keras.layers import Dense
import numpy as np
import pandas as pd
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
# Train a small dense network on the Pima Indians diabetes data with an
# 80/20 train/hold-out split and print the hold-out accuracy.

# Seed NumPy's global RNG and reuse the same value for the split below.
# NOTE(review): the Keras backend may draw initial weights from its own RNG;
# confirm whether it needs to be seeded separately for full repeatability.
seed = 4
np.random.seed(seed)

# Column names in file order; the last column is the 0/1 label.
column_names = [
    'Number of times pregnent',
    'Plasma glucose concentration a 2 hours in an oral glucose tolerance test',
    'Diastolic blood pressure(mm Hg)',
    'Triceps skin fold thickness(mm)',
    '2-hour serum insulin(mu U/ml)',
    'Body mass index(weight in kg/(height in m)^2)',
    'Diabetes pedigree function',
    'Age(years)',
    'Class variable(0 or 1)',
]
label_column = 'Class variable(0 or 1)'

# header=None keeps the first CSV line as data instead of using it as a header.
dataset = pd.read_csv('../data/pima-indians-diabetes.csv', header=None, names=column_names)
train_label = dataset[label_column]
train_feature = dataset.drop(label_column, axis=1)
del dataset

# Split BEFORE scaling: the row partition depends only on the number of rows
# and random_state, so it is the same partition as splitting scaled data.
x_train, x_validation, Y_train, Y_validation = train_test_split(
    train_feature, train_label, test_size=0.2, random_state=seed)

# Fit the min-max scaler on the training rows only, then apply that same
# mapping to the held-out rows, so their min/max never influence training.
min_max_scaler = preprocessing.MinMaxScaler()
x_train = min_max_scaler.fit_transform(x_train)
x_validation = min_max_scaler.transform(x_validation)

# 8 inputs -> 12 relu -> 8 relu -> 1 sigmoid (binary output).
model = Sequential()
model.add(Dense(12, input_dim=8, activation='relu'))
model.add(Dense(8, activation='relu'))
model.add(Dense(1, activation='sigmoid'))
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# Train; the held-out split is also reported as validation data every epoch.
model.fit(x_train, Y_train, validation_data=(x_validation, Y_validation), epochs=150, batch_size=1)

# Final evaluation on the held-out split; scores[1] is the accuracy metric.
scores = model.evaluate(x=x_validation, y=Y_validation)
print('\n%s : %.2f%%' % (model.metrics_names[1], scores[1]*100))
from sklearn.model_selection import StratifiedKFold
"""
数据集中各特征(feature)的含义:
Number of times pregnent: 怀孕次数
Plasma glucose concentration a 2 hours in an oral glucose tolerance test: 口服葡萄糖耐量试验中2小时的血浆葡萄糖浓度
Diastolic blood pressure(mm Hg): 舒张压(毫米汞柱)
Triceps skin fold thickness(mm): 三头肌皮褶厚度(毫米)
2-hour serum insulin(mu U/ml): 2小时血清胰岛素
Body mass index(weight in kg/(height in m)^2): 身体质量指数(体重kg / 身高m的平方)
Diabetes pedigree function: 糖尿病家系函数(反映家族遗传风险)
Age(years): 年龄(岁)
Class variable(0 or 1): 类别标签,是否患有糖尿病(1为是,0为否)
"""
from keras.models import Sequential
from keras.layers import Dense
import numpy as np
import pandas as pd
from sklearn import preprocessing
from sklearn.model_selection import StratifiedKFold
# Estimate accuracy of the small dense network with a hand-written stratified
# 10-fold cross-validation loop, printing each fold's score and the summary.

# Seed NumPy's global RNG and reuse the same value for the fold shuffling.
# NOTE(review): the Keras backend may draw initial weights from its own RNG;
# confirm whether it needs to be seeded separately for full repeatability.
seed = 4
np.random.seed(seed)

# Column names in file order; the last column is the 0/1 label.
column_names = [
    'Number of times pregnent',
    'Plasma glucose concentration a 2 hours in an oral glucose tolerance test',
    'Diastolic blood pressure(mm Hg)',
    'Triceps skin fold thickness(mm)',
    '2-hour serum insulin(mu U/ml)',
    'Body mass index(weight in kg/(height in m)^2)',
    'Diabetes pedigree function',
    'Age(years)',
    'Class variable(0 or 1)',
]
label_column = 'Class variable(0 or 1)'

# header=None keeps the first CSV line as data instead of using it as a header.
dataset = pd.read_csv('../data/pima-indians-diabetes.csv', header=None, names=column_names)
train_label = dataset[label_column]
# Plain ndarray so the fold index arrays select rows by position.
train_feature = dataset.drop(label_column, axis=1).values
del dataset

# n_splits=10 -> 10 folds; shuffle=True shuffles rows before folding.
kfold = StratifiedKFold(n_splits=10, random_state=seed, shuffle=True)
cvscores = []

for train, validation in kfold.split(train_feature, train_label):
    # Fit the scaler on this fold's training rows only and reuse that mapping
    # for the held-out rows, so the held-out min/max never reach training.
    min_max_scaler = preprocessing.MinMaxScaler()
    x_train = min_max_scaler.fit_transform(train_feature[train])
    x_validation = min_max_scaler.transform(train_feature[validation])
    # kfold yields positional indices, so index the label Series with .iloc.
    y_train = train_label.iloc[train]
    y_validation = train_label.iloc[validation]

    # A fresh model per fold: 8 inputs -> 12 relu -> 8 relu -> 1 sigmoid.
    model = Sequential()
    model.add(Dense(12, input_dim=8, activation='relu'))
    model.add(Dense(8, activation='relu'))
    model.add(Dense(1, activation='sigmoid'))
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

    model.fit(x_train, y_train, epochs=150, batch_size=10, verbose=0)

    # scores[1] is the accuracy metric for this fold's held-out rows.
    scores = model.evaluate(x_validation, y_validation)
    print('\n%s : %.2f%%' % (model.metrics_names[1], scores[1]*100))
    cvscores.append(scores[1]*100)

# Mean and standard deviation of the per-fold accuracies.
print('%.2f%% (+/- %.2f%%)' % (np.mean(cvscores), np.std(cvscores)))
Pima Indians数据集为糖尿病患者医疗记录数据,是一个二分类问题。本代码采用80%数据训练,20%数据测试的方法。若数据不做归一化处理,最终模型的分类精度为 79.17%;而数据进行归一化以后,最终模型的分类精度为81.38%。
其中还包括一份10折交叉验证的代码,最终的运行结果为76.69% (+/- 2.95%)。
sklearn结尾的代码使用Keras提供的scikit-learn封装器KerasClassifier进行二分类,并通过sklearn的10折交叉验证评估,得到最终的精度为0.7681989076737664
GridSearch结尾的代码为用sklearn包的GridSearchCV搜索超参,得到最终的结果为Best: 0.781250 using {'batch_size': 10, 'epochs': 150, 'init': 'normal', 'optimizer': 'rmsprop'}
from keras.wrappers.scikit_learn import KerasClassifier
# Excerpt of the grid-search script that appears in full further down in this
# file. As written here it uses create_model, param_grid and GridSearchCV
# before the lines that define/import them later in the file.
# Bare expression: builds a wrapper around create_model and discards it.
KerasClassifier(build_fn=create_model, verbose=0)
# Same wrapper, this time bound to `model` for use by the grid search.
model = KerasClassifier(build_fn=create_model, verbose=0)
# Candidate values for each hyper-parameter being searched.
param_grid['optimizer'] = ['rmsprop', 'adam']
param_grid['init'] = ['glorot_uniform', 'normal', 'uniform']
param_grid['epochs'] = [50, 100, 150, 200]
param_grid['batch_size'] = [5, 10, 20]
# Search every combination in param_grid and fit on the full feature/label set.
grid = GridSearchCV(estimator=model, param_grid=param_grid)
results = grid.fit(train_feature, train_label)
from keras.models import Sequential
from keras.layers import Dense
import numpy as np
import pandas as pd
from sklearn import preprocessing
from sklearn.model_selection import GridSearchCV
from keras.wrappers.scikit_learn import KerasClassifier
def create_model(optimizer='adam', init='glorot_uniform'):
    """Build and compile the dense binary classifier used by the grid search.

    Args:
        optimizer: Optimizer passed to ``compile``.
        init: Kernel initializer applied to every ``Dense`` layer.

    Returns:
        The compiled ``Sequential`` model.
    """
    layers = [
        # input_dim is the number of input features.
        Dense(units=12, kernel_initializer=init, input_dim=8, activation='relu'),
        Dense(units=8, kernel_initializer=init, activation='relu'),
        # A single sigmoid unit: one output column for the binary label.
        Dense(units=1, kernel_initializer=init, activation='sigmoid'),
    ]
    network = Sequential(layers)
    network.compile(loss='binary_crossentropy', optimizer=optimizer, metrics=['accuracy'])
    return network
# Grid-search optimizer, initializer, epochs and batch size for the network
# built by create_model, then print the best and every individual result.

# Seed NumPy's global RNG.
seed = 4
np.random.seed(seed)

# Column names in file order; the last column is the 0/1 label.
column_names = [
    'Number of times pregnent',
    'Plasma glucose concentration a 2 hours in an oral glucose tolerance test',
    'Diastolic blood pressure(mm Hg)',
    'Triceps skin fold thickness(mm)',
    '2-hour serum insulin(mu U/ml)',
    'Body mass index(weight in kg/(height in m)^2)',
    'Diabetes pedigree function',
    'Age(years)',
    'Class variable(0 or 1)',
]

# header=None keeps the first CSV line as data instead of using it as a header.
dataset = pd.read_csv('../data/pima-indians-diabetes.csv', header=None, names=column_names)

# pop() returns the label column and removes it, leaving only the features.
train_label = dataset.pop('Class variable(0 or 1)')

# Min-max scale the features.
# NOTE(review): the scaler is fitted on all rows before GridSearchCV splits
# them, so each fold's held-out rows influence the scaling. Wrapping scaler and
# model in a Pipeline would avoid that but renames the param_grid keys.
min_max_scaler = preprocessing.MinMaxScaler()
train_feature = min_max_scaler.fit_transform(dataset)
del dataset

# scikit-learn compatible wrapper around create_model; verbose=0 is passed
# through to keep training output quiet.
model = KerasClassifier(build_fn=create_model, verbose=0)

# Hyper-parameter candidates to search.
param_grid = {
    'optimizer': ['rmsprop', 'adam'],
    'init': ['glorot_uniform', 'normal', 'uniform'],
    'epochs': [50, 100, 150, 200],
    'batch_size': [5, 10, 20],
}

# Run the search.
grid = GridSearchCV(estimator=model, param_grid=param_grid)
results = grid.fit(train_feature, train_label)

# Best mean score and the parameter combination that achieved it.
print('Best: %f using %s' % (results.best_score_, results.best_params_))

# cv_results_ is a dict of per-combination arrays (convertible to a DataFrame).
means = results.cv_results_['mean_test_score']
stds = results.cv_results_['std_test_score']
params = results.cv_results_['params']

# One line per parameter combination; %r prints the dict in repr form.
for mean, std, param in zip(means, stds, params):
    print('%f (%f) with: %r' % (mean, std, param))
GridSearch结尾的代码为用sklearn包的GridSearchCV搜索超参,得到最终的结果为Best: 0.781250 using {'batch_size': 10, 'epochs': 150, 'init': 'normal', 'optimizer': 'rmsprop'}
from sklearn.model_selection import StratifiedKFold, cross_val_score
from keras.models import Sequential
from keras.layers import Dense
import numpy as np
import pandas as pd
from sklearn import preprocessing
from sklearn.model_selection import StratifiedKFold, cross_val_score
from keras.wrappers.scikit_learn import KerasClassifier
def create_model():
    """Build and compile the dense binary classifier used for cross-validation.

    Returns:
        The compiled ``Sequential`` model: 8 inputs -> 12 relu -> 8 relu ->
        1 sigmoid, with binary cross-entropy loss and the adam optimizer.
    """
    network = Sequential([
        Dense(12, input_dim=8, activation='relu'),
        Dense(8, activation='relu'),
        Dense(1, activation='sigmoid'),
    ])
    network.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    return network
# Estimate accuracy of the create_model network with scikit-learn's
# cross_val_score over 10 stratified folds and print the mean score.

# Imported here so this script section brings its own dependency into scope.
from sklearn.pipeline import make_pipeline

# Seed NumPy's global RNG and reuse the same value for the fold shuffling.
# NOTE(review): the Keras backend may draw initial weights from its own RNG;
# confirm whether it needs to be seeded separately for full repeatability.
seed = 4
np.random.seed(seed)

# Column names in file order; the last column is the 0/1 label.
column_names = [
    'Number of times pregnent',
    'Plasma glucose concentration a 2 hours in an oral glucose tolerance test',
    'Diastolic blood pressure(mm Hg)',
    'Triceps skin fold thickness(mm)',
    '2-hour serum insulin(mu U/ml)',
    'Body mass index(weight in kg/(height in m)^2)',
    'Diabetes pedigree function',
    'Age(years)',
    'Class variable(0 or 1)',
]
label_column = 'Class variable(0 or 1)'

# header=None keeps the first CSV line as data instead of using it as a header.
dataset = pd.read_csv('../data/pima-indians-diabetes.csv', header=None, names=column_names)
train_label = dataset[label_column]
# Raw (unscaled) features; scaling happens inside the pipeline below.
train_feature = dataset.drop(label_column, axis=1).values
del dataset

# scikit-learn compatible wrapper around create_model.
model = KerasClassifier(build_fn=create_model, epochs=150, batch_size=10, verbose=0)

# Scaler + classifier as one estimator: cross_val_score refits the whole
# pipeline per fold, so the min-max scaler only ever sees that fold's training
# rows and the held-out rows cannot leak into the scaling.
pipeline = make_pipeline(preprocessing.MinMaxScaler(), model)

# 10-fold stratified cross-validation with shuffled rows.
kfold = StratifiedKFold(n_splits=10, shuffle=True, random_state=seed)
results = cross_val_score(pipeline, train_feature, train_label, cv=kfold)

# Mean accuracy across the folds.
print(results.mean())
0.763004102804