"""Train several machine-learning models on the same data set in one run.

Using the scikit-learn stack, train random forest, logistic regression,
support-vector-machine and xgboost models (plus a neural network) and report
test-set accuracy and a confusion matrix for each:
- random forest
- logistic regression
- xgboost
- support vector machine
- neural network
"""
import numpy as np
import pandas as pd
import argparse
from time import time
from datetime import datetime
from sklearn.metrics import confusion_matrix as CM # confusion-matrix helper
from sklearn.metrics import accuracy_score as ACCS # accuracy helper
# Command-line interface: positional paths to the feature matrix (CSV read by
# pandas, first column = row index) and the label vector (np.loadtxt format).
parser = argparse.ArgumentParser(description='ML training and testing using RF,SVM,LR with input of matrix and target')
parser.add_argument("allmatrix",help="input all the datamatrix in dataframe")
parser.add_argument("target",help="input label data with np.txt format")
args = parser.parse_args()
#record time costed
def timerecord(start=None):
    """Print wall-clock time elapsed since *start* as "MM:SS:microseconds".

    Parameters
    ----------
    start : float, optional
        Reference ``time.time()`` checkpoint.  Defaults to the module-level
        ``time0`` variable, so the existing zero-argument ``timerecord()``
        call sites keep working unchanged.

    FIX: the previous version formatted the elapsed seconds with
    ``datetime.fromtimestamp(...)``, which converts via the *local* timezone;
    in zones with a non-whole-hour UTC offset (e.g. UTC+05:30) the printed
    minutes were wrong.  The elapsed time is now formatted directly.
    """
    elapsed = time() - (time0 if start is None else start)
    mins, secs = divmod(int(elapsed), 60)
    micros = int((elapsed % 1) * 1e6)  # fractional second -> microseconds
    print("time cost: {:02d}:{:02d}:{:06d}".format(mins, secs, micros))
# load feature matrix and label
# allmatrix: samples x features DataFrame (CSV; first column is the index);
# target: integer label vector where 0 = chromosome, 1 = plasmid (see prints).
allmatrix = pd.read_csv(args.allmatrix,index_col=0,header=0)
target = np.loadtxt(args.target,dtype='int32')
print("allmatrix shape: {}".format(allmatrix.shape))
print("target shape: {}".format(target.shape))
print("chrom NO.: ",(target==0).sum())
print("plas NO.: ",(target==1).sum())
# Random forest training
# Random Forest training and cross_validation
print("random forest training result")
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
# 70/30 train/test split; fixed random_state for reproducibility.  Xtrain,
# Xtest, Ytrain, Ytest are reused by every model section below.
Xtrain,Xtest,Ytrain,Ytest = train_test_split(allmatrix,target,test_size=0.3,random_state=420)
rfc = RandomForestClassifier(n_estimators=100,random_state=90,n_jobs = -1)
rfc = rfc.fit(Xtrain,Ytrain)
pred_rfc = rfc.predict(Xtest)
score=ACCS(Ytest, pred_rfc)
print("RFC 1st training test score: {}".format(score))
print("Training data score: {}".format(rfc.score(Xtrain,Ytrain)))
print("\n")
# confusion matrix on the held-out test split
# get confusion_matrix
cm_rfc = CM(Ytest, pred_rfc)
print("confusion matrix of rfc is: {}".format(cm_rfc))
print("\n")
# cv=5 cross validation (5-fold, on the full data set; refits the estimator)
time0=time()
score_val = cross_val_score(rfc,allmatrix,target,cv=5)
print("RFC n_estimator=100 cv=5 corssvalidation score: {}".format(score_val))
print("RFC n_estimator=100 cv=5 cv score mean: {}".format(score_val.mean()))
timerecord()
print("\n")
# n_estimator=200 training: grow the forest to 200 trees
rfc200 = RandomForestClassifier(n_estimators=200,random_state=90,n_jobs = -1)
rfc200 = rfc200.fit(Xtrain,Ytrain)
score=rfc200.score(Xtest,Ytest)
print("RFC n_estimator=200 training test score: {}".format(score))
# cv=5 cross validation for the 200-tree forest
time0=time()
score_val200 = cross_val_score(rfc200,allmatrix,target,cv=5)
print("n_estimators=200, cv5 validation score:{} with mean of {}".format(score_val200,score_val200.mean()))
timerecord()
print("\n")
# Logistic regression training and cross validation, using L2 regularization
print("Logistic regression training")
from sklearn.linear_model import LogisticRegression as LR
from sklearn.metrics import accuracy_score
time0=time()
lr2=LR(penalty='l2',solver='liblinear',max_iter=1000)
lr2=lr2.fit(Xtrain,Ytrain)
pred_lr2= lr2.predict(Xtest)
score_lr = ACCS(Ytest, pred_lr2)
print("LR training with lr2 norm test score: {}".format(score_lr))
print("LR training with lr2 norm training data score: {}".format(lr2.score(Xtrain,Ytrain)))
timerecord()
# confusion matrix on the held-out test split
cm_lr2 = CM(Ytest, pred_lr2)
print("LR2 cm is {}".format(cm_lr2))
# LR cross validation (fresh estimator; cross_val_score refits per fold)
time0=time()
lr2 = LR(penalty='l2',solver='liblinear',max_iter=1000)
val_lr2=cross_val_score(lr2,allmatrix,target,cv=5)
print("LR_lr2 CV=5 cross_validation score: {} with mean: {}".format(val_lr2,val_lr2.mean()))
timerecord()
print("\n")
# L1-norm variant: logistic regression with L1 regularization (CV only)
time0 = time()
lr1=LR(penalty='l1',solver='liblinear',max_iter=1000)
val_lr1=cross_val_score(lr1,allmatrix,target,cv=5)
print("lr1 cv=5 val score: {} with mean: {}".format(val_lr1,val_lr1.mean()))
timerecord()
# SVM training -- slow on large data sets
print("\n","SVM training")
# using linear kernel
from sklearn import svm
time0=time()
clf=svm.SVC(kernel='linear').fit(Xtrain, Ytrain)
pred_svm = clf.predict(Xtest)
clf_score = ACCS(Ytest,pred_svm)
print("svm training using linear kernel test score: {}".format(clf_score))
timerecord()
# NOTE(review): the SVM confusion matrix, the rbf-kernel run and the
# kernel-choice cross-validation below are all commented out -- presumably to
# save runtime on large inputs; re-enable as needed.
#confusion matrix
#cm_svm = CM(Ytest, pred_svm)
#print("svm_linear cm is {}".format(cm_svm))
# using rbf kernel (non-linear)
#clf=svm.SVC(kernel='rbf',gamma='auto').fit(Xtrain, Ytrain)
#pred_svmrbf = clf.predict(Xtest)
#clf_score_rbf = ACCS(Ytest,pred_svmrbf)
#print("svm training using rbf kernel test score: {}".format(clf_score_rbf))
#timerecord()
#confusion matrix
#cm_svmrbf = CM(Ytest, pred_svmrbf)
#print("svm_rbf cm is {}".format(cm_svmrbf))
#cross_validation
#if clf_score >= clf_score_rbf:
# clf_val = cross_val_score(svm.SVC(kernel='linear'),allmatrix,target,cv=5)
# print("svm cv=5 corss validation using linear kernel: {}".format(clf_val))
# timerecord()
#else:
# clf_val = cross_val_score(svm.SVC(kernel='rbf'),allmatrix,target,cv=5)
# print("svm cv=5 corss validation using rbf kernel: {}".format(clf_val))
# timerecord()
# XGBoost training
print("\n")
print("xgboost trainning")
import xgboost as xgb
# Wrap the existing train/test split in XGBoost's native DMatrix container.
dtrain = xgb.DMatrix(Xtrain,Ytrain)
dtest = xgb.DMatrix(Xtest,Ytest)
# NOTE(review): 'silent' was removed in xgboost >= 1.0 (replaced by
# 'verbosity'); on modern versions it only triggers a warning -- confirm the
# installed version before cleaning it up.
param= {'silent':True,'objective':'binary:logistic',"eta":0.1,"scale_pos_weight":1}
num_round = 100
bst = xgb.train(param, dtrain, num_round)
# binary:logistic predicts positive-class probabilities in [0, 1]
preds = bst.predict(dtest)
# FIX(idiom): threshold at 0.5 in one vectorized comparison instead of the
# previous two-pass copy/mask sequence (same result, clearer intent).
ypred = (preds > 0.5).astype(preds.dtype)
xgb_score=ACCS(Ytest, ypred)
print('xgb score: ',xgb_score)
# get confusion matrix
cm_xgb = CM(Ytest, ypred)
print("xgb cm is {}".format(cm_xgb))
# Neural network training
print("\n")
print("Neuro networking training")
from sklearn.neural_network import MLPClassifier
time0 = time()
dnn = MLPClassifier(hidden_layer_sizes=(50,50),random_state = 420) # 2 hidden layers, 50 neurons each
dnn = dnn.fit(Xtrain,Ytrain)
pred_dnn = dnn.predict(Xtest)
score_dnn = ACCS(Ytest, pred_dnn)
print("dnn training with laryer (50, 50) test score: {}".format(score_dnn))
timerecord()
# confusion matrix on the held-out test split
cm_dnn = CM(Ytest, pred_dnn)
print("MLPC DNN(50,50) cm is {}".format(cm_dnn))
# dnn cross validation (cross_val_score clones and refits the estimator)
time0 = time()
dnn_val = cross_val_score(dnn,allmatrix,target,cv=5)
print("dnn(50,50) cv=5 cross validation score: {} with mean {}".format(dnn_val,dnn_val.mean()))
timerecord()
# standardized matrix -- feature scaling typically improves MLP training
print("\n")
print("standardized matrix")
from sklearn.preprocessing import StandardScaler as SS
from sklearn.pipeline import make_pipeline
# X_std is still computed here because the deeper-network experiments further
# below consume it.
X_std=SS().fit_transform(allmatrix)
dnn = MLPClassifier(hidden_layer_sizes=(50,50),random_state = 420)
# FIX(leakage): scale inside each CV fold via a pipeline.  Fitting the scaler
# on the full matrix before cross-validation leaks test-fold statistics
# (mean/variance) into training, inflating the reported scores.
dnn_std = cross_val_score(make_pipeline(SS(), dnn), allmatrix, target, cv=5)
print("dnn_std cross validaion score",dnn_std)
print('dnn(50,50) std data cv=5 cross validation score mean: ', dnn_std.mean())
# increase the number of layers
print("\n")
print("(200,200,100,100,50,25,10,2) training")
dnn = MLPClassifier(hidden_layer_sizes=(200,200,100,100,50,25,10,2),random_state = 420)
Xtrain_s,Xtest_s,Ytrain_s,Ytest_s = train_test_split(X_std,target,test_size=0.3,random_state=420) # train/test split of the standardized data
dnn = dnn.fit(Xtrain_s,Ytrain_s)
pred_dnn7 = dnn.predict(Xtest_s)
score_dnn7 = ACCS(Ytest_s, pred_dnn7)
print("7 laryer test standard data test score: {}".format(score_dnn7))
# confusion matrix for the deeper network
cm_dnn7 = CM(Ytest_s, pred_dnn7)
print("MLPC 7 layres DNN cm is {}".format(cm_dnn7))
# deeper-network cross-validation (fresh estimator instance)
dnn = MLPClassifier(hidden_layer_sizes=(200,200,100,100,50,25,10,2),random_state = 420) # deepen the network: 200,200,100,100,50,25,10,2 neurons per hidden layer
dnn_std_7 = cross_val_score(dnn, X_std, target, cv=5)
print("dnn_std_7 cv=5 scores: ",dnn_std_7)
print("(200,200,100,100,50,25,10,2) training score mean: ",dnn_std_7.mean())
# NOTE(review): time0 was last reset before the (50,50) cross-validation, so
# this reading covers everything since then, not just this section.
timerecord()