"""
一次使用多个机器学习模型(随机森林、逻辑回归、XGBoost、支持向量机、神经网络)训练和测试自己的分类数据。

使用 sklearn 框架训练以下模型,并计算测试数据的准确率和混淆矩阵:

  1. 随机森林
  2. 逻辑回归
  3. xgboost
  4. 支持向量机
  5. 神经网络
"""
import numpy as np
import pandas as pd
import argparse
from time import time
from datetime import datetime
from sklearn.metrics import confusion_matrix as CM # 导入混淆矩阵计算模块
from sklearn.metrics import accuracy_score as ACCS #导入准确率计算模块


# Command-line interface: two positional arguments — the feature-matrix CSV
# and the label file. The parsed namespace `args` is used by the loading
# section below.
cli = argparse.ArgumentParser(description='ML training and testing using RF,SVM,LR with input of matrix and target')
cli.add_argument("allmatrix",help="input all the datamatrix in dataframe")
cli.add_argument("target",help="input label data with np.txt format")
args = cli.parse_args()


def timerecord(start=None):
    """Print (and return) the wall-clock time elapsed since `start`.

    start: a time.time() timestamp; defaults to the module-level `time0`
           that each timed section sets just before its work begins.

    The elapsed duration is printed as "MM:SS:ffffff" (minutes, seconds,
    microseconds), matching the original output format.

    NOTE: the original implementation used datetime.fromtimestamp(elapsed),
    which interprets a *duration* as an epoch timestamp converted to local
    time — so the printed minutes were wrong in any timezone whose UTC
    offset is not a whole number of hours (e.g. +05:30).
    """
    elapsed = time() - (time0 if start is None else start)
    # Split the duration into minutes / seconds / microseconds exactly.
    total_us = int(elapsed * 1_000_000)
    minutes, rem = divmod(total_us, 60_000_000)
    seconds, micros = divmod(rem, 1_000_000)
    stamp = "{:02d}:{:02d}:{:06d}".format(minutes, seconds, micros)
    print("time cost: {}".format(stamp))
    return stamp

# Load the feature matrix (CSV with a header row and an index column) and
# the label vector (plain-text integers; 0 and 1 per the counts below —
# presumably 0 = chromosome, 1 = plasmid, judging by the print labels).
allmatrix = pd.read_csv(args.allmatrix,index_col=0,header=0)
target = np.loadtxt(args.target,dtype='int32')
print("allmatrix shape: {}".format(allmatrix.shape))
print("target shape: {}".format(target.shape))
print("chrom NO.: ",(target==0).sum())  # number of class-0 samples
print("plas NO.: ",(target==1).sum())   # number of class-1 samples


# Random forest: train on a 70/30 split, report test and training accuracy,
# then show the confusion matrix of the test predictions. The split
# (Xtrain/Xtest/Ytrain/Ytest) is reused by all later models.
print("random forest training result")
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score

# Fixed random_state values keep the split and the forest reproducible.
Xtrain,Xtest,Ytrain,Ytest = train_test_split(allmatrix,target,test_size=0.3,random_state=420)
rfc = RandomForestClassifier(n_estimators=100,random_state=90,n_jobs = -1)  # n_jobs=-1: use all CPU cores
rfc = rfc.fit(Xtrain,Ytrain)
pred_rfc = rfc.predict(Xtest)
score=ACCS(Ytest, pred_rfc)
print("RFC 1st training test score: {}".format(score))
print("Training data score: {}".format(rfc.score(Xtrain,Ytrain)))
print("\n")

# Confusion matrix on the held-out test set (rows: true class, cols: predicted).
cm_rfc = CM(Ytest, pred_rfc)
print("confusion matrix of rfc is: {}".format(cm_rfc))
print("\n")

# 5-fold cross-validation of the 100-tree forest on the full data set.
time0=time()  # start timestamp read by timerecord()
score_val = cross_val_score(rfc,allmatrix,target,cv=5)
print("RFC n_estimator=100 cv=5 corssvalidation score: {}".format(score_val))
print("RFC n_estimator=100 cv=5 cv score mean: {}".format(score_val.mean()))
timerecord()
print("\n")

# Random forest with 200 trees: single train/test evaluation, then 5-fold CV.
rfc200 = RandomForestClassifier(n_estimators=200,random_state=90,n_jobs = -1)
rfc200 = rfc200.fit(Xtrain,Ytrain)
score=rfc200.score(Xtest,Ytest)
print("RFC n_estimator=200 training test score: {}".format(score))

# 5-fold cross-validation for the 200-tree forest.
time0=time()
score_val200 = cross_val_score(rfc200,allmatrix,target,cv=5)
print("n_estimators=200, cv5 validation score:{} with mean of {}".format(score_val200,score_val200.mean()))
timerecord()
print("\n")

# Logistic regression with L2 regularisation: train/test evaluation,
# confusion matrix, then 5-fold cross-validation with a fresh model.
print("Logistic regression training")
from sklearn.linear_model import LogisticRegression as LR
from sklearn.metrics import accuracy_score
time0=time()
lr2=LR(penalty='l2',solver='liblinear',max_iter=1000)
lr2=lr2.fit(Xtrain,Ytrain)
pred_lr2= lr2.predict(Xtest)
score_lr = ACCS(Ytest, pred_lr2)
print("LR training with lr2 norm test score: {}".format(score_lr))
print("LR training with lr2 norm training data score: {}".format(lr2.score(Xtrain,Ytrain)))
timerecord()

# Confusion matrix of the L2 model on the test set.
cm_lr2 = CM(Ytest, pred_lr2)
print("LR2 cm is {}".format(cm_lr2))

# 5-fold cross-validation (rebuild an unfitted L2 model first).
time0=time()
lr2 = LR(penalty='l2',solver='liblinear',max_iter=1000)
val_lr2=cross_val_score(lr2,allmatrix,target,cv=5)
print("LR_lr2 CV=5 cross_validation score: {} with mean: {}".format(val_lr2,val_lr2.mean()))
timerecord()
print("\n")

# Logistic regression with L1 regularisation: 5-fold cross-validation only.
time0 = time()
lr1=LR(penalty='l1',solver='liblinear',max_iter=1000)
val_lr1=cross_val_score(lr1,allmatrix,target,cv=5)
print("lr1 cv=5 val score: {} with mean: {}".format(val_lr1,val_lr1.mean()))
timerecord()

# Support vector machine training (can be slow on large data sets).
print("\n","SVM training")
# Linear kernel, evaluated once on the earlier 70/30 split.
from sklearn import svm
time0=time()
clf=svm.SVC(kernel='linear').fit(Xtrain, Ytrain)
pred_svm = clf.predict(Xtest)
clf_score = ACCS(Ytest,pred_svm)
print("svm training using linear kernel test score: {}".format(clf_score))
timerecord()

#confusion matrix
#cm_svm = CM(Ytest, pred_svm)
#print("svm_linear cm is {}".format(cm_svm))

# using rbf kernel  # 使用非线性核rbf
#clf=svm.SVC(kernel='rbf',gamma='auto').fit(Xtrain, Ytrain)
#pred_svmrbf = clf.predict(Xtest)
#clf_score_rbf = ACCS(Ytest,pred_svmrbf)
#print("svm training using rbf kernel test score: {}".format(clf_score_rbf))
#timerecord()
#confusion matrix
#cm_svmrbf = CM(Ytest, pred_svmrbf)
#print("svm_rbf cm is {}".format(cm_svmrbf))

#cross_validation
#if clf_score >= clf_score_rbf:
#    clf_val = cross_val_score(svm.SVC(kernel='linear'),allmatrix,target,cv=5)
#    print("svm cv=5 corss validation using linear kernel: {}".format(clf_val))
#    timerecord()
#else:
#    clf_val = cross_val_score(svm.SVC(kernel='rbf'),allmatrix,target,cv=5)
#    print("svm cv=5 corss validation using rbf kernel: {}".format(clf_val))
#    timerecord()

# XGBoost: train a gradient-boosted binary classifier on the 70/30 split
# and evaluate accuracy and confusion matrix on the held-out test set.
print("\n")
print("xgboost training")
import xgboost as xgb
# Wrap the data in XGBoost's optimised DMatrix containers.
dtrain = xgb.DMatrix(Xtrain,Ytrain)
dtest = xgb.DMatrix(Xtest,Ytest)
# 'verbosity': 0 silences training logs — the old 'silent' flag was
# deprecated and then removed in XGBoost >= 1.0 and only triggers warnings.
param= {'verbosity':0,'objective':'binary:logistic',"eta":0.1,"scale_pos_weight":1}
num_round = 100  # number of boosting rounds
bst = xgb.train(param, dtrain, num_round)
preds = bst.predict(dtest)  # predicted probabilities of class 1

# Binarise the probabilities at the 0.5 threshold (vectorised replacement
# for the original copy-then-assign three-liner; same float 0/1 output).
ypred = (preds > 0.5).astype(preds.dtype)
xgb_score=ACCS(Ytest, ypred)
print('xgb score: ',xgb_score)

# Confusion matrix of the XGBoost predictions on the test set.
cm_xgb = CM(Ytest, ypred)
print("xgb cm is {}".format(cm_xgb))

# Neural network (MLP): two hidden layers of 50 units each; train/test
# evaluation, confusion matrix, then 5-fold cross-validation.
print("\n")
print("Neuro networking training")
from sklearn.neural_network import MLPClassifier
time0 = time()
dnn = MLPClassifier(hidden_layer_sizes=(50,50),random_state = 420)  # 2 hidden layers, 50 neurons each
dnn = dnn.fit(Xtrain,Ytrain)
pred_dnn = dnn.predict(Xtest)
score_dnn = ACCS(Ytest, pred_dnn)
print("dnn training with laryer (50, 50) test score: {}".format(score_dnn))
timerecord()

# Confusion matrix of the MLP on the test set.
cm_dnn = CM(Ytest, pred_dnn)
print("MLPC DNN(50,50) cm is {}".format(cm_dnn))
# 5-fold cross-validation of the same architecture.
time0 = time()
dnn_val = cross_val_score(dnn,allmatrix,target,cv=5)
print("dnn(50,50) cv=5 cross validation score: {} with mean {}".format(dnn_val,dnn_val.mean()))
timerecord()

# Standardising the features (zero mean, unit variance) usually improves
# MLP training; re-run the 5-fold CV of the (50,50) model on the scaled data.
print("\n")
print("standardized matrix")
from sklearn.preprocessing import StandardScaler as SS
X_std=SS().fit_transform(allmatrix)
dnn = MLPClassifier(hidden_layer_sizes=(50,50),random_state = 420)
dnn_std = cross_val_score(dnn, X_std, target, cv=5)
print("dnn_std cross validaion score",dnn_std)
print('dnn(50,50) std data cv=5 cross validation score mean: ', dnn_std.mean())

# Deeper MLP: hidden layers (200,200,100,100,50,25,10,2), trained on the
# standardised data with a fresh 70/30 split (same random_state as before).
print("\n")
print("(200,200,100,100,50,25,10,2) training")
dnn = MLPClassifier(hidden_layer_sizes=(200,200,100,100,50,25,10,2),random_state = 420)
Xtrain_s,Xtest_s,Ytrain_s,Ytest_s = train_test_split(X_std,target,test_size=0.3,random_state=420)  # split the standardised data
dnn = dnn.fit(Xtrain_s,Ytrain_s)
pred_dnn7 = dnn.predict(Xtest_s)
score_dnn7 = ACCS(Ytest_s, pred_dnn7)
print("7 laryer test standard data test score: {}".format(score_dnn7))

# Confusion matrix of the deep MLP on the standardised test set.
cm_dnn7 = CM(Ytest_s, pred_dnn7)
print("MLPC 7 layres DNN cm is {}".format(cm_dnn7))

# 5-fold cross-validation of the deep MLP (hidden layers
# 200,200,100,100,50,25,10,2) on the standardised matrix.
dnn = MLPClassifier(hidden_layer_sizes=(200,200,100,100,50,25,10,2),random_state = 420)
# BUG FIX: reset the timer here — the original never set time0 for this
# section, so timerecord() below reported the elapsed time since the
# previous section's timer (including the earlier fit/predict/CV work).
time0 = time()
dnn_std_7 = cross_val_score(dnn, X_std, target, cv=5)
print("dnn_std_7 cv=5 scores: ",dnn_std_7)
print("(200,200,100,100,50,25,10,2) training score mean: ",dnn_std_7.mean())
timerecord()

# 你可能感兴趣的:(机器学习,sklearn)