# 常用机器学习算法模型简版代码 (Concise reference code for common machine-learning models)

# -*- coding: utf-8 -*-
"""
Spyder Editor
Created on Wed Aug 23 10:18:27 2019
@author: lenovo
"""
import os

import pandas as pd
from sklearn import metrics
from sklearn import model_selection
from sklearn import preprocessing

try:
    # sklearn.cross_validation was removed in scikit-learn 0.20; the legacy
    # import is kept only for old installations that still ship it.
    from sklearn import cross_validation
except ImportError:
    cross_validation = None

# Work from the project directory so the relative data path below resolves.
os.getcwd()
os.chdir('C:/Users/lenovo/Desktop/model_base')

# Load the tab-separated data set.
data = pd.read_table('basic.txt', encoding='utf-8')

# Columns at positions 2..41 are the features; the column at position 42 is
# the label. `DataFrame.ix` was removed in pandas 1.0, so the columns are
# selected by position explicitly.
# NOTE(review): this assumes the column labels are not integers, in which case
# the old `.ix[:, 2:42]` slice was positional -- confirm against basic.txt.
feature_cols = data.columns[2:42]

# Impute missing feature values with the per-column median. `numeric_only`
# keeps the old "skip non-numeric columns" behaviour on recent pandas.
# NOTE(review): the medians are computed on the full data set (train + test).
data[feature_cols] = data[feature_cols].fillna(
    data[feature_cols].median(numeric_only=True))

# Hold out 30% of the rows for testing; the fixed seed keeps the split
# reproducible. `cross_validation` was replaced by `model_selection`.
train_x, test_x, train_y, test_y = model_selection.train_test_split(
    data[feature_cols], data.iloc[:, 42], test_size=0.3, random_state=20)

# One-hot encoding (not used by the models below).
# A single encoder is fitted on the training features and reused for the test
# features. Fitting a second encoder on the test set would produce a different
# category-to-column mapping, so `tra` and `tes` would not share a feature
# space. Categories that only appear in the test set are encoded as all zeros.
enc_tra = preprocessing.OneHotEncoder(handle_unknown='ignore')
enc_tra.fit(train_x)
tra = enc_tra.transform(train_x).toarray()
# Kept as an alias so any code that refers to `enc_tes` still works.
enc_tes = enc_tra
tes = enc_tes.transform(test_x).toarray()

# LogisticRegression
from sklearn.linear_model import LogisticRegression
# C is the inverse of the regularization strength; it controls how strongly
# the coefficients are shrunk.
# penalty: type of regularization term. tol: stopping tolerance
# (scikit-learn's default is 1e-4).
# solver='liblinear' is set explicitly because the L1 penalty is not supported
# by the default 'lbfgs' solver of recent scikit-learn releases.
log_model = LogisticRegression(C=1.0, penalty='l1', tol=0.01,
                               solver='liblinear')
log_model.fit(train_x, train_y)
print ('回归系数:', log_model.coef_)

# Hard class predictions feed the confusion matrix and F1; the positive-class
# probability feeds the threshold-free metrics (ROC AUC, PR curve).
log_pre = log_model.predict(test_x)
log_proba = log_model.predict_proba(test_x)[:, 1]

log_matrix = metrics.confusion_matrix(test_y, log_pre)
print (log_matrix)
# KS at the default decision threshold: true-negative rate minus
# false-negative rate (equivalently TPR - FPR). Labels are taken to be 0/1.
print("KS : %f" % (log_matrix[0][0]/len(test_y[test_y==0]) - log_matrix[1][0]/len(test_y[test_y==1])))
# print(metrics.precision_score(test_y, log_pre, average=None))
# print(metrics.recall_score(test_y, log_pre, average=None))
roc = metrics.roc_auc_score(test_y, log_proba)
precision, recall, thresholds = metrics.precision_recall_curve(test_y, log_proba)
# Fixed: this line used `pred`, which is not defined until the decision-tree
# section and raised NameError here.
f1_score = metrics.f1_score(test_y, log_pre, average=None)
print(roc, precision, recall, f1_score)

# Decision tree
from sklearn import tree
model = tree.DecisionTreeClassifier(criterion='gini')
model.fit(train_x, train_y)
# Training accuracy; previously computed and silently discarded.
tree_train_acc = model.score(train_x, train_y)
print(tree_train_acc)

# predict: hard labels for the confusion matrix / F1, positive-class
# probability for the threshold-free metrics.
pred = model.predict(test_x)
tree_proba = model.predict_proba(test_x)[:, 1]
tree_matrix = metrics.confusion_matrix(test_y, pred)

# multi-objective synthetic evaluation
# ROC AUC and the PR curve are computed from scores rather than hard labels,
# which would collapse each curve to a single operating point.
roc = metrics.roc_auc_score(test_y, tree_proba)
precision, recall, thresholds = metrics.precision_recall_curve(test_y, tree_proba)
f1_score = metrics.f1_score(test_y, pred, average=None)
print(roc, precision, recall, f1_score)

# SVM
from sklearn import svm
# Linear-kernel support vector classifier (gamma has no effect with a linear
# kernel; the argument is kept as in the original configuration).
model = svm.SVC(gamma='auto',
                kernel='linear')
model.fit(train_x, train_y)
# Training accuracy; previously computed and silently discarded.
svm_train_acc = model.score(train_x, train_y)
print(svm_train_acc)

# predict: hard labels for the confusion matrix / F1, signed distance to the
# separating hyperplane as the ranking score (SVC exposes no probabilities
# unless probability=True).
predicted = model.predict(test_x)
svm_score = model.decision_function(test_x)
svm_matrix = metrics.confusion_matrix(test_y, predicted)

# multi-objective synthetic evaluation
# ROC AUC and the PR curve are computed from continuous scores rather than
# hard labels.
roc = metrics.roc_auc_score(test_y, svm_score)
precision, recall, thresholds = metrics.precision_recall_curve(test_y, svm_score)
f1_score = metrics.f1_score(test_y, predicted, average=None)
print(roc, precision, recall, f1_score)

# Random forest
from sklearn.ensemble import RandomForestClassifier
rf0 = RandomForestClassifier(n_estimators=1000,
                             criterion='gini',
                             max_depth=5,
                             min_samples_split=5,
                             min_samples_leaf=5,
                             min_weight_fraction_leaf=0.1)
rf0.fit(train_x, train_y)

# predict: hard labels plus per-class probabilities.
y_pre = rf0.predict(test_x)
y_proba = rf0.predict_proba(test_x)
rd_matrix = metrics.confusion_matrix(test_y, y_pre)

# multi-objective synthetic evaluation
# `y_proba` was computed but never used; the positive-class column is the
# ranking score for ROC AUC and the PR curve.
rf_score = y_proba[:, 1]
roc = metrics.roc_auc_score(test_y, rf_score)
precision, recall, thresholds = metrics.precision_recall_curve(test_y, rf_score)
f1_score = metrics.f1_score(test_y, y_pre, average=None)
print(roc, precision, recall, f1_score)

# XGBoost
import xgboost as xgb
# NOTE(review): `silent`, `seed` and `nthread` are legacy parameter names, and
# passing `eval_metric` / `early_stopping_rounds` to fit() is only accepted by
# older xgboost releases -- confirm against the installed version.
model = xgb.XGBClassifier(
        objective='binary:logistic',
        scale_pos_weight=6,
        n_estimators=1000,
        silent=0,
        gamma=0,
        max_delta_step=0,
        max_depth=5,
        reg_lambda=1,
        reg_alpha=1,
        subsample=0.7,
        colsample_bytree=0.7,
        colsample_bylevel=1,
        min_child_weight=1,
        learning_rate=0.01,
        seed=50,
        nthread=8)
# NOTE(review): early stopping is monitored on the test split, so the reported
# test metrics are optimistically biased; a separate validation split would
# avoid that.
model.fit(train_x,train_y,verbose=True,eval_metric='auc',eval_set = [(test_x, test_y)],early_stopping_rounds=50)

# predict: hard labels plus per-class probabilities.
preds = model.predict(test_x)
proba = model.predict_proba(test_x)
preds_matrix = metrics.confusion_matrix(test_y, preds)
print(preds_matrix)

# multi-objective synthetic evaluation
# Fixed: the metrics were computed from `y_pre` (the random-forest
# predictions) instead of this model's own output.
xgb_score = proba[:, 1]
roc = metrics.roc_auc_score(test_y, xgb_score)
precision, recall, thresholds = metrics.precision_recall_curve(test_y, xgb_score)
f1_score = metrics.f1_score(test_y, preds, average=None)
print(roc, precision, recall, f1_score)

# GBDT
from sklearn.ensemble import GradientBoostingClassifier
# `joblib` was used below without ever being imported. Recent scikit-learn
# installs ship it as a standalone package; very old ones bundled it.
try:
    import joblib
except ImportError:
    from sklearn.externals import joblib

gbr = GradientBoostingClassifier(n_estimators=1000,
                                 max_depth=4,
                                 min_samples_split=2,
                                 learning_rate=0.1,
                                 subsample=0.6,
                                 random_state=10)
# Fixed: the section referred to x_train / y_train / x_test / y_test, which
# are never defined; the split above is named train_x / train_y / test_x /
# test_y.
gbr.fit(train_x, train_y)
# Persist the fitted model to disk.
joblib.dump(gbr, 'train_model_result.m')

# predict: hard labels plus the positive-class probability.
y_gbr1 = gbr.predict(test_x)
gbr_score = gbr.predict_proba(test_x)[:, 1]

# multi-objective synthetic evaluation
acc_test = gbr.score(test_x, test_y)
# Fixed: the metrics were computed from `y_pre` (the random-forest
# predictions) instead of this model's own output.
roc = metrics.roc_auc_score(test_y, gbr_score)
precision, recall, thresholds = metrics.precision_recall_curve(test_y, gbr_score)
f1_score = metrics.f1_score(test_y, y_gbr1, average=None)
print(acc_test, roc, precision, recall, f1_score)

# LightGBM
import lightgbm as lgb
from sklearn.metrics import mean_squared_error
params = {
    'task': 'train',
    'boosting_type': 'gbdt',
    # NOTE(review): the target is treated as 0/1 elsewhere in this script, yet
    # the objective is L2 regression; 'binary' may be what was intended.
    'objective': 'regression',
    # A list (not a set) keeps the metric order deterministic across runs.
    'metric': ['l2', 'auc'],
    'num_leaves': 31,
    'learning_rate': 0.05,
    'feature_fraction': 0.9,
    'bagging_fraction': 0.8,
    'bagging_freq': 5,
    'verbose': 1 }

# Fixed: `lgb_train` / `lgb_eval` were used without being created. They are
# built here from the split made at the top of the script.
lgb_train = lgb.Dataset(train_x, label=train_y)
lgb_eval = lgb.Dataset(test_x, label=test_y, reference=lgb_train)

# Early stopping is passed as a callback; the `early_stopping_rounds` keyword
# of lgb.train() was removed in LightGBM 4.
gbm = lgb.train(params,
                lgb_train,
                num_boost_round=20,
                valid_sets=[lgb_eval],
                callbacks=[lgb.early_stopping(5)])
gbm.save_model('model.txt')

# predict
# With early stopping enabled, best_iteration selects the best round.
y_pred = gbm.predict(test_x, num_iteration=gbm.best_iteration)
# Leaf indices per tree for the training rows. Fixed: this used to overwrite
# `y_pred`, so the RMSE below compared leaf indices with the test labels.
y_pred_leaf = gbm.predict(train_x, pred_leaf=True)
# Raw model output on the training rows.
y_pred_prob = gbm.predict(train_x)

# multi-objective synthetic evaluation
# Fixed: X_test / x_train / y_test were undefined, accuracy came from the
# unrelated `gbr` model, and the remaining metrics used the random forest's
# `y_pre`. Everything below is computed from this model's own test output.
rmse = mean_squared_error(test_y, y_pred) ** 0.5
# Threshold the continuous output at 0.5 to obtain 0/1 labels.
y_pred_label = (y_pred >= 0.5).astype(int)
acc_test = metrics.accuracy_score(test_y, y_pred_label)
roc = metrics.roc_auc_score(test_y, y_pred)
precision, recall, thresholds = metrics.precision_recall_curve(test_y, y_pred)
f1_score = metrics.f1_score(test_y, y_pred_label, average=None)
print(rmse, acc_test, roc, precision, recall, f1_score)


# 你可能感兴趣的:(机器学习篇,机器学习,算法,统计模型,python,svm)