"""
Spyder Editor
Created on Wed Aug 23 10:18:27 2019
@author: lenovo
"""
import os
import pandas as pd
from sklearn import metrics
from sklearn import preprocessing
from sklearn import cross_validation
# ---------------------------------------------------------------------------
# Load the raw table, impute missing feature values, and build the
# train/test split that every model below consumes.
# ---------------------------------------------------------------------------
# ``sklearn.model_selection`` supersedes ``sklearn.cross_validation``, which
# was removed in scikit-learn 0.20; imported locally so this section does not
# depend on the legacy module.
from sklearn.model_selection import train_test_split

# The input path below is relative, so move into the directory holding it.
os.chdir('C:/Users/lenovo/Desktop/model_base')
data = pd.read_table('basic.txt', encoding='utf-8')

# Features are the columns at positions 2..41; the target is at position 42.
# The ``.ix`` indexer used previously was removed in pandas 1.0. Selecting by
# position is equivalent as long as the column labels are not integers
# (assumed here because the labels come from the file's header row).
feature_cols = data.columns[2:42]

# Replace missing feature values with the per-column median.
# NOTE(review): the medians are computed on all rows before the split, so the
# test rows influence the imputation values -- confirm this is acceptable.
data[feature_cols] = data[feature_cols].fillna(data[feature_cols].median())

# 70/30 split with a fixed seed so the partition is reproducible.
train_x, test_x, train_y, test_y = train_test_split(
    data[feature_cols],
    data.iloc[:, 42],
    test_size=0.3,
    random_state=20,
)
# ---------------------------------------------------------------------------
# One-hot encode the feature matrices.
#
# The encoder is fitted on the training features only and then reused for the
# test features. Fitting a second encoder on the test set would derive the
# categories (and therefore the output columns) from the test data, so the two
# encoded matrices would not share a column layout.
# ``handle_unknown='ignore'`` encodes a category that appears only in the test
# set as all zeros instead of raising.
# NOTE(review): the models later in this file are fitted on ``train_x`` /
# ``test_x`` rather than on ``tra`` / ``tes`` -- confirm whether that is
# intended.
# ---------------------------------------------------------------------------
enc_tra = preprocessing.OneHotEncoder(handle_unknown='ignore')
enc_tra.fit(train_x)
tra = enc_tra.transform(train_x).toarray()

# Kept as an alias of the training encoder so the name stays available.
enc_tes = enc_tra
tes = enc_tes.transform(test_x).toarray()
# ---------------------------------------------------------------------------
# Logistic regression (L1 penalty): fit, then report coefficients, confusion
# matrix, KS, ROC AUC, precision/recall curve and per-class F1 on the test set.
# ---------------------------------------------------------------------------
from sklearn.linear_model import LogisticRegression

# The solver is named explicitly: the L1 penalty is not supported by the
# default solver of newer scikit-learn releases (lbfgs), while liblinear is
# what older releases used by default.
log_model = LogisticRegression(C=1.0, penalty='l1', tol=0.01,
                               solver='liblinear')
log_model.fit(train_x, train_y)
print ('回归系数:', log_model.coef_)

# Hard class labels feed the confusion matrix and F1; the positive-class
# probability (second column of predict_proba) feeds the ranking metrics.
log_pre = log_model.predict(test_x)
log_score = log_model.predict_proba(test_x)[:, 1]

log_matrix = metrics.confusion_matrix(test_y, log_pre)
print (log_matrix)

# KS is the largest gap between the true-positive and false-positive rates
# over all score thresholds, not the gap at the single default cut-off.
# Assumes binary 0/1 labels with 1 as the positive class.
log_fpr, log_tpr, _ = metrics.roc_curve(test_y, log_score)
print("KS : %f" % (log_tpr - log_fpr).max())

roc = metrics.roc_auc_score(test_y, log_score)
precision, recall, thresholds = metrics.precision_recall_curve(test_y, log_score)
# This previously read ``pred``, which is only assigned further down in the
# decision-tree section and is therefore undefined at this point.
f1_score = metrics.f1_score(test_y, log_pre, average=None)
print(roc, precision, recall, f1_score)
# ---------------------------------------------------------------------------
# Decision tree (Gini criterion): fit, then report training accuracy and the
# test-set ROC AUC, precision/recall curve and per-class F1.
# ---------------------------------------------------------------------------
from sklearn import tree

model = tree.DecisionTreeClassifier(criterion='gini')
model.fit(train_x, train_y)

# The training accuracy used to be computed and thrown away; show it.
tree_train_acc = model.score(train_x, train_y)
print('train accuracy:', tree_train_acc)

# Hard labels for the confusion matrix and F1; positive-class probability
# (second column of predict_proba) for the threshold-based metrics.
# Assumes binary labels.
pred = model.predict(test_x)
tree_score = model.predict_proba(test_x)[:, 1]

tree_matrix = metrics.confusion_matrix(test_y, pred)
roc = metrics.roc_auc_score(test_y, tree_score)
precision, recall, thresholds = metrics.precision_recall_curve(test_y, tree_score)
f1_score = metrics.f1_score(test_y, pred, average=None)
print(roc, precision, recall, f1_score)
# ---------------------------------------------------------------------------
# Support vector classifier with a linear kernel: fit, then report training
# accuracy and the test-set ROC AUC, precision/recall curve and per-class F1.
# ---------------------------------------------------------------------------
from sklearn import svm

# NOTE(review): per the scikit-learn docs ``gamma`` only applies to the rbf,
# poly and sigmoid kernels, so it should have no effect with
# ``kernel='linear'``; kept unchanged.
model = svm.SVC(gamma='auto',
                kernel='linear')
model.fit(train_x, train_y)

# The training accuracy used to be computed and thrown away; show it.
svm_train_acc = model.score(train_x, train_y)
print('train accuracy:', svm_train_acc)

# Hard labels for the confusion matrix and F1. The signed distance to the
# separating hyperplane is used as the continuous score because the classifier
# is not built with probability estimates enabled. Assumes binary labels.
predicted = model.predict(test_x)
svm_score = model.decision_function(test_x)

svm_matrix = metrics.confusion_matrix(test_y, predicted)
roc = metrics.roc_auc_score(test_y, svm_score)
precision, recall, thresholds = metrics.precision_recall_curve(test_y, svm_score)
f1_score = metrics.f1_score(test_y, predicted, average=None)
print(roc, precision, recall, f1_score)
# ---------------------------------------------------------------------------
# Random forest: fit, then report the test-set ROC AUC, precision/recall curve
# and per-class F1.
# ---------------------------------------------------------------------------
from sklearn.ensemble import RandomForestClassifier

rf0 = RandomForestClassifier(n_estimators=1000,
                             criterion='gini',
                             max_depth=5,
                             min_samples_split=5,
                             min_samples_leaf=5,
                             min_weight_fraction_leaf=0.1)
rf0.fit(train_x, train_y)

# ``y_pre`` holds hard class labels; ``y_proba`` holds one probability column
# per class.
y_pre = rf0.predict(test_x)
y_proba = rf0.predict_proba(test_x)
# Positive-class probability (second column). It was previously computed but
# never used, and the ranking metrics were fed the hard labels instead.
# Assumes binary labels.
rf_score = y_proba[:, 1]

rd_matrix = metrics.confusion_matrix(test_y, y_pre)
roc = metrics.roc_auc_score(test_y, rf_score)
precision, recall, thresholds = metrics.precision_recall_curve(test_y, rf_score)
f1_score = metrics.f1_score(test_y, y_pre, average=None)
print(roc, precision, recall, f1_score)
# ---------------------------------------------------------------------------
# XGBoost classifier: fit with early stopping, then report the confusion
# matrix and the test-set ROC AUC, precision/recall curve and per-class F1.
# ---------------------------------------------------------------------------
import xgboost as xgb

model = xgb.XGBClassifier(
    objective='binary:logistic',
    scale_pos_weight=6,
    n_estimators=1000,
    silent=0,
    gamma=0,
    max_delta_step=0,
    max_depth=5,
    reg_lambda=1,
    reg_alpha=1,
    subsample=0.7,
    colsample_bytree=0.7,
    colsample_bylevel=1,
    min_child_weight=1,
    learning_rate=0.01,
    seed=50,
    nthread=8)

# Training stops once the AUC on the evaluation set has not improved for 50
# rounds.
# NOTE(review): the evaluation set is the same test split the metrics below
# are reported on, so those metrics are optimistic -- consider a separate
# validation split.
model.fit(train_x, train_y, verbose=True, eval_metric='auc',
          eval_set=[(test_x, test_y)], early_stopping_rounds=50)

preds = model.predict(test_x)
proba = model.predict_proba(test_x)
# Positive-class probability (second column).
xgb_score = proba[:, 1]

preds_matrix = metrics.confusion_matrix(test_y, preds)
print(preds_matrix)

# These metrics previously read ``y_pre`` (the random-forest predictions), so
# they did not describe this model at all.
roc = metrics.roc_auc_score(test_y, xgb_score)
precision, recall, thresholds = metrics.precision_recall_curve(test_y, xgb_score)
f1_score = metrics.f1_score(test_y, preds, average=None)
print(roc, precision, recall, f1_score)
# ---------------------------------------------------------------------------
# Gradient boosting classifier: fit, persist the fitted model to disk, then
# report test accuracy, ROC AUC, precision/recall curve and per-class F1.
# ---------------------------------------------------------------------------
from sklearn.ensemble import GradientBoostingClassifier

# ``joblib`` was used below without ever being imported. Prefer the standalone
# package and fall back to the copy bundled with older scikit-learn releases.
try:
    import joblib
except ImportError:
    from sklearn.externals import joblib

gbr = GradientBoostingClassifier(n_estimators=1000,
                                 max_depth=4,
                                 min_samples_split=2,
                                 learning_rate=0.1,
                                 subsample=0.6,
                                 random_state=10)

# This section previously referred to x_train / y_train / x_test / y_test,
# none of which are defined in this file; the split produced earlier is named
# train_x / train_y / test_x / test_y.
gbr.fit(train_x, train_y)
joblib.dump(gbr, 'train_model_result.m')

# Hard labels for accuracy and F1; positive-class probability (second column
# of predict_proba) for the ranking metrics. Assumes binary labels.
y_gbr1 = gbr.predict(test_x)
gbr_score = gbr.predict_proba(test_x)[:, 1]
acc_test = gbr.score(test_x, test_y)

# These metrics previously read ``y_pre`` (the random-forest predictions)
# instead of this model's own output.
roc = metrics.roc_auc_score(test_y, gbr_score)
precision, recall, thresholds = metrics.precision_recall_curve(test_y, gbr_score)
f1_score = metrics.f1_score(test_y, y_gbr1, average=None)
print(acc_test, roc, precision, recall, f1_score)
# ---------------------------------------------------------------------------
# LightGBM booster: train with early stopping, save the model, then report
# test RMSE, accuracy, ROC AUC, precision/recall curve and per-class F1.
# ---------------------------------------------------------------------------
import lightgbm as lgb
from sklearn.metrics import mean_squared_error

# NOTE(review): the objective is 'regression' although the target is treated
# as a binary label elsewhere in this file -- confirm whether 'binary' was
# intended. Left unchanged here.
params = {
    'task': 'train',
    'boosting_type': 'gbdt',
    'objective': 'regression',
    'metric': {'l2', 'auc'},
    'num_leaves': 31,
    'learning_rate': 0.05,
    'feature_fraction': 0.9,
    'bagging_fraction': 0.8,
    'bagging_freq': 5,
    'verbose': 1 }

# ``lgb_train`` / ``lgb_eval`` were used without being defined; build them from
# the split created earlier. ``reference`` ties the evaluation set to the
# training set's feature binning.
lgb_train = lgb.Dataset(train_x, label=train_y)
lgb_eval = lgb.Dataset(test_x, label=test_y, reference=lgb_train)

# NOTE(review): early stopping is evaluated on the same test split that the
# metrics below are reported on.
gbm = lgb.train(params,
                lgb_train,
                num_boost_round=20,
                valid_sets=lgb_eval,
                early_stopping_rounds=5)
gbm.save_model('model.txt')

# Test-set scores from the best iteration. These were previously overwritten
# by the training-set leaf indices before the RMSE was computed.
y_pred = gbm.predict(test_x, num_iteration=gbm.best_iteration)

# Training-set outputs, kept under their own names for inspection: the leaf
# index each row falls into per tree, and the raw scores.
train_leaf_index = gbm.predict(train_x, pred_leaf=True)
y_pred_prob = gbm.predict(train_x)

rmse = mean_squared_error(test_y, y_pred) ** 0.5

# The booster has no ``score`` method (the old line called the separate
# ``gbr`` model), so derive hard labels by thresholding the scores.
# Assumes 0/1 labels and a 0.5 cut-off -- TODO confirm the threshold.
lgb_label = (y_pred >= 0.5).astype(int)
acc_test = metrics.accuracy_score(test_y, lgb_label)

# These metrics previously read ``y_pre`` (the random-forest predictions)
# instead of this model's own output.
roc = metrics.roc_auc_score(test_y, y_pred)
precision, recall, thresholds = metrics.precision_recall_curve(test_y, y_pred)
f1_score = metrics.f1_score(test_y, lgb_label, average=None)
print(rmse, acc_test, roc, precision, recall, f1_score)