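# NOTE: the line that stood here was a recruitment banner picked up from the web page this script was copied from; it is not part of the program.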
# -*- coding: utf-8 -*-
"""
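Score the records in "test_info_0903_001" with a previously saved XGBoost model.

The input is cleaned, its text columns are one-hot encoded, the resulting
features are aligned with the feature names stored on the loaded model, and
the predictions are written to "just_test_pred_info_0903_001.csv".

(Originally created from the Spyder editor's temporary-script template.)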
"""
from sklearn.externals import joblib
import xgboost as xgb
# Load the persisted model; the relative path is resolved against the current
# working directory. Later code reads `feature_names` and `best_ntree_limit`
# from it and calls `predict` on an xgb.DMatrix, so it is presumably an
# xgboost Booster -- TODO confirm.
# NOTE(review): joblib.load unpickles the file, which can execute arbitrary
# code; only load model files that come from a trusted source.
loaded_model = joblib.load("xgb_best_model_0731_joblib.model")
import pandas as pd
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
class MultiOneHotEncoder:
    """One-hot encode several categorical columns of a DataFrame.

    Each column named in ``column_name_list`` is replaced by one float
    (0.0 / 1.0) indicator column per distinct value, named
    ``"<column>_<value>"``. Columns that are not encoded keep their original
    order and come first; the indicator columns follow, grouped per encoded
    column in the order given by ``column_name_list``.

    Attributes:
        df: The input frame; after ``multi_column_encoder()`` it holds the
            encoded result.
        column_name_list: Names of the columns to encode.
        df_dummies: Indicator frame of the most recently encoded column
            (only set once at least one column has been encoded).
    """

    def __init__(self, df, column_name_list):
        """Store the frame to encode and the names of its categorical columns."""
        self.df = df
        self.column_name_list = column_name_list

    def multi_column_encoder(self):
        """Return a new DataFrame with the configured columns one-hot encoded.

        The DataFrame passed to the constructor is never modified. Row labels
        are preserved, so the result lines up with the input even when the
        index is not a default 0..n-1 range. An empty ``column_name_list``
        yields an unchanged copy. Missing values in an encoded column produce
        a row of zeros across that column's indicators (pandas' default
        ``dummy_na=False`` behaviour).

        Returns:
            pandas.DataFrame: the encoded frame (also stored on ``self.df``).

        Raises:
            KeyError: if a name in ``column_name_list`` is not a column of
                the frame.
        """
        column_names = list(self.column_name_list)

        # Begin with everything that is NOT being encoded; ``drop`` returns a
        # new frame, which keeps the caller's DataFrame untouched.
        pieces = [self.df.drop(column_names, axis=1)]

        for column_name in column_names:
            # get_dummies keeps the source index, so the axis=1 concat below
            # aligns row-for-row. dtype=float gives 0.0/1.0 indicator values.
            self.df_dummies = pd.get_dummies(
                self.df[column_name],
                prefix=column_name,
                prefix_sep="_",
                dtype=float,
            )
            pieces.append(self.df_dummies)

        self.df = pd.concat(pieces, axis=1)
        return self.df
# Column names for the header-less input file, in file order.
column_list = [
    "user_id", "position_status", "gender", "highest_degree", "age",
    "seniority", "marital_status",
    "latest_position", "next-to-last_position", "last-but-two_position",
    "leave_hours",
    "recent-30days_leave_times", "recent-60days_leave_times",
    "recent-30days_email_outbreaks", "recent-30days_single_email_outbreaks",
    "recent-60days_email_outbreaks", "recent-60days_single_email_outbreaks",
]

# Column 0 (user_id) is read as str; 'Null' / 'null' / 'NULL' are parsed as NaN.
test = pd.read_csv(
    "test_info_0903_001",
    sep=',',
    header=None,
    converters={0: str},
    na_values=['Null', 'null', 'NULL'],
)
test.columns = column_list

# Discard the three position-history columns before filtering and encoding.
drop_column_list = ['latest_position', 'next-to-last_position', 'last-but-two_position']
test = test.drop(drop_column_list, axis=1)

# Keep rows with 18 <= age < 80 whose highest_degree is not '初中及以下'.
# Rows with a missing age fail both comparisons and are filtered out.
age_in_range = (test['age'] >= 18) & (test['age'] < 80)
degree_excluded = test['highest_degree'].isin(['初中及以下'])
test = test[age_in_range & ~degree_excluded]

# Give a missing highest_degree its own 'missing' category, then renumber the
# surviving rows 0..n-1.
values = {'highest_degree': 'missing'}
test = test.fillna(value=values).reset_index(drop=True)

# Binary label: 1 when position_status equals '离职', otherwise 0.
test['position_status'] = test['position_status'].apply(
    lambda status: int(status == '离职')
)

# Every object-dtype column except user_id is one-hot encoded.
column_dtypes = test.dtypes
string_column = column_dtypes[column_dtypes == 'object'].index
column_name_list = string_column.tolist()
remove_column_list = ['user_id']
for var in remove_column_list:
    column_name_list.remove(var)

encoder = MultiOneHotEncoder(test, column_name_list)
data = encoder.multi_column_encoder()

# Model inputs: everything except the identifier and the label.
X_pred = data.drop(['user_id', 'position_status'], axis=1)

# Earlier prediction code, kept for reference only (it was disabled in the
# original script). It passed X_pred to the model as-is, without aligning its
# columns to the model's feature names:
#   xgb_x = xgb.DMatrix(X_pred)
#   y_pred = loaded_model.predict(xgb_x, ntree_limit=loaded_model.best_ntree_limit)
#   result = pd.DataFrame({'Actual': data.position_status, 'Prob': y_pred})
############
# Feature names stored on the loaded model.
model_features = loaded_model.feature_names

# Align the encoded frame with the model's features: columns the model does
# not know are dropped, and features absent from this data set are added as
# all-zero columns. Prediction therefore works whether one-hot encoding
# produced fewer or more columns than the model expects.
aligned_X_pred = X_pred.reindex(columns=model_features, fill_value=0)
aligned_dmatrix = xgb.DMatrix(aligned_X_pred)

# Predict with the tree limit recorded on the model as best_ntree_limit.
new_y_pred = loaded_model.predict(
    aligned_dmatrix,
    ntree_limit=loaded_model.best_ntree_limit,
)

# One output row per input row: actual label, model output, and user id.
result = pd.DataFrame(
    {
        'Actual': data.position_status,
        'Prob': new_y_pred,
        'user_id': data['user_id'],
    },
    columns=['Actual', 'Prob', 'user_id'],
)
result.to_csv("just_test_pred_info_0903_001.csv", index=False)