# -*- coding: utf-8 -*-
"""
Created on Wed Oct 13 17:05:46 2021
@author: Gaodun
"""
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from xgboost.sklearn import XGBRegressor,XGBClassifier
from collections import Counter
from xgboost import XGBClassifier
from sklearn.model_selection import GridSearchCV
from sklearn import metrics
from odps import ODPS
from odps.df import DataFrame
# Establish the ODPS connection.
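# NOTE: the ODPS client `o` used below is assumed to exist already. A minimal sketch of
# creating it is shown here; the access keys, project name and endpoint are placeholders,
# not values taken from the original source.
o = ODPS('<access_id>', '<secret_access_key>', project='<project_name>',
         endpoint='<odps_endpoint>')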
# Read the training data: study-time features joined with each student's target label.
sql = '''
SELECT
a.*,b.target as target2
FROM (select * from
op_bi_study_time_701new) a
left join (select target,student_id from op_bi_stu_summary_816new) b on a.student_id1=b.student_id
;
'''
query_job = o.execute_sql(sql)
result = query_job.open_reader(tunnel=True)
df = result.to_pandas(n_process=1)
import datetime

def get_yesterday():
    """Return yesterday's date as a 'YYYYMMDD' string (the ODPS partition format)."""
    yesterday = datetime.date.today() - datetime.timedelta(days=1)
    return yesterday.strftime('%Y%m%d')

yesterday_ds = get_yesterday()
# Pull yesterday's partition of still-active students (target = 0).
sql2 = '''
SELECT *
FROM op_ads_usr_std_final_info
WHERE ds = %s AND target = 0
;
''' % yesterday_ds
query_job2 = o.execute_sql(sql2)
result2 = query_job2.open_reader(tunnel=True)
df2 = result2.to_pandas(n_process=1)
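# df2 holds yesterday's still-active students (target = 0): the population the trained model
# is meant to score (see the hedged scoring sketch after joblib.dump further below).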
# Fill missing activity counts with 0; fill missing appraisal scores with the column mean.
zero_fill_cols = [
    '15d_commu_cnts', '15d_commu_time', '15d_complain_cnts',
    'month_commu_cnts', 'month_commu_time', 'month_complain_cnts',
    'prev_commu_cnts', 'prev_commu_time', 'prev_complain_cnts',
    'video_num', 'live_num', 'material_num', 'paper_num',
    'pass_cdt', 'pass_rate',
]
df[zero_fill_cols] = df[zero_fill_cols].fillna(0)
df['appraise'] = df['appraise'].fillna(df['appraise'].mean())
# Inspect the class balance, then downsample each target class to 50% (stratified sampling).
gbr = df.groupby("target")
gbr.groups
typicalFracDict = {
    0: 0.5,
    1: 0.5,
}

def typicalSampling(group, frac_dict):
    """Sample each group at the fraction configured for its target value."""
    return group.sample(frac=frac_dict[group.name])

df_stratified = df.groupby('target', group_keys=False).apply(typicalSampling, typicalFracDict)
X=df_stratified[[
'15d_frequency',
'growth_frequency',
'30d_video',
'prev_video',
'month_live',
'month_video',
'video_num',
'pay_price',
'prev_correct',
'days_to_end',
'paper_num',
'correct_rate',
'study_cnts',
'month_correct',
'month_material',
'prev_live',
'prev_material',
'month_paper',
'days_to_start',
'appraise',
'material_num',
'live_num']]
Y=df_stratified.target2
X_train, X_test, Y_train, Y_test = train_test_split(X,Y, test_size=0.3, random_state=33)
# Hyper-parameter grid for the XGBoost classifier.
params = {'learning_rate': np.linspace(0.01, 0.10, 5),
          'max_depth': list(range(1, 10, 2)),
          'n_estimators': list(range(100, 300, 100))}
clf = XGBClassifier()
grid = GridSearchCV(clf, params, cv=3, scoring="accuracy")
grid.fit(X_train,Y_train)
grid.best_score_
grid.best_params_
grid.best_estimator_
best_model=grid.best_estimator_
predict_y=best_model.predict(X_test)
metrics.f1_score(Y_test, predict_y)
metrics.accuracy_score(Y_test, predict_y)
import joblib
joblib.dump(best_model, 'model2.pkl')
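# df2 (yesterday's still-active students) is loaded above but never scored in this script.
# The two lines below are a hedged sketch of applying the persisted model to it, assuming
# df2 carries the same feature columns as X and that 0 is an acceptable missing-value fill
# (both assumptions are mine, not stated in the original).
reloaded_model = joblib.load('model2.pkl')
churn_proba = reloaded_model.predict_proba(df2[X.columns].fillna(0))[:, 1]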
#%% ---------------- Offline analysis on exported Excel data ----------------
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from xgboost.sklearn import XGBRegressor,XGBClassifier
from collections import Counter
#data load
df=pd.read_excel('data.xlsx')
df.to_excel('data1.xlsx')
df.note_num=df.note_num.fillna(0)
df.info()
df=df.drop(["student_id","company_name",
"company_department",
"company_nature",
"province_name",
"city_name",
"growth_correct",
"growth_note",
"growth_matrial",
"growth_paper",
"growth_question",
"growth_live",
"growth_video",
"15d_correct",
"grow7d_frequency",
"grow7d_correct",
"grow7d_video",
"grow7d_note",
"grow15d_frequency",
"grow15d_correct",
"grow15d_video",
"grow15d_note"
],axis=1)
df["job_status_name"].fillna("未知",inplace=True)
df["school_highest_edu_name"].fillna("未知",inplace=True)
df.school_type.fillna("未知",inplace=True)
df.school_name.fillna("未知",inplace=True)
df.professional.fillna("未知",inplace=True)
df.school_enter_time.fillna("未知",inplace=True)
df.growth_frequency.fillna(0,inplace=True)
df.note_num.fillna(0,inplace=True)
df.question_num.fillna(0,inplace=True)
df.appraise.fillna(df.appraise.mean(),inplace=True)
df.whether_answer.fillna("N",inplace=True)
#%% Data processing
#get dummies
df = df.join(pd.get_dummies(df.gender))
#labeling
ordering = [ '<18', '18~21','22~25','>25']
df['age']=df['age'].map(lambda x: ordering.index(x))
ordering2 = [ 0,1,2,3,4,5,6,7]
df['relationship_level']=df['relationship_level'].map(lambda x: ordering2.index(x))
print(df["period"])
ordering3 = [0,1.5,3,4,6,8,9,12,18,24]
df['period']=df['period'].map(lambda x: ordering3.index(x))
print(df["whether_answer"])
ordering4 = ['N', 0.0, 0.5,1.0]
df['whether_answer']=df['whether_answer'].map(lambda x: ordering4.index(x))
#%% Cut: ChiMerge binning helpers
def cal_Chi2(df):
    """Chi-square statistic of a contingency table held in a DataFrame."""
    res = []
    num_sum = sum(df.values.flatten())
    for i in range(df.shape[0]):
        for j in range(df.shape[1]):
            e = sum(df.iloc[i, :]) * sum(df.iloc[:, j]) / num_sum
            tt = (df.iloc[i, j] - e) ** 2 / e
            res.append(tt)
    return sum(res)

def line_merge(df, i, j):
    """Merge row j into row i (summing the class counts) and drop row j."""
    df.iloc[i, 1] = df.iloc[i, 1] + df.iloc[j, 1]
    df.iloc[i, 2] = df.iloc[i, 2] + df.iloc[j, 2]
    df.iloc[i, 0] = df.iloc[j, 0]
    df = pd.concat([df.iloc[:j, :], df.iloc[j+1:, :]])
    return df
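# Quick sanity check (my addition, not in the original): cal_Chi2 reproduces Pearson's
# chi-square statistic, i.e. scipy's chi2_contingency with the continuity correction disabled.
from scipy.stats import chi2_contingency as _chi2_check
_demo_table = pd.DataFrame([[10, 20], [30, 40]])
print(cal_Chi2(_demo_table), _chi2_check(_demo_table.values, correction=False)[0])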
def ChiMerge(df, variable, flag, confidenceVal=3.841, bin=10):
    """ChiMerge binning of `variable` against the binary label `flag`."""
    regroup = df.groupby([variable])[flag].agg(["size", "sum"])
    regroup.columns = ['total_num', 'positive_class']
    regroup['negative_class'] = regroup['total_num'] - regroup['positive_class']  # negative-sample count per value
    regroup = regroup.drop('total_num', axis=1).reset_index()
    col_names = regroup.columns
    print('Data loaded; running initial pre-processing')
    # Merge any interval whose positive or negative count is zero into its neighbour.
    i = 0
    while (i <= regroup.shape[0] - 2):
        if sum(regroup.iloc[[i, i+1], [1, 2]].sum() == 0) > 0:
            regroup = line_merge(regroup, i, i+1)
            i = i - 1
        i = i + 1
    # Compute the chi-square value of each pair of adjacent intervals.
    chi_ls = []  # list holding the chi-square values of adjacent intervals
    for i in np.arange(regroup.shape[0] - 1):
        chi = cal_Chi2(regroup.iloc[[i, i+1], [1, 2]])
        chi_ls.append(chi)
    print('Initial pre-processing done; running the core ChiMerge step')
    # Repeatedly merge the two adjacent intervals with the smallest chi-square value (the core of ChiMerge).
    while True:
        if (len(chi_ls) <= (bin - 1) and min(chi_ls) >= confidenceVal):
            break
        min_ind = chi_ls.index(min(chi_ls))  # index of the smallest chi-square value
        # Merge the two intervals.
        regroup = line_merge(regroup, min_ind, min_ind+1)
        if (min_ind == regroup.shape[0] - 1):  # the minimum sits at the last two intervals
            # Recompute the chi-square of the merged interval with the previous one and replace it.
            chi_ls[min_ind - 1] = cal_Chi2(regroup.iloc[[min_ind, min_ind-1], [1, 2]])
            # Delete the stale chi-square value.
            del chi_ls[min_ind]
        else:
            # Recompute the chi-square of the merged interval with the previous one and replace it.
            chi_ls[min_ind - 1] = cal_Chi2(regroup.iloc[[min_ind, min_ind-1], [1, 2]])
            # Recompute the chi-square of the merged interval with the next one and replace it.
            chi_ls[min_ind] = cal_Chi2(regroup.iloc[[min_ind, min_ind+1], [1, 2]])
            # Delete the stale chi-square value.
            del chi_ls[min_ind + 1]
    print('Core ChiMerge step done; saving the result')
    # Store the result as a DataFrame of interval labels and class counts.
    list_temp = []
    for i in np.arange(regroup.shape[0]):
        if i == 0:
            x = '-inf' + '~' + str(regroup.iloc[i, 0])
        elif i == regroup.shape[0] - 1:
            x = str(regroup.iloc[i-1, 0]) + '+'
        else:
            x = str(regroup.iloc[i-1, 0]) + '~' + str(regroup.iloc[i, 0])
        list_temp.append(x)
    regroup[variable] = list_temp  # replace the value column with interval labels
    return regroup
# Split the columns into label-encoded, categorical, and numeric groups.
label = ['age', 'relationship_level', 'period', 'whether_answer']
cate_col = list(df.select_dtypes(include=['O']).columns)
num_col = [x for x in list(df.select_dtypes(include=['int64', 'float64']).columns) if x not in label]
df['target'] = df['status'].map({"active": 0, "not active": 1})
# Run ChiMerge on every numeric column.
chi_result = dict()
for a in num_col:
    chi_merge = ChiMerge(df, a, 'target', confidenceVal=3.841, bin=6)
    chi_result[a] = chi_merge
# Print the unique values of each categorical column.
for i in cate_col:
    print(pd.unique(df[i]))
# Export the binning results and the numeric columns used for binning.
pd.concat(chi_result.values()).to_csv("Cut.csv")
df[num_col].to_excel("num_data_for_cut.xlsx")
#%% Export to Excel
import xlwt
import pandas as pd

def export_excel(export):
    # Convert the list of records into a DataFrame.
    pf = pd.DataFrame(list(export))
    # Fixed column order.
    order = ['road_name', 'bus_plate', 'timeline', 'road_type', 'site']
    pf = pf[order]
    # Replace the column names with Chinese labels.
    columns_map = num_col
    pf.rename(columns=columns_map, inplace=True)
    # Name of the Excel file to generate.
    file_path = pd.ExcelWriter('name.xlsx')
    # Replace empty cells.
    pf.fillna(' ', inplace=True)
    # Write out.
    pf.to_excel(file_path, encoding='utf-8', index=False)
    # Save the workbook.
    file_path.save()

if __name__ == '__main__':
    # Export the finished binning results to an Excel file.
    export_excel(chi_result)
#%% Decision-tree binning
# from sklearn.tree import DecisionTreeClassifier
# def decision_tree_bins(df:pd.DataFrame, x_name:str, y_name:str, max_leaf_num:int=6):
#     """Use a decision tree to find the optimal bin boundaries."""
#     boundary = []
#     x = df[x_name].values
#     y = df[y_name].values
#     clf = DecisionTreeClassifier(criterion='entropy',          # split by information entropy
#                                  max_leaf_nodes=max_leaf_num,  # maximum number of leaf nodes
#                                  min_samples_leaf=0.05)        # minimum fraction of samples per leaf
#     clf.fit(x.reshape(-1, 1), y)  # fit the decision tree
#     n_nodes = clf.tree_.node_count
#     children_left = clf.tree_.children_left
#     children_right = clf.tree_.children_right
#     threshold = clf.tree_.threshold
#     for i in range(n_nodes):
#         if children_left[i] != children_right[i]:  # internal node: keep its split threshold
#             boundary.append(threshold[i])
#     boundary.sort()
#     min_x = x.min()
#     max_x = x.max() + 0.1  # +0.1 so the later groupby includes the sample with the maximum value
#     boundary = boundary + [max_x]
#     # Bin the feature using the boundaries found above.
#     df[x_name] = pd.cut(df[x_name], bins=boundary)
#     # Inspect the binning result.
#     df = df.groupby(x_name, as_index=False)[y_name].agg(['size', 'sum'])
#     df = df.reset_index()
#     df.columns = [x_name, 'num_all', 'positive_class']
#     df['negative_class'] = df['num_all'] - df['positive_class']
#     return df[[x_name, 'positive_class', 'negative_class']]
# df['target'] = df['Attrition_Flag'].map({"Existing Customer":0,"Attrited Customer":1})
# df.drop('Attrition_Flag',axis=1,inplace=True)
# decision_tree_bins(df.copy(),'Months_on_book','target',max_leaf_num=6)
# #%%
# df.days_to_end=df.days_to_end*(-1)
# for i in ['pay_price', 'days_to_start', 'days_to_end', 'frequency']:
# group_names = ['0', '1', '2','3','4']
# print(df[i])
# print(pd.cut(df[i], bins=5, labels=group_names))
#%% active and not active comparison
active_value=df["active"].value_counts()
labels=df["active"].value_counts().index
plt.figure(figsize=(7,7))
plt.pie(active_value,labels=labels,colors=["b","w"], explode=(0.1,0),autopct='%.1f%%', shadow=True)
plt.title("active vs not active")
plt.show()
#%% Correlation between variables (using raw values)
corr_df = df.apply(lambda x: pd.factorize(x)[0])
corr_df.head()
corr=corr_df.corr()
corr.to_excel('corr.xlsx')
# heatmap
plt.figure(figsize=(35,32))
mask = np.abs(corr) <= 0.5
ax = sns.heatmap(corr,mask=mask, xticklabels=corr.columns, yticklabels=corr.columns,
linewidths=0.2, cmap="RdYlGn",annot=True)
plt.title("Correlation between variables")
plt.figure(figsize=(35,6))
corr['target'].sort_values(ascending=False).plot(kind='bar')
plt.title('Correlation between active and variables ')
a=corr['days_not_study']
a.to_excel('a.xlsx')
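# `num_data` is used from here on but never defined in this script; a minimal assumption,
# based on the `num_col` list built earlier, is that it holds the numeric feature columns.
num_data = df[num_col].copy()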
corr_df = num_data.apply(lambda x: pd.factorize(x)[0])
corr_df.head()
corr=num_data.corr()
# heatmap
plt.figure(figsize=(35,32))
mask = np.abs(corr) <= 0.5
ax = sns.heatmap(corr,mask=mask, xticklabels=corr.columns, yticklabels=corr.columns,
linewidths=0.2, cmap="RdYlGn",annot=True)
plt.title("Correlation between variables")
plt.figure(figsize=(35,6))
corr['days_not_study'].sort_values(ascending=False).plot(kind='bar')
plt.title('Correlation between active and variables ')
#%% Data normalization
from sklearn import preprocessing
df.school_type.fillna("未知", inplace=True)
num_data.fillna(0, inplace=True)
# Normalizer rescales each row (sample) to unit norm.
normalizer_data = preprocessing.Normalizer().fit_transform(num_data)
normalizer_data = pd.DataFrame(normalizer_data, columns=num_data.columns)
d = normalizer_data.describe()
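# A per-column standardization sketch (my addition, not in the original; the step above uses
# sklearn's Normalizer, which scales each row to unit norm rather than each column):
standardized_data = pd.DataFrame(preprocessing.StandardScaler().fit_transform(num_data),
                                 columns=num_data.columns)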
#sigmoid data
# def sigmoid(X,useStatus):
# if useStatus:
# return 1.0 / (1 + np.exp(-float(X)));
# else:
# return float(X)
#%% XGB model generating importance
from scipy import stats
from xgboost import XGBClassifier,XGBRegressor
X = df.drop(['target','days_not_study','status','gender','is_complete', 'job_status_name', 'professional', 'school_highest_edu_name','school_name','school_type','school_enter_time'],axis=1) #[['age', 'correct_rate', 'study_time']]
Y =df.days_not_study
X_train, X_test, Y_train, Y_test = train_test_split(X,Y, test_size=0.3, random_state=33)
model_xgb= XGBRegressor()
model_xgb.fit(X_train,Y_train)
from xgboost import plot_importance
fig, ax = plt.subplots(figsize=(15, 15))
plot_importance(model_xgb, ax=ax, height=0.5, max_num_features=20)
plt.show()
weight=model_xgb.get_booster().get_score(importance_type="weight")
weight=pd.Series(weight).sort_values(ascending=False)
weight/weight.sum()
# im=pd.DataFrame({'importance':model_xgb.feature_importances_,'var':X.columns})
# im=im.sort_values(by='importance',ascending=False)
# plot_importance(model_xgb,max_num_features=30)
#%% Chi-square test
kar_var = df[['job_status_name', 'city_name', 'professional', 'province_name',
              'school_highest_edu_name', 'school_name', 'school_type', 'gender',
              'school_highest_education', 'days', 'active']]
kar_var = num_data
from scipy.stats import chi2_contingency

def KF(x):
    """Chi-square test of days_not_study against column x; prints whether the groups differ."""
    df1 = pd.crosstab(kar_var['days_not_study'], kar_var[x])
    li1 = list(df1.iloc[0, :])
    li2 = list(df1.iloc[1, :])
    kf_data = np.array([li1, li2])
    kf = chi2_contingency(kf_data)
    if kf[1] < 0.05:
        print('The chi-square p-value of churn by {} is {:.2f}, below 0.05: the {} groups differ significantly, so cross analysis is appropriate.'.format(x, kf[1], x), '\n')
    else:
        print('The chi-square p-value of churn by {} is {:.2f}, above 0.05: the {} groups show no significant difference, so cross analysis is not appropriate.'.format(x, kf[1], x), '\n')

print('Chi-square test results for kar_var:', '\n')
print(list(map(KF, kar_var)))
#plot
#%% Churn prediction
from xgboost import XGBClassifier
from sklearn.model_selection import GridSearchCV
from sklearn import metrics
X = num_data[['days_to_end',
'days_to_start',
'video_num',
'material_num',
'paper_num',
'pay_price',
'study_cnts',
'frequency',
'correct_rate',
'prev_frequency',
'month_frequency',
'growth_frequency']]
Y =num_data.days_not_study
X_train, X_test, Y_train, Y_test = train_test_split(X,Y, test_size=0.3, random_state=33)
# Hyper-parameter grid for the XGBoost classifier.
params = {'learning_rate': np.linspace(0.01, 0.10, 5),
          'max_depth': list(range(1, 10, 1)),
          'n_estimators': list(range(200, 500, 100))}
clf = XGBClassifier()
grid = GridSearchCV(clf, params, cv=3, scoring="accuracy")
grid.fit(X_train,Y_train)
grid.best_score_
grid.best_params_
grid.best_estimator_
best_model=grid.best_estimator_
predict_y=best_model.predict(X_test)
metrics.f1_score(Y_test, predict_y)
metrics.accuracy_score(Y_test, predict_y)
predict_proba_y = best_model.predict_proba(X_test)
# Save the test labels together with the predicted probability of the positive class
# (the raw 2-D probability array cannot be placed in a single DataFrame column).
aa = {'Y_test': Y_test, 'predict_proba': predict_proba_y[:, 1]}
bb = pd.DataFrame(aa)
bb.to_excel('predict.xlsx')
# Prediction confidence: probability of the predicted class for each sample.
pred_rates = []
for i in predict_proba_y[:, 0]:
    if i >= 0.5:
        pred_rates.append(i)
    else:
        pred_rates.append(1 - i)
pred_rates = pd.DataFrame(pred_rates)
pred_rates.to_excel('predict rate.xlsx')
weigh2=best_model.get_booster().get_score(importance_type="weight")
weigh2=pd.Series(weigh2).sort_values(ascending=False)
weigh2/weigh2.sum()
#%%
X=num_data[['15d_frequency',
'growth_frequency',
'30d_video',
'prev_video',
'month_live',
'month_video',
'video_num',
'pay_price',
'prev_correct',
'days_to_end',
'paper_num',
'correct_rate',
'study_cnts',
'month_correct',
'month_material',
'prev_live',
'prev_material',
'month_paper',
'days_to_start',
'30d_correct',
'appraise',
'material_num',
'live_num',
'prev_question']]
# X=num_data.drop(['target','days_not_study'],axis=1)
Y =num_data.days_not_study
X_train, X_test, Y_train, Y_test = train_test_split(X,Y, test_size=0.3, random_state=33)
# Hyper-parameter grid for the XGBoost classifier.
params = {'learning_rate': np.linspace(0.01, 0.10, 5),
          'max_depth': list(range(1, 10, 2)),
          'n_estimators': list(range(100, 300, 20))}
clf = XGBClassifier()
grid = GridSearchCV(clf, params, cv=3, scoring="accuracy")
grid.fit(X_train,Y_train)
grid.best_score_
grid.best_params_
grid.best_estimator_
best_model=grid.best_estimator_
predict_y=best_model.predict(X_test)
metrics.f1_score(Y_test, predict_y)
metrics.accuracy_score(Y_test, predict_y)
predict_proba_y=best_model.predict_proba(X_test)
# Prediction confidence: probability of the predicted class for each sample.
pred_rates2 = []
for i in predict_proba_y[:, 0]:
    if i >= 0.5:
        pred_rates2.append(i)
    else:
        pred_rates2.append(1 - i)
pred_rates2 = pd.DataFrame(pred_rates2)
pred_rates2.to_excel('predict rate2.xlsx')
weigh2=best_model.get_booster().get_score(importance_type="weight")
weigh2=pd.Series(weigh2).sort_values(ascending=False)
weigh2/weigh2.sum()
aa={'Y_test':Y_test,'predict_y':predict_y}
bb=pd.DataFrame(aa)
bb.to_excel('predict2.xlsx')