XGB

# -*- coding: utf-8 -*-

"""

Created on Wed Oct 13 17:05:46 2021

@author: Gaodun

"""

import pandas as pd

import numpy as np

import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split

from xgboost.sklearn import XGBRegressor,XGBClassifier

from collections import Counter

from xgboost import XGBClassifier

from sklearn.model_selection import GridSearchCV

from sklearn import metrics

from odps import ODPS

from odps.df import DataFrame

# Establish the ODPS connection
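# The ODPS client `o` used below is not created anywhere above. A minimal sketch,
# assuming placeholder credentials (replace with real values for your project):
o = ODPS(
    '<access-id>',          # placeholder access id
    '<access-key>',         # placeholder secret access key
    project='<project>',    # placeholder project name
    endpoint='<endpoint>',  # placeholder MaxCompute endpoint
)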


# Read the data.

sql = '''

SELECT 

    a.*,b.target as target2

FROM (select * from

op_bi_study_time_701new) a

left join (select target,student_id from op_bi_stu_summary_816new) b on a.student_id1=b.student_id

;

'''

query_job = o.execute_sql(sql)

result = query_job.open_reader(tunnel=True)

df = result.to_pandas(n_process=1)

import datetime

def get_yesterday():

    yesterday = datetime.date.today() - datetime.timedelta(days=1)

    return yesterday.strftime('%Y%m%d')

a= get_yesterday()

sql2 = '''

SELECT 

    *

FROM


        op_ads_usr_std_final_info

        where ds =%s and target=0

;

''' %a

query_job2 = o.execute_sql(sql2)

result2 = query_job2.open_reader(tunnel=True)

df2 = result2.to_pandas(n_process=1)
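# Optional check (an addition, not in the original flow): inspect missingness before
# filling, to see which columns the fill block below needs to cover.
print(df.shape)
print(df.isnull().sum().sort_values(ascending=False).head(20))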

# Fill missing values: count/volume features default to 0, appraise to its mean.
zero_fill_cols = ['15d_commu_cnts', '15d_commu_time', '15d_complain_cnts',
                  'month_commu_cnts', 'month_commu_time', 'month_complain_cnts',
                  'prev_commu_cnts', 'prev_commu_time', 'prev_complain_cnts',
                  'live_num', 'video_num', 'material_num', 'paper_num',
                  'pass_cdt', 'pass_rate']

df[zero_fill_cols] = df[zero_fill_cols].fillna(0)

df['appraise'].fillna(df['appraise'].mean(), inplace=True)

gbr = df.groupby("target")

gbr.groups

typicalFracDict = {

    0: 0.5,

    1: 0.5

}

def typicalSampling(group, typicalFracDict):

    name = group.name

    frac = typicalFracDict[name]

    return group.sample(frac=frac)

df_stratified = df.groupby(

    'target', group_keys=False).apply(typicalSampling, typicalFracDict)
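# Optional sanity check (an addition): the 50%/50% fractions above keep the original
# class balance; Counter (imported earlier) makes that easy to verify.
print(Counter(df['target']))
print(Counter(df_stratified['target']))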

X=df_stratified[[

'15d_frequency',

'growth_frequency',

'30d_video',

'prev_video',

'month_live',

'month_video',

'video_num',

'pay_price',

'prev_correct',

'days_to_end',

'paper_num',

'correct_rate',

'study_cnts',

'month_correct',

'month_material',

'prev_live',

'prev_material',

'month_paper',

'days_to_start',

'appraise',

'material_num',

'live_num']]

Y=df_stratified.target2

X_train, X_test, Y_train, Y_test = train_test_split(X,Y, test_size=0.3, random_state=33)

params = {'learning_rate': np.linspace(0.01, 0.10, 5),

          'max_depth': list(range(1, 10, 2)),

          'n_estimators': list(range(100, 300, 100))}

clf = XGBClassifier()

grid = GridSearchCV(clf, params, cv=3, scoring="accuracy")

grid.fit(X_train,Y_train)

print(grid.best_score_)

print(grid.best_params_)

print(grid.best_estimator_)

best_model=grid.best_estimator_

predict_y=best_model.predict(X_test)

print(metrics.f1_score(Y_test, predict_y))

print(metrics.accuracy_score(Y_test, predict_y))

import joblib

joblib.dump(best_model, 'model2.pkl')
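# df2 (current-day students with target=0, pulled above) is not used later in this
# script; presumably it is the population to score with the saved model. A minimal
# sketch, assuming df2 contains the same feature columns as X:
loaded_model = joblib.load('model2.pkl')
scores = loaded_model.predict_proba(df2[X.columns])[:, 1]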



#%% ---------------- Second script ----------------

import pandas as pd

import numpy as np

import matplotlib.pyplot as plt

import seaborn as sns

from sklearn.model_selection import train_test_split

from xgboost.sklearn import XGBRegressor,XGBClassifier

from collections import Counter

#data load

df=pd.read_excel('data.xlsx')

df.to_excel('data1.xlsx')

df.note_num=df.note_num.fillna(0)

df.info()

df=df.drop(["student_id","company_name",

"company_department",

"company_nature",

"province_name",

"city_name",

"growth_correct",

"growth_note",

"growth_matrial",

"growth_paper",

"growth_question",

"growth_live",

"growth_video",

"15d_correct",

"grow7d_frequency",

"grow7d_correct",

"grow7d_video",

"grow7d_note",

"grow15d_frequency",

"grow15d_correct",

"grow15d_video",

"grow15d_note"

],axis=1)

df["job_status_name"].fillna("未知",inplace=True)

df["school_highest_edu_name"].fillna("未知",inplace=True)

df.school_type.fillna("未知",inplace=True)

df.school_name.fillna("未知",inplace=True)

df.professional.fillna("未知",inplace=True)

df.school_enter_time.fillna("未知",inplace=True)

df.growth_frequency.fillna(0,inplace=True)

df.note_num.fillna(0,inplace=True)

df.question_num.fillna(0,inplace=True)

df.appraise.fillna(df.appraise.mean(),inplace=True)

df.whether_answer.fillna("N",inplace=True)

#%% Data processing

#get dummies

df = df.join(pd.get_dummies(df.gender))

#labeling

ordering = [ '<18', '18~21','22~25','>25']

df['age']=df['age'].map(lambda x: ordering.index(x))

ordering2 = [ 0,1,2,3,4,5,6,7]

df['relationship_level']=df['relationship_level'].map(lambda x: ordering2.index(x))

print(df["period"])

ordering3 = [0,1.5,3,4,6,8,9,12,18,24]

df['period']=df['period'].map(lambda x: ordering3.index(x))

print(df["whether_answer"])

ordering4 = ['N', 0.0, 0.5,1.0]

df['whether_answer']=df['whether_answer'].map(lambda x: ordering4.index(x))

#%%cut

def cal_Chi2(df):

    res = [] 

    num_sum = sum(df.values.flatten()) 

    for i in range(df.shape[0]):

        for j in range(df.shape[1]):

            e = sum(df.iloc[i,:])*sum(df.iloc[:,j])/num_sum

            tt = (df.iloc[i,j]-e)**2/e

            res.append(tt)

    return sum(res)
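# Quick check (an addition, not part of the original flow): cal_Chi2 should match the
# uncorrected Pearson chi-square statistic from scipy on a small contingency table.
from scipy.stats import chi2_contingency
_toy = pd.DataFrame([[10, 20], [30, 40]])
print(cal_Chi2(_toy))
print(chi2_contingency(_toy.values, correction=False)[0])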



def line_merge(df,i,j):

    df.iloc[i, 1] = df.iloc[i, 1] + df.iloc[j, 1]

    df.iloc[i, 2] = df.iloc[i, 2] + df.iloc[j, 2]

    df.iloc[i, 0] = df.iloc[j, 0]

    df = pd.concat([df.iloc[:j,:],df.iloc[j+1:,:]])

    return df 

def ChiMerge(df, variable, flag, confidenceVal=3.841, bin=10): 

    regroup = df.groupby([variable])[flag].agg(["size","sum"])

    regroup.columns = ['total_num','positive_class']

    regroup['negative_class'] = regroup['total_num'] - regroup['positive_class']  # negative-sample count for each value of the variable being binned

    regroup = regroup.drop('total_num', axis=1).reset_index()

    col_names = regroup.columns 

    print('Data loaded; running initial preprocessing')

    i = 0

    while (i <= regroup.shape[0] - 2):

        if sum(regroup.iloc[[i,i+1],[1,2]].sum()==0) >0 :   

            regroup = line_merge(regroup,i,i+1)           

            i = i - 1

        i = i + 1


    # Compute the chi-square value of each pair of adjacent intervals

    chi_ls = []  # list holding the chi-square value of each adjacent interval pair

    for i in np.arange(regroup.shape[0] - 1):

        chi = cal_Chi2(regroup.iloc[[i,i+1],[1,2]])       

        chi_ls.append(chi) 


    print('Initial preprocessing done; running the core chi-merge step')

    # Merge the two intervals with the smallest chi-square value (the core of chi-merge binning)

    while True:

        if (len(chi_ls) <= (bin - 1) and min(chi_ls) >= confidenceVal):

            break 

        min_ind = chi_ls.index(min(chi_ls))  # index of the smallest chi-square value

#      merge the two intervals

        regroup = line_merge(regroup,min_ind,min_ind+1) 


        if (min_ind == regroup.shape[0] - 1):  # the minimum is between the last two intervals

            # recompute the chi-square value of the merged interval and its predecessor, replacing the old one

            chi_ls[min_ind - 1] = cal_Chi2(regroup.iloc[[min_ind,min_ind-1],[1,2]]) 

            # drop the stale chi-square value

            del chi_ls[min_ind]           

        else:

            # recompute the chi-square value of the merged interval and its predecessor, replacing the old one

            chi_ls[min_ind - 1] = cal_Chi2(regroup.iloc[[min_ind,min_ind-1],[1,2]])

            # recompute the chi-square value of the merged interval and its successor, replacing the old one

            chi_ls[min_ind] = cal_Chi2(regroup.iloc[[min_ind,min_ind+1],[1,2]])

            # drop the stale chi-square value

            del chi_ls[min_ind+ 1] 


    print('Chi-merge core step finished; saving results')

    # Save the result into a data frame


    list_temp = []

    for i in np.arange(regroup.shape[0]):

        if i == 0:

            x = '-inf'+'~'+ str(regroup.iloc[i,0])

        elif i == regroup.shape[0] - 1:

            x = str(regroup.iloc[i-1,0])+'+' 

        else:

            x =str(regroup.iloc[i-1,0])+ '~'+str(regroup.iloc[i,0])

        list_temp.append(x)

    regroup[variable] = list_temp  # second column of the result table: the interval label

    return regroup

# Split the columns into categorical and numeric groups

label=['age','relationship_level','period','whether_answer']

cate_col = list(df.select_dtypes(include=['O']).columns)

num_col = [x for x in list(df.select_dtypes(include=['int64', 'float64']).columns) if x not in label]

df['target'] = df['status'].map({"active":0,"not active":1})

# Apply the ChiMerge function

chi_result=dict()

for a in num_col:

    chi_merge = ChiMerge(df, a,'target', confidenceVal=3.841, bin=6)

    chi_result[a]=chi_merge

for i in cate_col:

    print(pd.unique(df[i]))


# Export the per-variable chi-merge results and the numeric data used for cutting.
pd.concat(chi_result.values()).to_csv("Cut.csv")

df[num_col].to_excel("num_data_for_cut.xlsx")

#%%

import xlwt

import pandas as pd

def export_excel(export):

  # Convert the list of dicts into a DataFrame
  pf = pd.DataFrame(list(export))

  # Specify the column order
  order = ['road_name','bus_plate','timeline','road_type','site']
  pf = pf[order]

  # Replace the column names with Chinese ones (rename expects an {old: new} mapping)
  columns_map = num_col
  pf.rename(columns=columns_map, inplace=True)

  # Name of the output Excel file
  file_path = pd.ExcelWriter('name.xlsx')

  # Replace empty cells
  pf.fillna(' ', inplace=True)

  # Write out
  pf.to_excel(file_path, index=False)

  # Save the workbook
  file_path.save()

if __name__ == '__main__':

    # Export the finished analysis to an Excel file
    export_excel(chi_result)

#%% Decision-tree binning

# from sklearn.tree import DecisionTreeClassifier 

# def decision_tree_bins(df:pd.DataFrame,x_name:str,y_name:str,max_leaf_num:int=6):

#    """利用决策树获得最优分箱的边界值""" 

#    boundary = [] 

#    x = df[x_name].values 

#    y = df[y_name].values

#    clf = DecisionTreeClassifier(criterion='entropy',  # split by minimizing entropy

#                                  max_leaf_nodes=max_leaf_num,  # maximum number of leaf nodes 

#                                  min_samples_leaf = 0.05)  # minimum fraction of samples per leaf

#    clf.fit(x.reshape(-1,1),y)  # fit the decision tree


#    n_nodes = clf.tree_.node_count 

#    children_left = clf.tree_.children_left 

#    children_right = clf.tree_.children_right 

#    threshold = clf.tree_.threshold 


#    for i in range(n_nodes):

#        if children_left[i] != children_right[i] : # collect split thresholds at the tree's internal (non-leaf) nodes

#            boundary.append(threshold[i])

#    boundary.sort()

#    min_x = x.min() 

#    max_x = x.max() + 0.1 # add 0.1 so the later cut/groupby includes samples at the feature's maximum value

#    boundary = boundary +[max_x] 


#    # Bin the data according to the obtained boundary values

#    df[x_name] = pd.cut(df[x_name],bins=boundary)   


#    # Inspect the binning result

#    df = df.groupby(x_name,as_index=False)[y_name].agg(['size','sum'])

#    df = df.reset_index()

#    df.columns = [x_name,'num_all','positive_class']

#    df['negative_class'] = df['num_all'] - df['positive_class']


#    return df[[x_name,'positive_class','negative_class']]   

# df['target'] = df['Attrition_Flag'].map({"Existing Customer":0,"Attrited Customer":1})

# df.drop('Attrition_Flag',axis=1,inplace=True) 

# decision_tree_bins(df.copy(),'Months_on_book','target',max_leaf_num=6)

# #%%

# df.days_to_end=df.days_to_end*(-1)

# for i in ['pay_price', 'days_to_start', 'days_to_end', 'frequency']:

#    group_names = ['0', '1', '2','3','4']

#    print(df[i])

#    print(pd.cut(df[i], bins=5, labels=group_names))

#%% active and not active comparison

active_value=df["active"].value_counts()

labels=df["active"].value_counts().index

plt.figure(figsize=(7,7))

plt.pie(active_value,labels=labels,colors=["b","w"], explode=(0.1,0),autopct='%.1f%%', shadow=True)

plt.title("active vs not active")

plt.show() 

#%% Correlation between variables (using raw values)

corr_df = df.apply(lambda x: pd.factorize(x)[0])

corr_df.head()

corr=corr_df.corr()

corr.to_excel('corr.xlsx')

# heatmap

plt.figure(figsize=(35,32))

mask = np.abs(corr) <= 0.5

ax = sns.heatmap(corr,mask=mask, xticklabels=corr.columns, yticklabels=corr.columns,

                linewidths=0.2, cmap="RdYlGn",annot=True)

plt.title("Correlation between variables")

plt.figure(figsize=(35,6))

corr['target'].sort_values(ascending=False).plot(kind='bar')

plt.title('Correlation between active and variables ')

a=corr['days_not_study']

a.to_excel('a.xlsx')
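# num_data is used below but never created above; a reasonable assumption is the
# numeric feature subset selected earlier:
num_data = df[num_col].copy()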

corr_df = num_data.apply(lambda x: pd.factorize(x)[0])

corr_df.head()

corr=num_data.corr()

# heatmap

plt.figure(figsize=(35,32))

mask = np.abs(corr) <= 0.5

ax = sns.heatmap(corr,mask=mask, xticklabels=corr.columns, yticklabels=corr.columns,

                linewidths=0.2, cmap="RdYlGn",annot=True)

plt.title("Correlation between variables")

plt.figure(figsize=(35,6))

corr['days_not_study'].sort_values(ascending=False).plot(kind='bar')

plt.title('Correlation between active and variables ')

#%% Data normalization

from sklearn import preprocessing

df.school_type.fillna("未知",inplace=True)

num_data.fillna(0,inplace=True)

normalizer_data = preprocessing.Normalizer().fit_transform(num_data)

normalizer_data = pd.DataFrame(normalizer_data, columns=num_data.columns)

d=normalizer_data.describe()
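# Note: preprocessing.Normalizer rescales each ROW to unit norm. If per-feature scaling
# is wanted instead (an assumption about intent, not what the code above does), a
# MinMaxScaler sketch would look like this:
from sklearn.preprocessing import MinMaxScaler
minmax_data = pd.DataFrame(MinMaxScaler().fit_transform(num_data), columns=num_data.columns)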

#sigmoid data

# def sigmoid(X,useStatus):

#    if useStatus:

#        return 1.0 / (1 + np.exp(-float(X)));

#    else:

#        return float(X)


#%% XGB model generating importance

from scipy import stats

from xgboost import XGBClassifier,XGBRegressor

X = df.drop(['target','days_not_study','status','gender','is_complete', 'job_status_name', 'professional', 'school_highest_edu_name','school_name','school_type','school_enter_time'],axis=1) #[['age', 'correct_rate', 'study_time']]

Y =df.days_not_study

X_train, X_test, Y_train, Y_test = train_test_split(X,Y, test_size=0.3, random_state=33)

model_xgb= XGBRegressor()

model_xgb.fit(X_train,Y_train)

from xgboost import plot_importance

fig,ax = plt.subplots(figsize=(15,15))

plot_importance(model_xgb,height=0.5,max_num_features=20)

plt.show()

weight=model_xgb.get_booster().get_score(importance_type="weight")

weight=pd.Series(weight).sort_values(ascending=False)

print(weight / weight.sum())

# im=pd.DataFrame({'importance':model_xgb.feature_importances_,'var':X.columns})

# im=im.sort_values(by='importance',ascending=False)

# plot_importance(model_xgb,max_num_features=30)

#%% chi square test

kar_var=df[['job_status_name','city_name','professional','province_name',

          'school_highest_edu_name', 'school_name', 'school_type','gender'

          ,'school_highest_education','days','active']]

kar_var=num_data

from scipy.stats import chi2_contingency 

def KF(x):

    df1=pd.crosstab(kar_var['days_not_study'],kar_var[x])

    li1=list(df1.iloc[0,:])

    li2=list(df1.iloc[1,:])

    kf_data=np.array([li1,li2])

    kf=chi2_contingency(kf_data)

    if kf[1]<0.05:

        print('Churn by {}: chi-square p-value is {:.2f}, below 0.05, so the groups of {} differ significantly and cross analysis can be applied'.format(x,kf[1],x),'\n')

    else:

        print('Churn by {}: chi-square p-value is {:.2f}, above 0.05, so the groups of {} show no significant difference and cross analysis is not applicable'.format(x,kf[1],x),'\n')

print('Chi-square test results for kar_var:','\n')

print(list(map(KF, kar_var)))

#plot

#%% Churn prediction

from xgboost import XGBClassifier

from sklearn.model_selection import GridSearchCV

from sklearn import metrics

X = num_data[['days_to_end',

'days_to_start',

'video_num',

'material_num',

'paper_num',

'pay_price',

'study_cnts',

'frequency',

'correct_rate',

'prev_frequency',

'month_frequency',

'growth_frequency']]

Y =num_data.days_not_study

X_train, X_test, Y_train, Y_test = train_test_split(X,Y, test_size=0.3, random_state=33)

params = {'learning_rate': np.linspace(0.01, 0.10, 5),

          'max_depth': list(range(1, 10, 1)),

          'n_estimators': list(range(200, 500, 100))}

clf = XGBClassifier()

grid = GridSearchCV(clf, params, cv=3, scoring="accuracy")

grid.fit(X_train,Y_train)

print(grid.best_score_)

print(grid.best_params_)

print(grid.best_estimator_)

best_model=grid.best_estimator_

predict_y=best_model.predict(X_test)

print(metrics.f1_score(Y_test, predict_y))

print(metrics.accuracy_score(Y_test, predict_y))

predict_proba_y=best_model.predict_proba(X_test)

aa = {'Y_test': Y_test.values, 'predict_proba': predict_proba_y[:, 1]}

bb = pd.DataFrame(aa)

bb.to_excel('predict.xlsx')

#prediction rate

# Confidence of the predicted class: max(p, 1 - p) of the class-0 probability.
pred_rate = pd.DataFrame([p if p >= 0.5 else 1 - p for p in predict_proba_y[:, 0]])

pred_rate.to_excel('predict rate.xlsx')

weigh2=best_model.get_booster().get_score(importance_type="weight")

weigh2=pd.Series(weigh2).sort_values(ascending=False)

print(weigh2 / weigh2.sum())

#%%

X=num_data[['15d_frequency',

'growth_frequency',

'30d_video',

'prev_video',

'month_live',

'month_video',

'video_num',

'pay_price',

'prev_correct',

'days_to_end',

'paper_num',

'correct_rate',

'study_cnts',

'month_correct',

'month_material',

'prev_live',

'prev_material',

'month_paper',

'days_to_start',

'30d_correct',

'appraise',

'material_num',

'live_num',

'prev_question']]

# X=num_data.drop(['target','days_not_study'],axis=1)

Y =num_data.days_not_study

X_train, X_test, Y_train, Y_test = train_test_split(X,Y, test_size=0.3, random_state=33)

params = {'learning_rate': np.linspace(0.01, 0.10, 5),

          'max_depth': list(range(1, 10, 2)),

          'n_estimators': list(range(100, 300, 20))}

clf = XGBClassifier()

grid = GridSearchCV(clf, params, cv=3, scoring="accuracy")

grid.fit(X_train,Y_train)

print(grid.best_score_)

print(grid.best_params_)

print(grid.best_estimator_)

best_model=grid.best_estimator_

predict_y=best_model.predict(X_test)

print(metrics.f1_score(Y_test, predict_y))

print(metrics.accuracy_score(Y_test, predict_y))

predict_proba_y=best_model.predict_proba(X_test)

#prediction rate

# Confidence of the predicted class, as above.
pred_rate2 = pd.DataFrame([p if p >= 0.5 else 1 - p for p in predict_proba_y[:, 0]])

pred_rate2.to_excel('predict rate2.xlsx')

weigh2=best_model.get_booster().get_score(importance_type="weight")

weigh2=pd.Series(weigh2).sort_values(ascending=False)

print(weigh2 / weigh2.sum())

aa={'Y_test':Y_test,'predict_y':predict_y}

bb=pd.DataFrame(aa)

bb.to_excel('predict2.xlsx')
