# 评分卡模型-(一特征构建) — Scorecard model, part 1: feature construction

# -*- coding: utf-8 -*-
"""
Created on Sun Sep 16 09:24:18 2018

@author: wangxihe
"""

import os
import pandas as pd
import datetime
import matplotlib.pyplot as plt
import collections
import numpy as np
# Working directory holding the PPD (PaiPaiDai) training CSVs; feature files
# produced below are written here too.  NOTE(review): machine-specific path.
os.chdir(r'E:\spyderwork\评分卡模型\一特征构建')

plt.rcParams['font.sans-serif']=['SimHei']  # render Chinese labels correctly
plt.rcParams['axes.unicode_minus']=False  # render minus signs correctly with a CJK font
#%% Load data
# Master: one row per loan application; Login/Update: event logs keyed by 'Idx'.
MasterData=pd.read_csv('PPD_Training_Master_GBK_3_1_Training_Set.csv',encoding='gbk')
LoginData=pd.read_csv('PPD_LogInfo_3_1_Training_Set.csv',encoding='gbk')
UpdateData=pd.read_csv('PPD_Userupdate_Info_3_1_Training_Set.csv',encoding='gbk')
#%% Parse date columns (dash-separated format in the login log)
#LoginData['Listinginfo1']=pd.to_datetime(LoginData['Listinginfo1'])
LoginData['Listinginfo1']=LoginData['Listinginfo1'].apply(lambda x :datetime.datetime.strptime(x,'%Y-%m-%d'))
LoginData['LogInfo3']=LoginData['LogInfo3'].apply(lambda x :datetime.datetime.strptime(x,'%Y-%m-%d'))

# Days between the listing date and each login event (login recency)
LoginData['LogDay']=LoginData['Listinginfo1']-LoginData['LogInfo3']
#LoginData['LogDay']=LoginData[['Listinginfo1','LogInfo3']].apply(lambda x:x[0]-x[1])
LoginData['LogDay']=LoginData['LogDay'].dt.days
#%%
# Distribution of login recency (pandas histogram)
LoginData['LogDay'].plot(kind='hist',bins=200)

#%% Distribution of login recency again, with finer bins (matplotlib directly)
plt.hist(LoginData['LogDay'],bins=300)
plt.title('登录天数分布')
#%% Frequencies of the login operation/channel codes

LoginData['LogInfo2'].value_counts()
LoginData['LogInfo2'].value_counts().sort_values().plot(kind='barh')
#%%
def MyDiv(x,y):
    """Safely divide x by y, returning a float ratio.

    Used for the per-window ``total/unique`` rate features, where the
    denominator is 0 when a borrower has no events in the window.

    Parameters
    ----------
    x : numeric numerator (e.g. total event count in a window).
    y : numeric denominator (e.g. distinct event count in a window).

    Returns
    -------
    ``x / y`` as a float, or 0 when ``y`` is None or zero.
    """
    # `is None` rather than `== None` (PEP 8); the two "missing denominator"
    # cases collapse into a single guard.
    if y is None or y == 0:
        return 0
    return x * 1.0 / y
   
#%%
# Candidate look-back windows (days) and the two login code columns to aggregate.
Tw=[7, 30, 60, 90, 120, 150, 180]
cols=['LogInfo1','LogInfo2']

# One row per distinct borrower Idx; window features are appended column by column.
LoginIdxDf=pd.DataFrame({'Idx':LoginData['Idx'].drop_duplicates()})

for day in Tw:
    # Window start = `day` days before the listing date; keep logins inside it.
    LoginData['LogTime']=LoginData['Listinginfo1']-datetime.timedelta(days=day)
    TempDf=LoginData[LoginData['LogInfo3']>=LoginData['LogTime']]
    for var in cols:
        # Total number of login records in the window (0 for absent borrowers)
        TempGroupDict=TempDf.groupby('Idx')[var].count().to_dict()
        LoginIdxDf[str(var)+'_'+str(day)+'_totalnum']=LoginIdxDf['Idx'].apply(lambda x:TempGroupDict.get(x,0))
        # Number of distinct codes used in the window
        UnionTempDf=TempDf[['Idx',var]].drop_duplicates()
        UnionTempDict=UnionTempDf.groupby('Idx')[var].count().to_dict()
        LoginIdxDf[str(var) + '_' + str(day) + '_unique']=LoginIdxDf['Idx'].apply(lambda x:UnionTempDict.get(x,0))
        # Average uses per distinct code (total / unique); MyDiv yields 0 when
        # the borrower has no logins in the window.
        LoginIdxDf[str(var) + '_' + str(day) + '_rate']=LoginIdxDf[[str(var)+'_'+str(day)+'_totalnum',str(var) + '_' + str(day) + '_unique']].apply(lambda x:MyDiv(x[0],x[1]),axis=1)

LoginIdxDf.to_csv('Log.csv')
#%% Parse update-log date columns (slash-separated here, unlike the login log)
#UpdateData['ListingInfo1']=pd.to_datetime(UpdateData['ListingInfo1'])
#UpdateData['UserupdateInfo2']=pd.to_datetime(UpdateData['UserupdateInfo2'])
UpdateData['ListingInfo1']=UpdateData['ListingInfo1'].apply(lambda x :datetime.datetime.strptime(x,'%Y/%m/%d'))
UpdateData['UserupdateInfo2']=UpdateData['UserupdateInfo2'].apply(lambda x :datetime.datetime.strptime(x,'%Y/%m/%d'))
#%% Frequency of each updated-field name
UpdateData['UserupdateInfo1'].value_counts().sort_values(ascending=False)
len(UpdateData['UserupdateInfo1'].value_counts())
# NOTE(review): variable is named Top10 but takes head(20) — confirm intended cut-off
updateTop10=UpdateData['UserupdateInfo1'].value_counts().sort_values(ascending=False).head(20).copy()
updateTop10.sort_values().plot(kind='barh')
#%% Upper-case the field names to merge case-only duplicates
UpdateData['UserupdateInfo1']=UpdateData['UserupdateInfo1'].apply(lambda x:x.upper())
len(UpdateData['UserupdateInfo1'].value_counts())

def updateNumber(x):
    """Return the canonical name for an upper-cased update-field label.

    '_MOBILEPHONE' is an alias of '_PHONE'; every other label passes
    through unchanged.
    """
    return '_PHONE' if x == '_MOBILEPHONE' else x
   
# Apply the alias mapping so '_MOBILEPHONE' and '_PHONE' count as one field
UpdateData['UserupdateInfo1']=UpdateData['UserupdateInfo1'].apply(lambda x:updateNumber(x))
#%%
# One row per distinct borrower Idx; update-behaviour features appended per window.
UpdateIdxDf=pd.DataFrame({'Idx':UpdateData['Idx'].drop_duplicates()})

for day in Tw:
    # Keep update events within `day` days before the listing date
    UpdateData['LogTime']=UpdateData['ListingInfo1']-datetime.timedelta(days=day)
    TempDf=UpdateData[UpdateData['UserupdateInfo2']>=UpdateData['LogTime']]
    # Total number of update events in the window
    TempGroupDict=TempDf.groupby('Idx')['UserupdateInfo1'].count().to_dict()
    UpdateIdxDf['Update_'+str(day)+'_freq']=UpdateIdxDf['Idx'].apply(lambda x:TempGroupDict.get(x,0))
    # Number of distinct fields updated in the window
    UnionTempDf=TempDf[['Idx','UserupdateInfo1']].drop_duplicates()
    UnionTempDict=UnionTempDf.groupby('Idx')['UserupdateInfo1'].count().to_dict()
    UpdateIdxDf['Update_' + str(day) + '_unique']=UpdateIdxDf['Idx'].apply(lambda x:UnionTempDict.get(x,0))
    # Average updates per distinct field; 0 when no updates in the window
    UpdateIdxDf['Update_' + str(day) + '_rate']=UpdateIdxDf[['Update_'+str(day)+'_freq','Update_' + str(day) + '_unique']].apply(lambda x:MyDiv(x[0],x[1]),axis=1)
    # Indicator features: did the borrower edit these important fields in the window?

    # Summing strings concatenates the distinct field names per Idx.
    # NOTE(review): despite the name this is a Series, not a dict.
    TempsumDict=UnionTempDf.groupby('Idx')['UserupdateInfo1'].sum()
    for item in ['_IDNUMBER','_HASBUYCAR','_MARRIAGESTATUSID','_PHONE']:
       # Substring test; '_PHONE' cannot false-match '_MOBILEPHONE' because the
       # alias mapping above already renamed it.
       item_dict = TempsumDict.map(lambda x: int(item in x)).to_dict()
       UpdateIdxDf['UserupdateInfo_' + str(day) + str(item)]=UpdateIdxDf['Idx'].apply(lambda x:item_dict.get(x,0))

UpdateIdxDf.to_csv('update.csv')
#%% Flag whether the four location fields all agree (1 = consistent)
MasterData['city_match'] = MasterData.apply(lambda x: int(x.UserInfo_2 == x.UserInfo_4 == x.UserInfo_8 == x.UserInfo_20),axis = 1)
# The raw city columns are replaced by the single consistency flag
del MasterData['UserInfo_2']
del MasterData['UserInfo_4']
del MasterData['UserInfo_8']
del MasterData['UserInfo_20']

MasterData.to_csv('master.csv',encoding = 'gbk')
#%%
# Join master features with the login/update window features on Idx (index
# alignment; borrowers absent from a log end up with NaN in its columns).
allData_0=pd.concat([MasterData.set_index('Idx'),UpdateIdxDf.set_index('Idx'),LoginIdxDf.set_index('Idx')],axis=1)
allData_0.to_csv('Idx0.csv',encoding='gbk')


#%%
# Gap in days between listing date and each login ("MinueDays" = minus-days).
# NOTE(review): row-wise apply; LoginData['LogDay'] above already holds the
# same quantity computed vectorized — confirm the duplication is intentional.
LoginData['MinueDays']=LoginData[['Listinginfo1','LogInfo3']].apply(lambda x:(x[0]-x[1]).days,axis=1)

def TimeWindowSelection(df,col,tw):
    """Count how many rows of *df* fall inside each candidate time window.

    Parameters
    ----------
    df : DataFrame with one row per event.
    col : column holding the event's age in days.
    tw : iterable of candidate window lengths (days).

    Returns
    -------
    dict mapping each window length to the number of rows where
    ``df[col] <= window`` (cumulative counts, since the windows nest).
    """
    return {window: len(df[df[col] <= window]) for window in tw}
# Cumulative login counts per candidate window; a window is "good" where the
# marginal gain of widening it flattens out.
tw_dict=TimeWindowSelection(LoginData,'MinueDays',[7,15,30,60,90,120,150,180])
tw_df=pd.DataFrame.from_dict(tw_dict,orient ='index')

tw_df.plot(kind='bar')

#%%
# Same window diagnosis for the update log: gap in days between each update
# event and the listing date (ListingInfo1 - UserupdateInfo2).
UpdateData['MinueDays']=UpdateData[['UserupdateInfo2','ListingInfo1']].apply(lambda x:(x[1]-x[0]).days,axis=1)
t=collections.Counter(UpdateData['MinueDays'])
hist_ListingGap = np.histogram(UpdateData['MinueDays'])
# Bin counts, each bin labelled by its right edge
hist_ListingGap = pd.DataFrame({'Freq':hist_ListingGap[0],'gap':hist_ListingGap[1][1:]})
hist_ListingGap['CumFreq'] = hist_ListingGap['Freq'].cumsum()
# Cumulative share of update events covered up to each gap
hist_ListingGap['CumPercent'] = hist_ListingGap['CumFreq'].map(lambda x: x*1.0/hist_ListingGap.iloc[-1]['CumFreq'])

#%%

#groupby collections.Counter np.histogram concat merge

# 你可能感兴趣的:(Python)  (blog footer residue)