# -*- coding: utf-8 -*-
"""
Created on Sun Sep 16 09:24:18 2018
@author: wangxihe
"""
import os
import pandas as pd
import datetime
import matplotlib.pyplot as plt
import collections
import numpy as np
# All relative file names used below are resolved against this directory.
os.chdir(r'E:\spyderwork\评分卡模型\一特征构建')
# SimHei is set so Chinese labels display correctly; unicode_minus is turned
# off so the minus sign displays correctly.
plt.rcParams.update({
    'font.sans-serif': ['SimHei'],
    'axes.unicode_minus': False,
})
#%% Load the raw data (all three files are GBK-encoded CSVs)
MasterData = pd.read_csv(
    'PPD_Training_Master_GBK_3_1_Training_Set.csv',
    encoding='gbk',
)
LoginData = pd.read_csv(
    'PPD_LogInfo_3_1_Training_Set.csv',
    encoding='gbk',
)
UpdateData = pd.read_csv(
    'PPD_Userupdate_Info_3_1_Training_Set.csv',
    encoding='gbk',
)
#%% Parse the date columns
# Both columns are parsed with the '%Y-%m-%d' format. pd.to_datetime converts
# the whole column in one vectorized call instead of invoking
# datetime.strptime once per row through apply.
LoginData['Listinginfo1'] = pd.to_datetime(LoginData['Listinginfo1'], format='%Y-%m-%d')
LoginData['LogInfo3'] = pd.to_datetime(LoginData['LogInfo3'], format='%Y-%m-%d')
# LogDay: whole days from the LogInfo3 date to the Listinginfo1 date
# (Listinginfo1 minus LogInfo3).
LoginData['LogDay'] = (LoginData['Listinginfo1'] - LoginData['LogInfo3']).dt.days
#%% Distribution of LogDay (Listinginfo1 minus LogInfo3, in days), pandas plot
LoginData['LogDay'].plot(bins=200, kind='hist')
#%% Same distribution drawn directly with matplotlib, using finer bins
plt.hist(LoginData['LogDay'], bins=300)
plt.title('登录天数分布')
#%% Login method (LogInfo2): frequency table, then a horizontal bar chart
LoginData['LogInfo2'].value_counts()
(
    LoginData['LogInfo2']
    .value_counts()
    .sort_values()
    .plot(kind='barh')
)
#%%
def MyDiv(x,y):
    """Divide ``x`` by ``y`` without raising on a missing or zero divisor.

    Args:
        x: Numerator.
        y: Denominator; may be ``None``.

    Returns:
        ``0`` when ``y`` is ``None`` or equal to zero, otherwise ``x / y``
        computed in floating point.
    """
    # Identity test for None: ``== None`` would dispatch to ``y.__eq__`` and
    # can misbehave for objects that overload equality.
    if y is None or y == 0:
        return 0
    # Multiplying by 1.0 forces true division even for two integers.
    return x * 1.0 / y
#%%
# Look-back windows, in days before the Listinginfo1 date.
Tw=[7, 30, 60, 90, 120, 150, 180]
cols=['LogInfo1','LogInfo2']
# One output row per distinct Idx that appears in the login log.
LoginIdxDf=pd.DataFrame({'Idx':LoginData['Idx'].drop_duplicates()})
for day in Tw:
    # Earliest date that still falls inside the current window, per row.
    LoginData['LogTime']=LoginData['Listinginfo1']-datetime.timedelta(days=day)
    TempDf=LoginData[LoginData['LogInfo3']>=LoginData['LogTime']]
    for var in cols:
        total_col = str(var) + '_' + str(day) + '_totalnum'
        unique_col = str(var) + '_' + str(day) + '_unique'
        rate_col = str(var) + '_' + str(day) + '_rate'

        # Total number of login records per Idx inside the window
        # (0 for an Idx with no record in the window).
        TempGroupDict=TempDf.groupby('Idx')[var].count().to_dict()
        LoginIdxDf[total_col]=LoginIdxDf['Idx'].map(lambda idx: TempGroupDict.get(idx, 0))

        # Number of distinct values of `var` per Idx inside the window.
        UnionTempDf=TempDf[['Idx',var]].drop_duplicates()
        UnionTempDict=UnionTempDf.groupby('Idx')[var].count().to_dict()
        LoginIdxDf[unique_col]=LoginIdxDf['Idx'].map(lambda idx: UnionTempDict.get(idx, 0))

        # Ratio total / distinct (0 when there is no distinct value).
        # The two columns are zipped instead of using a row-wise apply with
        # x[0] / x[1]: integer keys on a label-indexed Series are positional
        # look-ups, which recent pandas versions deprecate.
        LoginIdxDf[rate_col]=[
            MyDiv(total, unique)
            for total, unique in zip(LoginIdxDf[total_col], LoginIdxDf[unique_col])
        ]
LoginIdxDf.to_csv('Log.csv')
#%%
# Both columns are parsed with the '%Y/%m/%d' format. pd.to_datetime converts
# the whole column in one vectorized call instead of invoking
# datetime.strptime once per row through apply.
UpdateData['ListingInfo1'] = pd.to_datetime(UpdateData['ListingInfo1'], format='%Y/%m/%d')
UpdateData['UserupdateInfo2'] = pd.to_datetime(UpdateData['UserupdateInfo2'], format='%Y/%m/%d')
#%% Frequency of each value of UserupdateInfo1, most frequent first
UpdateData['UserupdateInfo1'].value_counts().sort_values(ascending=False)
# Number of distinct values before any normalisation.
UpdateData['UserupdateInfo1'].nunique()
# NOTE: despite the variable name, this keeps the 20 most frequent values.
updateTop10 = (
    UpdateData['UserupdateInfo1']
    .value_counts()
    .sort_values(ascending=False)
    .iloc[:20]
    .copy()
)
updateTop10.sort_values().plot(kind='barh')
#%% Convert to upper case, then re-count to see whether there were duplicates
UpdateData['UserupdateInfo1'] = UpdateData['UserupdateInfo1'].map(lambda name: name.upper())
UpdateData['UserupdateInfo1'].nunique()
def updateNumber(x):
    """Return '_PHONE' for the value '_MOBILEPHONE'; return any other value unchanged."""
    return '_PHONE' if x == '_MOBILEPHONE' else x
# Fold '_MOBILEPHONE' into '_PHONE' so both values are counted as one.
UpdateData['UserupdateInfo1'] = UpdateData['UserupdateInfo1'].apply(updateNumber)
#%%
# One output row per distinct Idx that appears in the update log.
UpdateIdxDf=pd.DataFrame({'Idx':UpdateData['Idx'].drop_duplicates()})
for day in Tw:
    # Earliest date that still falls inside the current window, per row.
    UpdateData['LogTime']=UpdateData['ListingInfo1']-datetime.timedelta(days=day)
    TempDf=UpdateData[UpdateData['UserupdateInfo2']>=UpdateData['LogTime']]

    freq_col = 'Update_' + str(day) + '_freq'
    unique_col = 'Update_' + str(day) + '_unique'
    rate_col = 'Update_' + str(day) + '_rate'

    # Total number of update records per Idx inside the window.
    TempGroupDict=TempDf.groupby('Idx')['UserupdateInfo1'].count().to_dict()
    UpdateIdxDf[freq_col]=UpdateIdxDf['Idx'].map(lambda idx: TempGroupDict.get(idx, 0))

    # Number of distinct UserupdateInfo1 values per Idx inside the window.
    UnionTempDf=TempDf[['Idx','UserupdateInfo1']].drop_duplicates()
    UnionTempDict=UnionTempDf.groupby('Idx')['UserupdateInfo1'].count().to_dict()
    UpdateIdxDf[unique_col]=UpdateIdxDf['Idx'].map(lambda idx: UnionTempDict.get(idx, 0))

    # Ratio total / distinct (0 when there is no distinct value).
    # The two columns are zipped instead of using a row-wise apply with
    # x[0] / x[1]: integer keys on a label-indexed Series are positional
    # look-ups, which recent pandas versions deprecate.
    UpdateIdxDf[rate_col]=[
        MyDiv(freq, unique)
        for freq, unique in zip(UpdateIdxDf[freq_col], UpdateIdxDf[unique_col])
    ]

    # Flags for modifications of selected important fields: 1 when the Idx
    # has that exact UserupdateInfo1 value inside the window, else 0.
    # Exact matching is used rather than a substring search in the
    # concatenated names, which would also fire for any longer name that
    # merely starts with the target.
    for item in ['_IDNUMBER','_HASBUYCAR','_MARRIAGESTATUSID','_PHONE']:
        changed_ids = set(UnionTempDf.loc[UnionTempDf['UserupdateInfo1'] == item, 'Idx'])
        UpdateIdxDf['UserupdateInfo_' + str(day) + str(item)] = (
            UpdateIdxDf['Idx'].isin(changed_ids).astype('int64')
        )
UpdateIdxDf.to_csv('update.csv')
#%% Check whether the location fields are consistent
# city_match is 1 only when UserInfo_2, UserInfo_4, UserInfo_8 and UserInfo_20
# all hold the same value on that row, otherwise 0.
MasterData['city_match'] = (
    (MasterData['UserInfo_2'] == MasterData['UserInfo_4'])
    & (MasterData['UserInfo_4'] == MasterData['UserInfo_8'])
    & (MasterData['UserInfo_8'] == MasterData['UserInfo_20'])
).astype('int64')
# The four source columns are removed once the flag has been derived.
MasterData.drop(
    columns=['UserInfo_2', 'UserInfo_4', 'UserInfo_8', 'UserInfo_20'],
    inplace=True,
)
MasterData.to_csv('master.csv', encoding='gbk')
#%% Put the master, update and login tables side by side, aligned on Idx
allData_0 = pd.concat(
    [frame.set_index('Idx') for frame in (MasterData, UpdateIdxDf, LoginIdxDf)],
    axis=1,
)
allData_0.to_csv('Idx0.csv', encoding='gbk')
#%% Whole days from the LogInfo3 date to the Listinginfo1 date, per login record
# Vectorized subtraction replaces a row-wise apply that read the two values
# with x[0] / x[1]; integer keys on a label-indexed Series are positional
# look-ups, which recent pandas versions deprecate.
LoginData['MinueDays'] = (LoginData['Listinginfo1'] - LoginData['LogInfo3']).dt.days
def TimeWindowSelection(df,col,tw):
    """Count, for each window length, the rows whose ``col`` value is within it.

    Args:
        df: DataFrame holding the column to test.
        col: Name of the column compared against each window length.
        tw: Iterable of window lengths.

    Returns:
        dict mapping each window length to the number of rows of ``df``
        where ``df[col] <= window length``.
    """
    values = df[col]
    # Summing the boolean mask counts matching rows without materialising a
    # filtered copy of the whole DataFrame for every window.
    return {day: int((values <= day).sum()) for day in tw}
# Number of login records with MinueDays at or below each candidate window
# length, shown as a bar chart.
tw_dict = TimeWindowSelection(
    LoginData,
    'MinueDays',
    [7, 15, 30, 60, 90, 120, 150, 180],
)
tw_df = pd.DataFrame.from_dict(tw_dict, orient='index')
tw_df.plot(kind='bar')
#%% Whole days from the UserupdateInfo2 date to the ListingInfo1 date, per update record
# Vectorized subtraction replaces a row-wise apply that read the two values
# with x[0] / x[1]; integer keys on a label-indexed Series are positional
# look-ups, which recent pandas versions deprecate.
UpdateData['MinueDays'] = (UpdateData['ListingInfo1'] - UpdateData['UserupdateInfo2']).dt.days
# Exact frequency of every distinct gap value.
t = collections.Counter(UpdateData['MinueDays'])
# np.histogram returns (counts, bin_edges) using its default binning; each
# bin is labelled with its right edge.
gap_counts, gap_edges = np.histogram(UpdateData['MinueDays'])
hist_ListingGap = pd.DataFrame({'Freq': gap_counts, 'gap': gap_edges[1:]})
hist_ListingGap['CumFreq'] = hist_ListingGap['Freq'].cumsum()
# Cumulative share of records up to each bin; the grand total is the last
# cumulative frequency and is read once instead of once per element.
hist_ListingGap['CumPercent'] = (
    hist_ListingGap['CumFreq'] * 1.0 / hist_ListingGap['CumFreq'].iloc[-1]
)
#%%
# Techniques used above: groupby, collections.Counter, np.histogram, concat, merge