# -*- coding: utf-8 -*-
"""
Created on Sun Sep 16 19:04:53 2018
@author: wangxihe
"""
import os
import pandas as pd
import numbers
import numpy as np
import matplotlib.pyplot as plt
#%%
os.chdir(r'E:\spyderwork\评分卡模型\一特征构建')
allData = pd.read_csv('Idx0.csv',header = 0,encoding = 'gbk')
allData.shape
#%%
os.chdir(r'E:\spyderwork\评分卡模型\二特征清洗')
describeDf = allData.describe().T

def MissingCategorial(df, col):
    # NaN is the only value for which x != x, so this counts missing entries
    missing_vals = df[col].map(lambda x: int(x != x))
    return sum(missing_vals) * 1.0 / df.shape[0]

def MissingContinuous(df, col):
    missing_vals = df[col].map(lambda x: int(np.isnan(x)))
    return sum(missing_vals) * 1.0 / df.shape[0]
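#%%
# Minimal sanity check for the two helpers above on a hypothetical toy frame
# (`demo_df` is illustrative only, not part of the pipeline). It also shows
# why x != x works as a NaN test: NaN is the only value not equal to itself.
demo_df = pd.DataFrame({'cat': ['a', np.nan, 'b'], 'num': [1.0, np.nan, 3.0]})
print(MissingCategorial(demo_df, 'cat'))   # 0.333... -> one of three values is NaN
print(MissingContinuous(demo_df, 'num'))   # 0.333...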
#%%
allFeatures = list(allData.columns)
allFeatures.remove('target')
if 'Idx' in allFeatures:
    allFeatures.remove('Idx')
allFeatures.remove('ListingInfo')
len(allFeatures)
#%%
#Check for constant columns, and classify the remaining variables as numerical or categorical
numerical_var = []
for col in list(allFeatures):   # iterate over a copy, since allFeatures is mutated below
    if len(set(allData[col])) == 1:
        print(' {} : constant column, dropped'.format(col))
        del allData[col]
        allFeatures.remove(col)
    else:
        uniq_valid_vals = [i for i in allData[col] if i == i]   # NaN != NaN, so this keeps only valid values
        uniq_valid_vals = list(set(uniq_valid_vals))
        if len(uniq_valid_vals) >= 10 and isinstance(uniq_valid_vals[0], numbers.Real):
            numerical_var.append(col)
categorical_var = [i for i in allFeatures if i not in numerical_var]
len(numerical_var)
len(categorical_var)
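#%%
# Illustration of the split rule above on a hypothetical series: the i == i
# filter drops NaN before cardinality is tested, so a numeric column with
# fewer than 10 distinct observed values still lands in categorical_var.
toy = pd.Series([1.0, 2.0, np.nan, 2.0])
toy_vals = list(set(v for v in toy if v == v))   # NaN != NaN, so NaN is dropped
print(toy_vals, len(toy_vals) >= 10)             # [1.0, 2.0] False -> categorical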
#%%
#For each variable, compute the share of its most frequent value, and record that value
records_count = allData.shape[0]
col_most_values, col_large_value = {}, {}
for col in allFeatures:
    value_count = allData[col].groupby(allData[col]).count()
    col_most_values[col] = max(value_count) / records_count
    large_value = value_count[value_count == max(value_count)].index[0]
    col_large_value[col] = large_value
col_most_values_df = pd.DataFrame.from_dict(col_most_values, orient = 'index')
col_most_values_df.columns = ['max percent']
col_most_values_df = col_most_values_df.sort_values(by = 'max percent', ascending = False)
pcnt = list(col_most_values_df[:180]['max percent'])
var_names = list(col_most_values_df[:180].index)
plt.bar(range(len(pcnt)), height = pcnt)
plt.title('Largest Percentage of Single Value in Each Variable')
len(col_most_values)
len(col_large_value)
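#%%
# Equivalent, more idiomatic pandas for the loop above: value_counts() gives
# the modal value and its count in one call (a sketch; `example_col` is
# illustrative, not part of the original pipeline).
example_col = allFeatures[0]
vc = allData[example_col].value_counts()
print(vc.index[0], vc.iloc[0] / records_count)   # modal value and its share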
#%%
#For columns whose dominant value covers at least 90% of records, check whether the minority values have a markedly higher bad-sample rate
large_percent_cols = list(col_most_values_df[col_most_values_df['max percent'] >= 0.9].index)
bad_rate_diff = {}
for col in large_percent_cols:
    large_value = col_large_value[col]
    temp = allData[[col, 'target']].copy()   # copy to avoid mutating a view of allData
    temp[col] = temp.apply(lambda x: int(x[col] == large_value), axis=1)   # 1 = majority value, 0 = minority
    bad_rate = temp.groupby(col).mean()
    if bad_rate.iloc[0]['target'] == 0:      # minority group has no bad samples: log ratio undefined
        bad_rate_diff[col] = 0
        continue
    bad_rate_diff[col] = np.log(bad_rate.iloc[0]['target'] / bad_rate.iloc[1]['target'])
bad_rate_diff_sorted = sorted(bad_rate_diff.items(),key=lambda x: x[1], reverse=True)
bad_rate_diff_sorted_values = [x[1] for x in bad_rate_diff_sorted]
plt.bar(x = range(len(bad_rate_diff_sorted_values)), height = bad_rate_diff_sorted_values)
len(bad_rate_diff)
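#%%
# Worked example of the statistic above with hypothetical rates: a minority
# bad rate of 0.10 against a majority bad rate of 0.05 gives
# log(0.10/0.05) = log(2) ~ 0.693; values near 0 mean no real difference.
print(np.log(0.10 / 0.05))   # ~0.693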
#%%
#No minority value shows a markedly higher bad rate than the corresponding majority value, so these variables can simply be dropped
for col in large_percent_cols:
    if col in numerical_var:
        numerical_var.remove(col)
    else:
        categorical_var.remove(col)
    del allData[col]
#%%
missvalue = {}
#For categorical variables: drop the column if more than 80% is missing; otherwise impute (the active loop below fills with the most frequent value; the commented block treats missing as a special state)
missing_pcnt_threshould_1 = 0.8
#%%
# Alternative kept for reference: instead of mode imputation, flag missing
# categorical values as a special state (-1 for numeric codes, the string
# 'NAN' otherwise).
#for col in list(categorical_var):
#    missingRate = MissingCategorial(allData, col)
#    print('{0} has missing rate as {1}'.format(col, missingRate))
#    if missingRate > missing_pcnt_threshould_1:
#        categorical_var.remove(col)
#        del allData[col]
#    if 0 < missingRate < missing_pcnt_threshould_1:
#        uniq_valid_vals = [i for i in allData[col] if i == i]
#        uniq_valid_vals = list(set(uniq_valid_vals))
#        if isinstance(uniq_valid_vals[0], numbers.Real):
#            missing_position = allData.loc[allData[col] != allData[col]][col].index
#            not_missing_sample = [-1] * len(missing_position)
#            allData.loc[missing_position, col] = not_missing_sample
#        else:
#            allData[col] = allData[col].map(lambda x: str(x).upper())
#%%
for col in list(categorical_var):   # iterate over a copy, since categorical_var is mutated below
    missingRate = MissingCategorial(allData, col)
    print('{0} has missing rate as {1}'.format(col, missingRate))
    if missingRate > missing_pcnt_threshould_1:
        categorical_var.remove(col)
        del allData[col]
    else:
        temp = allData[col].value_counts()
        max1 = max(temp)
        maxmiss = temp[temp == max1].index[0]   # most frequent value (mode)
        missvalue[col] = maxmiss                # record the fill value for reuse at scoring time
        allData[col].fillna(maxmiss, inplace=True)
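#%%
# Sketch of reusing the recorded fill values at scoring time on hypothetical
# new data. `new_df` is a tiny illustrative frame, not part of the pipeline.
new_df = pd.DataFrame({c: [np.nan] for c in list(missvalue.keys())[:3]})
for c in new_df.columns:
    new_df[c].fillna(missvalue[c], inplace=True)
print(new_df)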
#sum(pd.isnull(allData['WeblogInfo_19']))
#%% Check the numerical variables
# Alternative kept for reference: impute missing numerical values by randomly
# sampling from the observed values of the same column (requires `import random`).
#len(numerical_var)
#missing_pcnt_threshould_2 = 0.8
#deleted_var = []
#for col in numerical_var:
#    missingRate = MissingContinuous(allData, col)
#    print('{0} has missing rate {1}'.format(col, missingRate))
#    if missingRate > missing_pcnt_threshould_2:
#        deleted_var.append(col)
#        print('variable {} will be dropped: its missing rate exceeds the threshold'.format(col))
#    else:
#        if missingRate > 0:
#            not_missing = allData.loc[allData[col] == allData[col]][col]
#            missing_position = allData.loc[allData[col] != allData[col]][col].index
#            # fill the gaps with a random sample of the observed values
#            not_missing_sample = random.sample(list(not_missing), len(missing_position))
#            allData.loc[missing_position, col] = not_missing_sample
#            missingRate2 = MissingContinuous(allData, col)
#            print('missing rate after making up is: {}'.format(str(missingRate2)))
#%%
len(numerical_var)
missing_pcnt_threshould_2 = 0.8
deleted_var = []
for col in numerical_var:
    missingRate = MissingContinuous(allData, col)
    print('{0} has missing rate {1}'.format(col, missingRate))
    if missingRate > missing_pcnt_threshould_2:
        deleted_var.append(col)
        print('variable {} will be dropped: its missing rate exceeds the threshold'.format(col))
    else:
        if missingRate > 0:
            meanmiss = allData[col].mean()
            missvalue[col] = round(meanmiss, 6)   # record the fill value (column mean) for reuse at scoring time
            allData[col].fillna(round(meanmiss, 6), inplace=True)

import pickle
with open('var_Fill.pkl', 'wb') as f:
    pickle.dump(missvalue, f)   # persist the fill-value dict so new data can be imputed the same way
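#%%
# Sketch: load the persisted fill values back (e.g. in the scoring script)
# and confirm the round trip; assumes var_Fill.pkl was written above.
with open('var_Fill.pkl', 'rb') as f:
    var_fill = pickle.load(f)
print(len(var_fill), 'fill values recovered')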
#%%
if deleted_var != []:
    for col in deleted_var:
        numerical_var.remove(col)
        del allData[col]
#%%
allData.to_csv('Idx1.csv', header=True,encoding='gbk', columns = allData.columns, index=False)
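#%%
# Optional round-trip check (a sketch): reload the file just written and
# compare shapes to confirm nothing was lost in the export.
check = pd.read_csv('Idx1.csv', encoding='gbk')
print(check.shape == allData.shape)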