Scorecard Model (Part 2: Data Cleaning)

# -*- coding: utf-8 -*-
"""
Created on Sun Sep 16 19:04:53 2018

@author: wangxihe
"""

import os
import pandas as pd
import numbers
import numpy as np
import matplotlib.pyplot as plt
#%%
os.chdir(r'E:\spyderwork\评分卡模型\一特征构建')
allData = pd.read_csv('Idx0.csv',header = 0,encoding = 'gbk')
allData.shape
#%%
os.chdir(r'E:\spyderwork\评分卡模型\二特征清洗')
describeDf=allData.describe().T

def MissingCategorial(df, col):
    # NaN is the only value for which x != x, so this counts missing entries
    missing_vals = df[col].map(lambda x: int(x != x))
    return sum(missing_vals) * 1.0 / df.shape[0]

def MissingContinuous(df, col):
    # missing rate of a numerical column, counting NaN entries
    missing_vals = df[col].map(lambda x: int(np.isnan(x)))
    return sum(missing_vals) * 1.0 / df.shape[0]
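#%%
# A minimal sanity check (sketch): for NaN-encoded missing values, both
# helpers agree with the vectorized pandas equivalent,
# df[col].isnull().mean(). '_demo' is a throwaway frame for illustration.
_demo = pd.DataFrame({'x': [1.0, np.nan, 3.0, np.nan]})
assert MissingContinuous(_demo, 'x') == _demo['x'].isnull().mean() == 0.5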


#%%
allFeatures = list(allData.columns)
allFeatures.remove('target')
if 'Idx' in allFeatures:
    allFeatures.remove('Idx')
allFeatures.remove('ListingInfo')

len(allFeatures)
#%%
# Check for constant-valued variables, and classify each remaining feature as numerical or categorical

numerical_var = []
for col in list(allFeatures):   # iterate over a copy: allFeatures is mutated inside the loop
    if len(set(allData[col])) == 1:
        print('Column {} is constant, so it is dropped'.format(col))
        del allData[col]
        allFeatures.remove(col)
    else:
        # keep only the non-missing values (NaN != NaN)
        uniq_valid_vals = [i for i in allData[col] if i == i]
        uniq_valid_vals = list(set(uniq_valid_vals))
        # at least 10 distinct real values -> treat as numerical
        if len(uniq_valid_vals) >= 10 and isinstance(uniq_valid_vals[0], numbers.Real):
            numerical_var.append(col)

categorical_var = [i for i in allFeatures if i not in numerical_var]

len(numerical_var)
len(categorical_var)
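#%%
# Illustration (sketch, toy values): a column counts as numerical only if it
# has at least 10 distinct non-missing real values; otherwise it is treated
# as categorical. '_toy' and '_few' are throwaway examples.
_toy = set(range(12))        # 12 distinct reals  -> numerical
_few = {1, 2, 3}             # 3 distinct values  -> categorical
print(len(_toy) >= 10, len(_few) >= 10)   # True False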
#%%
# For each variable, compute the share of its most frequent value and record that value
records_count = allData.shape[0]
col_most_values,col_large_value = {},{}
for col in allFeatures:
    value_count = allData[col].groupby(allData[col]).count()
    col_most_values[col] = max(value_count)/records_count
    large_value = value_count[value_count== max(value_count)].index[0]
    col_large_value[col] = large_value
col_most_values_df = pd.DataFrame.from_dict(col_most_values, orient = 'index')
col_most_values_df.columns = ['max percent']
col_most_values_df = col_most_values_df.sort_values(by = 'max percent', ascending = False)
pcnt = list(col_most_values_df[:180]['max percent'])
top_vars = list(col_most_values_df[:180].index)   # renamed from 'vars', which shadows a builtin
plt.bar(range(len(pcnt)), height = pcnt)
plt.title('Largest Percentage of Single Value in Each Variable')
len(col_most_values)
len(col_large_value)
#%%
# For columns where the dominant value covers at least 90% of records, check whether
# the bad rate of the minority values is significantly higher than that of the majority value
large_percent_cols = list(col_most_values_df[col_most_values_df['max percent']>=0.9].index)
bad_rate_diff = {}
for col in large_percent_cols:
    large_value = col_large_value[col]
    temp = allData[[col, 'target']].copy()   # copy to avoid SettingWithCopyWarning
    # 1 if the record holds the dominant value, 0 otherwise
    temp[col] = (temp[col] == large_value).astype(int)
    bad_rate = temp.groupby(col).mean()
    # group 0 holds the minority values, group 1 the dominant value
    if bad_rate.iloc[0]['target'] == 0:
        bad_rate_diff[col] = 0
        continue
    bad_rate_diff[col] = np.log(bad_rate.iloc[0]['target'] / bad_rate.iloc[1]['target'])
bad_rate_diff_sorted = sorted(bad_rate_diff.items(),key=lambda x: x[1], reverse=True)
bad_rate_diff_sorted_values = [x[1] for x in bad_rate_diff_sorted]
plt.bar(x = range(len(bad_rate_diff_sorted_values)), height = bad_rate_diff_sorted_values)
len(bad_rate_diff)
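#%%
# A toy illustration (sketch, hypothetical numbers): if the minority values
# have a bad rate of 0.10 and the majority value 0.05, the log-ratio is
# log(0.10/0.05) ~= 0.69 -- a clearly positive value flags extra risk in the
# minority group, while values near 0 mean the split carries no signal.
print(np.log(0.10 / 0.05))   # ~0.693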
#%%
# Since none of the minority values show a significantly higher bad rate than
# the majority value, these variables can be dropped directly
for col in large_percent_cols:
    if col in numerical_var:
        numerical_var.remove(col)
    else:
        categorical_var.remove(col)
    del allData[col]
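#%%
# Sanity check (sketch): the high-concentration columns dropped above should
# no longer appear in the data frame.
assert not set(large_percent_cols) & set(allData.columns)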
#%%
   
missvalue = {}

# For categorical variables: drop the column if its missing rate exceeds 80%;
# otherwise fill the missing values with the column's most frequent value
missing_pcnt_threshould_1 = 0.8
#%%
# Alternative approach (kept commented out): instead of filling with the
# mode, treat missing as a special state -- code numerical-valued categories
# as -1 and uppercase the string-valued ones.
#for col in categorical_var:
#    missingRate = MissingCategorial(allData,col)
#    print('{0} has missing rate as {1}'.format(col,missingRate))
#    if missingRate > missing_pcnt_threshould_1:
#        categorical_var.remove(col)
#        del allData[col]
#    if 0 < missingRate < missing_pcnt_threshould_1:
#        uniq_valid_vals = [i for i in allData[col] if i == i]
#        uniq_valid_vals = list(set(uniq_valid_vals))
#        if isinstance(uniq_valid_vals[0], numbers.Real):
#            missing_position = allData.loc[allData[col] != allData[col]][col].index
#            not_missing_sample = [-1]*len(missing_position)
#            allData.loc[missing_position, col] = not_missing_sample
#           
#        else:
#            allData[col] = allData[col].map(lambda x: str(x).upper())
#          
#%%
for col in list(categorical_var):   # iterate over a copy: the list is mutated inside the loop
    missingRate = MissingCategorial(allData, col)
    print('{0} has missing rate as {1}'.format(col, missingRate))
    if missingRate > missing_pcnt_threshould_1:
        categorical_var.remove(col)
        del allData[col]
    else:
        # fill missing values with the column's most frequent value (the mode)
        maxmiss = allData[col].value_counts().idxmax()
        missvalue[col] = maxmiss
        allData[col].fillna(maxmiss, inplace=True)
           


# sanity check for a single column:
#sum(pd.isnull(allData['WeblogInfo_19']))
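# Generalized sanity check (sketch): after the mode fill, no remaining
# categorical column should still contain missing values.
assert all(MissingCategorial(allData, col) == 0 for col in categorical_var)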
     
#%% Check the numerical variables

# Alternative approach (kept commented out): impute missing numerical values
# by random sampling from the observed values (needs `import random`).
#len(numerical_var)
#missing_pcnt_threshould_2 = 0.8
#deleted_var = []
#for col in numerical_var:
#    missingRate = MissingContinuous(allData, col)
#    print('{0} has missing rate as {1}'.format(col, missingRate))
#    if missingRate > missing_pcnt_threshould_2:
#        deleted_var.append(col)
#        print('Variable {} will be dropped because its missing rate exceeds the threshold'.format(col))
#    else:
#        if missingRate > 0:
#            not_missing = allData.loc[allData[col] == allData[col]][col]
#            missing_position = allData.loc[allData[col] != allData[col]][col].index
#            # impute by sampling randomly from the non-missing values
#            not_missing_sample = random.sample(list(not_missing), len(missing_position))
#            allData.loc[missing_position, col] = not_missing_sample
#            missingRate2 = MissingContinuous(allData, col)
#            print('missing rate after making up is:{}'.format(str(missingRate2)))
#%%
len(numerical_var)
missing_pcnt_threshould_2 = 0.8
deleted_var = []
for col in numerical_var:
    missingRate = MissingContinuous(allData, col)
    print('{0} has missing rate as {1}'.format(col, missingRate))
    if missingRate > missing_pcnt_threshould_2:
        deleted_var.append(col)
        print('Variable {} will be dropped because its missing rate exceeds the threshold'.format(col))
    elif missingRate > 0:
        # fill missing values with the column mean, recorded for reuse on new data
        meanmiss = allData[col].mean()
        missvalue[col] = round(meanmiss, 6)
        allData[col].fillna(round(meanmiss, 6), inplace=True)
          
import pickle

# persist the fill values (not the last missingRate, which was a bug)
# so the exact same imputation can be applied to new data later
with open('var_Fill.pkl', 'wb') as f:
    pickle.dump(missvalue, f)
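#%%
# A minimal sketch (not part of the original pipeline): reload the saved fill
# values and apply the same imputation to new data. 'Idx0_test.csv' is a
# hypothetical file name used only for illustration.
if os.path.exists('Idx0_test.csv'):
    with open('var_Fill.pkl', 'rb') as f:
        var_fill = pickle.load(f)
    testData = pd.read_csv('Idx0_test.csv', header=0, encoding='gbk')
    for col, val in var_fill.items():
        if col in testData.columns:
            testData[col].fillna(val, inplace=True)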

#%%
# drop the numerical variables flagged above for exceeding the missing-rate threshold
for col in deleted_var:
    numerical_var.remove(col)
    del allData[col]

#%%
allData.to_csv('Idx1.csv', header=True,encoding='gbk', columns = allData.columns, index=False)



 
