Python 数据处理

1.数据处理技巧

# 1.造数据
import pandas as pd
import numpy as np
# Build a synthetic data set: 10,000 users with a binary gender flag, uniform
# [0, 1) "age" and "income" values and one of ten integer categories.
n_users = 10000
data = pd.DataFrame({'user_id': list(range(n_users)),
                     '性别': np.random.randint(0, 2, size=n_users),
                     '年龄': np.random.rand(n_users),
                     '收入': np.random.rand(n_users),
                     '类别': np.random.randint(0, 10, size=n_users),
                     })

# 2. Binning: cut the age column into four right-closed intervals. The last
# edge is the observed maximum so that the largest value is still covered.
age_bins = [0, 0.25, 0.5, 0.75, data['年龄'].max()]
data['age_bins'] = pd.cut(data['年龄'], age_bins,
                          labels=["(0,25]", "(25,50]", "(50,75]", "(75+]"])

# 3. Pivot table: median income per age bin (rows) and gender (columns).
# The aggregation is passed by name because handing numpy callables to
# pandas aggregations is deprecated in pandas 2.x.
data_pivot = data.pivot_table('收入', index=['age_bins'], columns=['性别'],
                              aggfunc='median', observed=False)
data_pivot.columns = [str(col) for col in data_pivot.columns]
data_pivot = data_pivot.reset_index()

# 4. Sort rows in a custom (non-alphabetical) order by redefining the
# category order and then sorting on the categorical column.
data_pivot['age_bins'] = data_pivot['age_bins'].astype('category')
list_custom = ["(0,25]", "(50,75]", "(25,50]", "(75+]"]
# reorder_categories returns a new Series; its ``inplace`` flag was removed
# in pandas 2.0, so the result has to be assigned back.
data_pivot['age_bins'] = data_pivot['age_bins'].cat.reorder_categories(list_custom)
data_pivot = data_pivot.sort_values('age_bins')

2.数据处理函数

def get_bins_info(data, col, quantile=None):
    """Bin a column by its own quantiles.

    The bin edges are the values of ``data[col]`` at the requested quantiles
    (using ``interpolation='nearest'``, so every edge is an observed value).
    The lowest edge is lowered by 1 so that the value sitting exactly on it
    still falls inside the first right-closed bin.

    @params data: DataFrame
    @params col: string, name of the column of ``data`` to bin
    @params quantile: list of quantiles in [0, 1] used as bin edges;
        defaults to ``[0, 0.2, 0.4, 0.6, 0.8, 1]``
    return: categorical Series aligned with ``data[col]`` whose labels are
        the integers ``1 .. len(quantile) - 1``; values outside the edges
        become NaN
    raises: ValueError (from ``pd.cut``) when two quantiles map to the same
        value, because bin edges must be unique
    """
    if quantile is None:
        quantile = [0, 0.2, 0.4, 0.6, 0.8, 1]
    # Work on a plain list: the quantile Series is indexed by the quantile
    # values themselves, so ``bins[0]`` would be a label lookup that fails
    # whenever 0 is not one of the requested quantiles.
    edges = data[col].quantile(q=quantile, interpolation='nearest').tolist()
    edges[0] = edges[0] - 1
    labels = list(range(1, len(edges)))
    return pd.cut(data[col], edges, labels=labels)

def face_value_rescale(data_value, perc_bin=None):
    """Clip outliers with percentile-based fences, then min-max scale to [0, 1].

    The fences are ``lower - 1.5 * (upper - lower)`` and
    ``upper + 1.5 * (upper - lower)``, where ``lower``/``upper`` are the
    percentiles given by ``perc_bin``. The lower fence is never allowed to
    go below 0. Values outside the fences are replaced by the fence, and the
    result is rescaled so the lower fence maps to 0 and the upper fence to 1.

    @param data_value (list): numeric values to rescale
    @param perc_bin (list): lower and upper percentile (0-100) used to build
        the fences; defaults to ``[25, 75]``
    return: list of floats, one per input value; an empty list for empty
        input, and all zeros when both fences coincide (e.g. constant data)
    """
    if perc_bin is None:
        perc_bin = [25, 75]
    values = np.asarray(data_value, dtype=float)
    if values.size == 0:
        return []

    lower_q = np.percentile(values, perc_bin[0])
    upper_q = np.percentile(values, perc_bin[1])
    whisker = 1.5 * (upper_q - lower_q)
    f_min = lower_q - whisker
    f_max = upper_q + whisker
    if f_min < 0:
        f_min = 0

    # Upper fence is checked first, matching the original element-wise rule.
    clipped = np.where(values > f_max, f_max,
                       np.where(values < f_min, f_min, values))
    span = f_max - f_min
    if span == 0:
        # Degenerate range: dividing would yield NaN for every element.
        return [0.0] * int(values.size)
    return ((clipped - f_min) / span).tolist()

def data_sample(data, target, threshold=5000):
    """Down-sample ``data`` while keeping the original class proportions.

    The rarest label is capped at ``threshold`` rows (its first rows are
    kept, in their existing order). Every other label is randomly sampled
    down to ``cap * (label_count / rarest_count)`` rows, where the ratio is
    rounded to two decimals. The combined rows are then shuffled. Label
    counts and ratios are printed before and after sampling.

    NOTE: no fixed 3:1 negative/positive ratio is enforced and no train/test
    split is performed here; the class ratios of the input are preserved.

    @param data (DataFrame): input rows
    @param target (string): name of the label column
    @param threshold (int): maximum number of rows kept for the rarest label
    return: DataFrame with the sampled, shuffled rows
    raises: ValueError when the label column has no non-null values
    """
    label_counts = data[target].value_counts().to_dict()
    if not label_counts:
        raise ValueError('no labels found in column %r' % (target,))

    total = sum(label_counts.values())
    label_ratio = {label: round(count / total, 2)
                   for label, count in label_counts.items()}
    print('the num of each label before sampling is:\n', label_counts)
    print('the ratio of each label before sampling is:\n', label_ratio)

    minority_label = min(label_counts, key=label_counts.get)
    minority_count = label_counts[minority_label]
    ratio_to_minority = {label: round(count / minority_count, 2)
                         for label, count in label_counts.items()}
    min_num = min(minority_count, threshold)
    target_num = {label: round(min_num * ratio)
                  for label, ratio in ratio_to_minority.items()}

    parts = [data.loc[data[target] == minority_label].iloc[:min_num]]
    for label, count in label_counts.items():
        # Compare against the label only; the minority *count* must not be
        # able to exclude a label that happens to share its value.
        if label == minority_label:
            continue
        # Sample an exact row count (capped at what is available) instead of
        # a fraction rounded to two decimals, which distorts small classes.
        n_rows = min(target_num[label], count)
        parts.append(data.loc[data[target] == label]
                     .sample(n=n_rows, random_state=2019))

    # DataFrame.append was removed in pandas 2.0; concat is the replacement.
    sampled = pd.concat(parts).sample(frac=1, random_state=2019)

    print('the num of data after sampling is:', len(sampled))
    sampled_counts = sampled[target].value_counts().to_dict()
    sampled_total = sum(sampled_counts.values())
    sampled_ratio = {label: round(count / sampled_total, 2)
                     for label, count in sampled_counts.items()}
    print('the num of each label after sampling is:\n', sampled_counts)
    print('the ratio of each label after sampling is:\n', sampled_ratio)
    return sampled

3.数据绘图

import seaborn as sns
import matplotlib.pyplot as plt  
# Matplotlib 3.6 renamed the bundled seaborn styles to "seaborn-v0_8-*" and
# later releases dropped the old names, so pick whichever one is installed.
for _style_name in ('seaborn-v0_8-darkgrid', 'seaborn-darkgrid'):
    if _style_name in plt.style.available:
        plt.style.use(_style_name)
        break
plt.rcParams['font.sans-serif'] = ['SimHei']  # font that can render Chinese labels
plt.rcParams['axes.unicode_minus'] = False  # SimHei has no glyph for the Unicode minus sign
def plot_data_detail(data):
    """function: show exploratory plots of the raw data, one figure at a time.

    Three figures are displayed in sequence:
      1. histogram with a density curve of the '类别' column;
      2. '收入' distributions, one panel per '类别', coloured by 'age_bins';
      3. regression of '收入' on '年龄', one panel per '性别', coloured by '类别'.

    Rows containing any missing value are dropped for the faceted figures.

    @param data(DataFrame): must contain the columns '类别', 'age_bins',
        '收入', '性别' and '年龄'
    return: None; figures are shown via ``plt.show()``
    """
    def _density_hist(values, **kwargs):
        # ``distplot`` is deprecated since seaborn 0.11; a density-normalised
        # ``histplot`` with a KDE overlay is its replacement. Fall back to
        # ``distplot`` only on seaborn versions that predate ``histplot``.
        if hasattr(sns, 'histplot'):
            return sns.histplot(values, kde=True, stat='density', **kwargs)
        return sns.distplot(values, kde=True, rug=False, **kwargs)

    # Histogram of the category column.
    _density_hist(data['类别'])
    plt.show()

    complete_rows = data.dropna(how='any')

    # Income distribution for each category.
    grid = sns.FacetGrid(complete_rows, col='类别', hue='age_bins', margin_titles=True)
    grid.map(_density_hist, '收入')
    grid.add_legend()
    plt.show()
    # TODO: add further distribution plots here.

    # Regression of income on age.
    grid = sns.FacetGrid(complete_rows, col='性别', hue='类别', margin_titles=True)
    grid.map(sns.regplot, "年龄", "收入")
    grid.add_legend()
    plt.show()
    # TODO: add further regression plots here.

你可能感兴趣的:(数据预处理,数据分析,python)