1.数据处理技巧
import pandas as pd
import numpy as np
# Build a synthetic demo table: one row per user with a random gender flag,
# a random "age" and "income" in [0, 1), and a random category in 0..9.
n_rows = 10000
data = pd.DataFrame({
    'user_id': list(range(n_rows)),
    '性别': np.random.randint(0, 2, size=n_rows),
    '年龄': np.random.rand(n_rows),
    '收入': np.random.rand(n_rows),
    '类别': np.random.randint(0, 10, size=n_rows),
})

# Cut the age column into four labelled bins; the last edge is the observed
# maximum so that every row falls inside a bin.
age_bins = [0, 0.25, 0.5, 0.75, data['年龄'].max()]
data['age_bins'] = pd.cut(
    data['年龄'],
    age_bins,
    labels=["(0,25]", "(25,50]", "(50,75]", "(75+]"],
    include_lowest=True,  # keep a value that is exactly 0 in the first bin
)

# Median income per age bin (rows) and gender (columns).
data_pivot = data.pivot_table(
    values='收入', index=['age_bins'], columns=['性别'], aggfunc='median'
)
data_pivot.columns = ["0", "1"]
data_pivot = data_pivot.reset_index()

# Sort the rows in a custom (non-lexical) order by reordering the categories:
# sorting a categorical column follows the category order.
data_pivot['age_bins'] = data_pivot['age_bins'].astype('category')
list_custom = ["(0,25]", "(50,75]", "(25,50]", "(75+]"]
# reorder_categories returns a new categorical; the `inplace` argument was
# removed in pandas 2.0, so assign the result back.
data_pivot['age_bins'] = data_pivot['age_bins'].cat.reorder_categories(list_custom)
data_pivot.sort_values('age_bins', inplace=True)
2.数据处理函数
def get_bins_info(data, col, quantile=(0, 0.2, 0.4, 0.6, 0.8, 1)):
    """Bin a numeric column using its own quantiles as bin edges.

    @params data: DataFrame containing the column to bin
    @params col: string, name of the column of ``data`` to bin
    @params quantile: ascending sequence of quantile levels in [0, 1]; the
        column values at these levels become the bin edges
    return: Series, categorical bin label (1 .. len(quantile)-1) for every
        row; rows that fall outside the edges are NaN
    """
    edges = data[col].quantile(q=list(quantile), interpolation='nearest').tolist()
    # pd.cut builds right-closed intervals, so lower the first edge to keep the
    # rows equal to it inside bin 1. The edge is addressed by position: the
    # quantile result is indexed by the quantile levels, so a label lookup of
    # 0 only works when 0 happens to be one of the requested levels.
    edges[0] = edges[0] - 1
    labels = list(range(1, len(edges)))
    return pd.cut(data[col], edges, labels=labels)
def face_value_rescale(data_value, perc_bin=(25, 75)):
    """Clip outliers by percentile fences, then min-max scale to [0, 1].

    The fences are ``lower - 1.5 * (upper - lower)`` and
    ``upper + 1.5 * (upper - lower)``, where ``lower`` / ``upper`` are the
    percentiles given by ``perc_bin``. A negative lower fence is raised to 0.
    Values beyond a fence are replaced by that fence before scaling.

    @param data_value (list): numeric values to rescale
    @param perc_bin (list): lower and upper percentile (0-100) used to build
        the fences
    return: list of scaled values, same length and order as ``data_value``;
        an empty input gives ``[]`` and an input whose fences coincide
        (e.g. constant data) gives all zeros
    """
    values = np.asarray(data_value, dtype=float)
    if values.size == 0:
        return []

    f_one = np.percentile(values, perc_bin[0])
    f_three = np.percentile(values, perc_bin[1])
    whisker = 1.5 * (f_three - f_one)
    f_min = f_one - whisker
    f_max = f_three + whisker
    if f_min < 0:
        f_min = 0

    span = f_max - f_min
    if span == 0:
        # Nothing to scale against; avoid dividing by zero (NaN output).
        return [0.0] * values.size

    # Check the upper fence first, then the lower one, for every element.
    clipped = np.where(values > f_max, f_max,
                       np.where(values < f_min, f_min, values))
    return list((clipped - f_min) / span)
def data_sample(data, target, threshold=5000):
    """Undersample ``data`` while keeping the class ratio of ``target``.

    The smallest class is capped at ``threshold`` rows (its first rows are
    kept); every other class is randomly sampled so that its size relative to
    the smallest class stays approximately the same as before. The combined
    rows are shuffled. Label statistics before and after sampling are printed.
    No train/test split is performed here.

    @param data (DataFrame): input rows
    @param target (string): name of the label column
    @param threshold (int): maximum number of rows kept for the smallest class
    return: DataFrame, the sampled and shuffled rows
    raises: ValueError if the label column has no non-null values
    """
    label_counts = data[target].value_counts().to_dict()
    if not label_counts:
        raise ValueError('no labels found in column %r' % (target,))

    total = sum(label_counts.values())
    target_percent = {key: round(count / total, 2) for key, count in label_counts.items()}
    print('the num of each label before sampling is:\n', label_counts)
    print('the ratio of each label before sampling is:\n', target_percent)

    # Smallest class; ties are broken by the smaller label.
    min_count, min_label = min(zip(label_counts.values(), label_counts.keys()))
    ratio_to_min = {key: round(count / min_count, 2) for key, count in label_counts.items()}
    min_num = min(min_count, threshold)
    target_num = {key: round(min_num * ratio) for key, ratio in ratio_to_min.items()}

    parts = [data.loc[data[target] == min_label, :][:min_num]]
    for label, count in label_counts.items():
        # Compare against the minority label itself, not the (count, label)
        # pair, so a label that equals the minority count is not skipped.
        if label == min_label:
            continue
        # Rounding can push the fraction slightly above 1, which sample()
        # rejects without replacement.
        frac = min(1.0, round(target_num[label] / count, 2))
        parts.append(data.loc[data[target] == label, :].sample(frac=frac, random_state=2019))

    # DataFrame.append was removed in pandas 2.0; concatenate once instead.
    data_o = pd.concat(parts).sample(frac=1, random_state=2019)
    print('the num of data after sampling is:', len(data_o))

    sampled_counts = data_o[target].value_counts().to_dict()
    sampled_total = sum(sampled_counts.values())
    target_percent_s = {key: round(count / sampled_total, 2) for key, count in sampled_counts.items()}
    print('the num of each label after sampling is:\n', sampled_counts)
    print('the ratio of each label after sampling is:\n', target_percent_s)
    return data_o
3.数据绘图
import seaborn as sns
import matplotlib.pyplot as plt
# Matplotlib 3.6 renamed the bundled seaborn styles to 'seaborn-v0_8-*' and
# later releases dropped the old names; prefer the new name when available.
_darkgrid_style = 'seaborn-v0_8-darkgrid'
if _darkgrid_style not in plt.style.available:
    _darkgrid_style = 'seaborn-darkgrid'
plt.style.use(_darkgrid_style)
# Use the SimHei font so that Chinese labels render.
plt.rcParams['font.sans-serif'] = ['SimHei']
def plot_data_detail(data):
    """function: show three exploratory figures for the raw data.

    1. distribution (histogram + KDE) of the '类别' column;
    2. distribution of '收入', one panel per '类别', coloured by 'age_bins';
    3. regression of '收入' on '年龄', one panel per '性别', coloured by '类别'.
    Figures 2 and 3 use only the rows without any missing value.

    @param data(DataFrame): must provide the columns named above
    """
    # NOTE(review): sns.distplot is deprecated in recent seaborn releases.
    category_values = data['类别']
    sns.distplot(category_values, kde=True, rug=False)
    plt.show()

    complete_rows = data.dropna(how='any')

    income_grid = sns.FacetGrid(
        complete_rows, col='类别', hue='age_bins', margin_titles=True
    )
    income_grid.map(sns.distplot, '收入')
    income_grid.add_legend()
    plt.show()

    regression_grid = sns.FacetGrid(
        complete_rows, col='性别', hue='类别', margin_titles=True
    )
    regression_grid.map(sns.regplot, "年龄", "收入")
    regression_grid.add_legend()
    plt.show()