作为一个算法工程师,我们接的业务需求不会比数据分析挖掘工程师少。作为一个爱偷懒的人,总是机械重复地完成一样的预处理工作,我是不能忍的,所以最近几天,我正在完善一些常规的、通用的预处理code,方便我们以后在每次分析之前直接import快速搞定,省得每次都要去做一样的事情。
如果大家有什么想实现但是懒得去弄的预处理的步骤也可以私信我,我相对而言闲暇还是有的(毕竟工资少工作也不多,摊手:《),我开发完成后直接贴出来,大家以后一起用就行了
我们需要预加载这些包,而且接下来所有的操作均在dataframe格式下完成,所以我们需要将数据先处理成dataframe格式
from __future__ import division
import numpy as np
import pandas as pd
from sklearn import preprocessing
from sklearn.cross_validation import train_test_split
from sklearn.neighbors import NearestNeighbors
__author__ = 'slade_sal'
__time__ = '20171128'
def change_data_format(data):
    """Wrap ``data`` in a pandas DataFrame.

    The preprocessing helpers in this file all operate on DataFrames, so raw
    input is converted here first.

    Args:
        data: Anything accepted by the ``pd.DataFrame`` constructor.

    Returns:
        pd.DataFrame: The frame built from ``data``.
    """
    return pd.DataFrame(data)
接下来就开始我们的正题了。首先,我们需要判断哪些列的空值过多:当一列数据中空值占总行数的40%以上(经验值)时,这列能够带给我们的信息就不多了,所以我们需要把空值比例超过某个阈值(rate_base)的列干掉,如下:
# Drop features that are mostly missing.
def nan_remove(data, rate_base=0.4):
    """Drop the columns whose share of missing values exceeds ``rate_base``.

    Args:
        data: pd.DataFrame to filter.
        rate_base: Largest tolerated fraction of missing values in a column
            (inclusive); columns above it are removed.

    Returns:
        tuple: ``(data_available, avaiable_index)`` - the frame restricted to
        the retained columns, and the positional indices of those columns.
    """
    row_cnt = data.shape[0]
    if row_cnt == 0:
        # Without rows there is nothing missing; keep every column rather
        # than dividing by zero.
        avaiable_index = list(range(data.shape[1]))
    else:
        # isnull() also handles object/string columns, on which np.isnan
        # raises TypeError.
        nan_rates = data.isnull().sum(axis=0).values / float(row_cnt)
        avaiable_index = [i for i, rate in enumerate(nan_rates) if rate <= rate_base]
    return data.iloc[:, avaiable_index], avaiable_index
把空值过多的列去完之后,我们需要考虑将一些特别离群的点去掉,这边需要注意两点:
- 异常值分析类的场景禁止使用这步,比如信用卡评分,爬虫识别等,你如果采取了这步,还怎么去分离出这些异常啊
- 容忍度高的算法不建议使用这步,比如svm里面已经有了支持向量这个东西,你如果采取了这步的离群识别操作会改变原分布,而且svm里面决定超平面的核心(支持向量)与离群点无关,后接函数会引发意想不到的彩蛋~
这边采取盖帽法与额定的分位点方法,建议组合使用,用changed_feature_box定义需要采用盖帽法的列的index_num,代码如下:
# Cap outliers on the upper tail.
def outlier_remove(data, limit_value=10, method='box', percentile_limit_set=90, changed_feature_box=None):
    """Cap the upper tail of the continuous numeric columns of ``data`` in place.

    A column is processed when it is numeric and has at least ``limit_value``
    distinct values (NaN counts as one value); columns with fewer distinct
    values are treated as categorical and left untouched.  Values above the
    column's upper limit are replaced by that limit.  NaN entries are ignored
    when the limit is computed and stay NaN.

    Args:
        data: pd.DataFrame to modify.
        limit_value: Minimum number of distinct values for a column to be
            considered continuous.
        method: ``'box'`` caps every processed column at the box-plot upper
            fence ``q3 + 1.5 * (q3 - q1)``.  ``'self_def'`` caps at the
            ``percentile_limit_set`` percentile, except for the columns
            listed in ``changed_feature_box``, which use the box-plot fence.
        percentile_limit_set: Percentile (0-100) used by ``'self_def'``.
        changed_feature_box: Positional indices of the columns that use the
            box-plot fence under ``'self_def'``; ``None`` means none.

    Returns:
        tuple: ``(data, feature_change)`` - the same frame object that was
        passed in, and the positional indices of the processed columns.

    Raises:
        ValueError: If ``method`` is neither ``'box'`` nor ``'self_def'``.
    """
    if method not in ('box', 'self_def'):
        # Previously an unknown method fell through and returned None, which
        # broke callers that unpack the result.
        raise ValueError("method must be 'box' or 'self_def', got %r" % (method,))
    box_columns = set() if changed_feature_box is None else set(changed_feature_box)

    feature_change = []
    for i in range(data.shape[1]):
        column = data.iloc[:, i]
        is_number = (pd.api.types.is_numeric_dtype(column)
                     and not pd.api.types.is_bool_dtype(column))
        if not is_number or column.nunique(dropna=False) < limit_value:
            continue

        values = np.asarray(column, dtype=float)
        # nanpercentile: a plain percentile is NaN as soon as one value is
        # missing, which would silently disable the capping.
        if method == 'box' or i in box_columns:
            q1, q3 = np.nanpercentile(values, [25, 75])
            upper = q3 + 1.5 * (q3 - q1)
        else:
            upper = np.nanpercentile(values, percentile_limit_set)

        exceeds = column > upper
        if exceeds.any():
            capped = column.where(~exceeds, upper)
            # Replace the whole column instead of chained indexing, which
            # may write to a temporary copy.
            if hasattr(data, 'isetitem'):
                # pandas >= 1.5: lets an integer column take a fractional cap.
                data.isetitem(i, capped)
            else:
                data.iloc[:, i] = capped
        feature_change.append(i)
    return data, feature_change
在此之后,我们需要对空值进行填充,这边方法就很多很多了,我这边实现的是基本的,分了连续feature和分类feature,分别针对continuous feature采取mean,min,max方式,class feature采取one_hot_encoding的方式;除此之外还可以做分层填充,差分填充等等,那个比较定制化,如果有需要,我也可以搞一套,但是个人觉得意义不大。
# Fill missing values.
def nan_fill(data, limit_value=10, countinuous_dealed_method='mean'):
    """Remove missing values, handling continuous and categorical columns apart.

    Columns without missing values are passed through.  A column with
    missing values is handled according to its number of distinct values
    (NaN counts as one value):

    * at least ``limit_value`` distinct values and numeric: treated as
      continuous, NaN is replaced by the column mean / max / min;
    * fewer than ``limit_value`` distinct values: treated as categorical and
      one-hot encoded with ``pd.get_dummies`` (missing rows get all-zero
      indicators);
    * at least ``limit_value`` distinct values but not numeric: passed
      through unchanged, since no numeric statistic can be computed.

    Args:
        data: pd.DataFrame to process.
        limit_value: Distinct-value threshold separating categorical from
            continuous columns.
        countinuous_dealed_method: ``'mean'``, ``'max'`` or ``'min'``.

    Returns:
        pd.DataFrame: Pass-through columns first, then the filled continuous
        columns, then the one-hot indicator columns.

    Raises:
        ValueError: If ``countinuous_dealed_method`` is not supported
            (previously such a call silently dropped the affected columns).
    """
    if countinuous_dealed_method not in ('mean', 'max', 'min'):
        raise ValueError(
            "countinuous_dealed_method must be 'mean', 'max' or 'min', got %r"
            % (countinuous_dealed_method,))

    normal_index = []
    filled_columns = []
    dummy_frames = []
    for i in range(data.shape[1]):
        column = data.iloc[:, i]
        # isnull() also works for object columns, where np.isnan raises.
        if not column.isnull().any():
            normal_index.append(i)
        elif column.nunique(dropna=False) < limit_value:
            dummy_frames.append(pd.get_dummies(column, prefix=data.columns[i]))
        elif pd.api.types.is_numeric_dtype(column):
            fill_value = getattr(column, countinuous_dealed_method)()
            filled_columns.append(column.fillna(fill_value))
        else:
            normal_index.append(i)

    # Concatenate once at the end instead of growing a frame per column.
    return pd.concat([data.iloc[:, normal_index]] + filled_columns + dummy_frames, axis=1)
分类feature的one hot encoding过程,常见操作,不多说
# One-hot encoding.
def ohe(data, limit_value=10):
    """One-hot encode every low-cardinality column of ``data``.

    A column with fewer than ``limit_value`` distinct values (NaN counts as
    one value) is treated as categorical and replaced by the indicator
    columns from ``pd.get_dummies``, prefixed with the column name.  All
    other columns are kept as they are.

    Args:
        data: pd.DataFrame to encode.
        limit_value: Distinct-value threshold below which a column is
            considered categorical.

    Returns:
        pd.DataFrame: The untouched columns first, followed by the indicator
        columns of each encoded column in original column order.
    """
    normal_index = []
    dummy_frames = []
    for i in range(data.shape[1]):
        column = data.iloc[:, i]
        if column.nunique(dropna=False) < limit_value:
            dummy_frames.append(pd.get_dummies(column, prefix=data.columns[i]))
        else:
            normal_index.append(i)
    # Concatenate once at the end instead of re-copying a growing frame.
    return pd.concat([data.iloc[:, normal_index]] + dummy_frames, axis=1)
正负样本不平衡的解决,这边我写的是smote,理论部分建议参考:Python:SMOTE算法。其实简单的欠抽样和过抽样就可以解决,建议参考这篇文章:Python:数据抽样平衡方法重写。都是一些老生常谈的问题了,不多说了,上代码:
# SMOTE-style oversampling for an unbalanced dataset.
def smote(data, tag_label='tag_1', amount_personal=0, std_rate=5, k=5, method='mean'):
    """Generate synthetic minority-class rows for an unbalanced data set.

    The least and the most frequent values of ``data[tag_label]`` define the
    minority and majority class.  When the majority/minority ratio is below
    ``std_rate`` the input is returned unchanged.  Otherwise synthetic
    minority rows are built from the ``k`` nearest minority neighbours of
    randomly drawn minority rows.

    The returned frame holds the majority rows followed by the synthetic
    rows.  The original minority rows are not carried over, and rows of any
    third class are dropped.

    Args:
        data: Fully numeric pd.DataFrame that contains ``tag_label``.
        tag_label: Name of the label column.
        amount_personal: Number of synthetic rows to generate; when not
            positive, ``int(majority_count / std_rate)`` rows are generated.
        std_rate: Expected majority/minority ratio.  It is both the
            threshold that triggers resampling and the divisor used to
            derive the default amount.
        k: Number of nearest neighbours (the row itself included); clamped
            to the minority class size.
        method: ``'mean'`` uses the centroid of the neighbour group for the
            continuous columns; ``'random'`` interpolates between the drawn
            row and one of its other neighbours.

    Returns:
        pd.DataFrame: ``data`` itself when no resampling is needed, otherwise
        a new frame with the columns of ``data`` and a fresh integer index.

    Raises:
        ValueError: If ``method`` is unknown, or ``std_rate`` / ``k`` is not
            positive.
    """
    if method not in ('mean', 'random'):
        raise ValueError("method must be 'mean' or 'random', got %r" % (method,))
    if std_rate <= 0:
        raise ValueError('std_rate must be positive')
    if k < 1:
        raise ValueError('k must be at least 1')

    cnt = data[tag_label].groupby(data[tag_label]).count()
    # Compare against std_rate rather than a hard-coded 5 so the trigger and
    # the target ratio cannot disagree.
    if len(cnt) == 0 or cnt.max() / float(cnt.min()) < std_rate:
        print('不需要smote过程')
        return data

    # Split by class; ties are resolved by taking the first label.
    minority_label = cnt[cnt == cnt.min()].index[0]
    majority_label = cnt[cnt == cnt.max()].index[0]
    less_data = np.array(data[data[tag_label] == minority_label])
    more_data = np.array(data[data[tag_label] == majority_label])
    n_less, n_cols = less_data.shape

    # Neighbours of every minority row in a single query; a query cannot ask
    # for more neighbours than there are fitted rows.
    finder = NearestNeighbors(n_neighbors=min(k, n_less)).fit(less_data)
    location = finder.kneighbors(less_data, return_distance=False)

    if amount_personal > 0:
        amount = int(np.ceil(amount_personal))
    else:
        amount = int(cnt.max() / std_rate)

    # More than 10 distinct values -> continuous, otherwise categorical.
    continue_index = []
    class_index = []
    for col in range(n_cols):
        if pd.Series(less_data[:, col]).nunique(dropna=False) > 10:
            continue_index.append(col)
        else:
            class_index.append(col)

    # Rows are filled by column position so every value stays under its own
    # column label even when continuous and categorical columns interleave.
    synthetic = np.empty((amount, n_cols), dtype=np.result_type(less_data.dtype, np.float64))
    for times in range(amount):
        pool = np.random.randint(n_less)
        neighbor_idx = location[pool]
        neighbor_group = less_data[neighbor_idx, :]
        if method == 'mean':
            continuous_values = neighbor_group[:, continue_index].mean(axis=0)
        else:
            # Step from the drawn row towards one of its *other* neighbours;
            # the nearest neighbour of a row is normally the row itself.
            base = less_data[pool]
            others = neighbor_idx[neighbor_idx != pool]
            partner = less_data[np.random.choice(others)] if len(others) else base
            continuous_values = base[continue_index] + np.random.rand() * (
                partner[continue_index] - base[continue_index])
        synthetic[times, continue_index] = continuous_values
        # Categorical columns take the most frequent value of the group.
        for col in class_index:
            synthetic[times, col] = pd.Series(neighbor_group[:, col]).mode()[0]
        print('已经生成了%s条新数据,完成百分之%.2f' % (times, times * 100.0 / amount))

    return pd.DataFrame(np.vstack((more_data, synthetic)), columns=data.columns)
一期的内容就这样吧,我感觉也没有啥好说的,都是数据分析挖掘的一些基本操作,我只是为了以后能够复用模版化了,下面贴一个全量我做预处理的过程,没啥差异,整合了一下:
from __future__ import division
import numpy as np
import pandas as pd
from sklearn import preprocessing
from sklearn.cross_validation import train_test_split
from sklearn.neighbors import NearestNeighbors
import sys
__author__ = 'slade_sal'
__time__ = '20171128'
def change_data_format(data):
    """Wrap ``data`` in a pandas DataFrame.

    The preprocessing helpers in this file all operate on DataFrames, so raw
    input is converted here first.

    Args:
        data: Anything accepted by the ``pd.DataFrame`` constructor.

    Returns:
        pd.DataFrame: The frame built from ``data``.
    """
    return pd.DataFrame(data)
# Drop features that are mostly missing.
def nan_remove(data, rate_base=0.4):
    """Drop the columns whose share of missing values exceeds ``rate_base``.

    Args:
        data: pd.DataFrame to filter.
        rate_base: Largest tolerated fraction of missing values in a column
            (inclusive); columns above it are removed.

    Returns:
        tuple: ``(data_available, avaiable_index)`` - the frame restricted to
        the retained columns, and the positional indices of those columns.
    """
    row_cnt = data.shape[0]
    if row_cnt == 0:
        # Without rows there is nothing missing; keep every column rather
        # than dividing by zero.
        avaiable_index = list(range(data.shape[1]))
    else:
        # isnull() also handles object/string columns, on which np.isnan
        # raises TypeError.
        nan_rates = data.isnull().sum(axis=0).values / float(row_cnt)
        avaiable_index = [i for i, rate in enumerate(nan_rates) if rate <= rate_base]
    return data.iloc[:, avaiable_index], avaiable_index
# Cap outliers on the upper tail.
def outlier_remove(data, limit_value=10, method='box', percentile_limit_set=90, changed_feature_box=None):
    """Cap the upper tail of the continuous numeric columns of ``data`` in place.

    A column is processed when it is numeric and has at least ``limit_value``
    distinct values (NaN counts as one value); columns with fewer distinct
    values are treated as categorical and left untouched.  Values above the
    column's upper limit are replaced by that limit.  NaN entries are ignored
    when the limit is computed and stay NaN.

    Args:
        data: pd.DataFrame to modify.
        limit_value: Minimum number of distinct values for a column to be
            considered continuous.
        method: ``'box'`` caps every processed column at the box-plot upper
            fence ``q3 + 1.5 * (q3 - q1)``.  ``'self_def'`` caps at the
            ``percentile_limit_set`` percentile, except for the columns
            listed in ``changed_feature_box``, which use the box-plot fence.
        percentile_limit_set: Percentile (0-100) used by ``'self_def'``.
        changed_feature_box: Positional indices of the columns that use the
            box-plot fence under ``'self_def'``; ``None`` means none.

    Returns:
        tuple: ``(data, feature_change)`` - the same frame object that was
        passed in, and the positional indices of the processed columns.

    Raises:
        ValueError: If ``method`` is neither ``'box'`` nor ``'self_def'``.
    """
    if method not in ('box', 'self_def'):
        # Previously an unknown method fell through and returned None, which
        # broke callers that unpack the result.
        raise ValueError("method must be 'box' or 'self_def', got %r" % (method,))
    box_columns = set() if changed_feature_box is None else set(changed_feature_box)

    feature_change = []
    for i in range(data.shape[1]):
        column = data.iloc[:, i]
        is_number = (pd.api.types.is_numeric_dtype(column)
                     and not pd.api.types.is_bool_dtype(column))
        if not is_number or column.nunique(dropna=False) < limit_value:
            continue

        values = np.asarray(column, dtype=float)
        # nanpercentile: a plain percentile is NaN as soon as one value is
        # missing, which would silently disable the capping.
        if method == 'box' or i in box_columns:
            q1, q3 = np.nanpercentile(values, [25, 75])
            upper = q3 + 1.5 * (q3 - q1)
        else:
            upper = np.nanpercentile(values, percentile_limit_set)

        exceeds = column > upper
        if exceeds.any():
            capped = column.where(~exceeds, upper)
            # Replace the whole column instead of chained indexing, which
            # may write to a temporary copy.
            if hasattr(data, 'isetitem'):
                # pandas >= 1.5: lets an integer column take a fractional cap.
                data.isetitem(i, capped)
            else:
                data.iloc[:, i] = capped
        feature_change.append(i)
    return data, feature_change
# Fill missing values.
def nan_fill(data, limit_value=10, countinuous_dealed_method='mean'):
    """Remove missing values, handling continuous and categorical columns apart.

    Columns without missing values are passed through.  A column with
    missing values is handled according to its number of distinct values
    (NaN counts as one value):

    * at least ``limit_value`` distinct values and numeric: treated as
      continuous, NaN is replaced by the column mean / max / min;
    * fewer than ``limit_value`` distinct values: treated as categorical and
      one-hot encoded with ``pd.get_dummies`` (missing rows get all-zero
      indicators);
    * at least ``limit_value`` distinct values but not numeric: passed
      through unchanged, since no numeric statistic can be computed.

    Args:
        data: pd.DataFrame to process.
        limit_value: Distinct-value threshold separating categorical from
            continuous columns.
        countinuous_dealed_method: ``'mean'``, ``'max'`` or ``'min'``.

    Returns:
        pd.DataFrame: Pass-through columns first, then the filled continuous
        columns, then the one-hot indicator columns.

    Raises:
        ValueError: If ``countinuous_dealed_method`` is not supported
            (previously such a call silently dropped the affected columns).
    """
    if countinuous_dealed_method not in ('mean', 'max', 'min'):
        raise ValueError(
            "countinuous_dealed_method must be 'mean', 'max' or 'min', got %r"
            % (countinuous_dealed_method,))

    normal_index = []
    filled_columns = []
    dummy_frames = []
    for i in range(data.shape[1]):
        column = data.iloc[:, i]
        # isnull() also works for object columns, where np.isnan raises.
        if not column.isnull().any():
            normal_index.append(i)
        elif column.nunique(dropna=False) < limit_value:
            dummy_frames.append(pd.get_dummies(column, prefix=data.columns[i]))
        elif pd.api.types.is_numeric_dtype(column):
            fill_value = getattr(column, countinuous_dealed_method)()
            filled_columns.append(column.fillna(fill_value))
        else:
            normal_index.append(i)

    # Concatenate once at the end instead of growing a frame per column.
    return pd.concat([data.iloc[:, normal_index]] + filled_columns + dummy_frames, axis=1)
# One-hot encoding.
def ohe(data, limit_value=10):
    """One-hot encode every low-cardinality column of ``data``.

    A column with fewer than ``limit_value`` distinct values (NaN counts as
    one value) is treated as categorical and replaced by the indicator
    columns from ``pd.get_dummies``, prefixed with the column name.  All
    other columns are kept as they are.

    Args:
        data: pd.DataFrame to encode.
        limit_value: Distinct-value threshold below which a column is
            considered categorical.

    Returns:
        pd.DataFrame: The untouched columns first, followed by the indicator
        columns of each encoded column in original column order.
    """
    normal_index = []
    dummy_frames = []
    for i in range(data.shape[1]):
        column = data.iloc[:, i]
        if column.nunique(dropna=False) < limit_value:
            dummy_frames.append(pd.get_dummies(column, prefix=data.columns[i]))
        else:
            normal_index.append(i)
    # Concatenate once at the end instead of re-copying a growing frame.
    return pd.concat([data.iloc[:, normal_index]] + dummy_frames, axis=1)
# SMOTE-style oversampling for an unbalanced dataset.
def smote(data, tag_label='tag_1', amount_personal=0, std_rate=5, k=5, method='mean'):
    """Generate synthetic minority-class rows for an unbalanced data set.

    The least and the most frequent values of ``data[tag_label]`` define the
    minority and majority class.  When the majority/minority ratio is below
    ``std_rate`` the input is returned unchanged.  Otherwise synthetic
    minority rows are built from the ``k`` nearest minority neighbours of
    randomly drawn minority rows.

    The returned frame holds the majority rows followed by the synthetic
    rows.  The original minority rows are not carried over, and rows of any
    third class are dropped.

    Args:
        data: Fully numeric pd.DataFrame that contains ``tag_label``.
        tag_label: Name of the label column.
        amount_personal: Number of synthetic rows to generate; when not
            positive, ``int(majority_count / std_rate)`` rows are generated.
        std_rate: Expected majority/minority ratio.  It is both the
            threshold that triggers resampling and the divisor used to
            derive the default amount.
        k: Number of nearest neighbours (the row itself included); clamped
            to the minority class size.
        method: ``'mean'`` uses the centroid of the neighbour group for the
            continuous columns; ``'random'`` interpolates between the drawn
            row and one of its other neighbours.

    Returns:
        pd.DataFrame: ``data`` itself when no resampling is needed, otherwise
        a new frame with the columns of ``data`` and a fresh integer index.

    Raises:
        ValueError: If ``method`` is unknown, or ``std_rate`` / ``k`` is not
            positive.
    """
    if method not in ('mean', 'random'):
        raise ValueError("method must be 'mean' or 'random', got %r" % (method,))
    if std_rate <= 0:
        raise ValueError('std_rate must be positive')
    if k < 1:
        raise ValueError('k must be at least 1')

    cnt = data[tag_label].groupby(data[tag_label]).count()
    # Compare against std_rate rather than a hard-coded 5 so the trigger and
    # the target ratio cannot disagree.
    if len(cnt) == 0 or cnt.max() / float(cnt.min()) < std_rate:
        print('不需要smote过程')
        return data

    # Split by class; ties are resolved by taking the first label.
    minority_label = cnt[cnt == cnt.min()].index[0]
    majority_label = cnt[cnt == cnt.max()].index[0]
    less_data = np.array(data[data[tag_label] == minority_label])
    more_data = np.array(data[data[tag_label] == majority_label])
    n_less, n_cols = less_data.shape

    # Neighbours of every minority row in a single query; a query cannot ask
    # for more neighbours than there are fitted rows.
    finder = NearestNeighbors(n_neighbors=min(k, n_less)).fit(less_data)
    location = finder.kneighbors(less_data, return_distance=False)

    if amount_personal > 0:
        amount = int(np.ceil(amount_personal))
    else:
        amount = int(cnt.max() / std_rate)

    # More than 10 distinct values -> continuous, otherwise categorical.
    continue_index = []
    class_index = []
    for col in range(n_cols):
        if pd.Series(less_data[:, col]).nunique(dropna=False) > 10:
            continue_index.append(col)
        else:
            class_index.append(col)

    # Rows are filled by column position so every value stays under its own
    # column label even when continuous and categorical columns interleave.
    synthetic = np.empty((amount, n_cols), dtype=np.result_type(less_data.dtype, np.float64))
    for times in range(amount):
        pool = np.random.randint(n_less)
        neighbor_idx = location[pool]
        neighbor_group = less_data[neighbor_idx, :]
        if method == 'mean':
            continuous_values = neighbor_group[:, continue_index].mean(axis=0)
        else:
            # Step from the drawn row towards one of its *other* neighbours;
            # the nearest neighbour of a row is normally the row itself.
            base = less_data[pool]
            others = neighbor_idx[neighbor_idx != pool]
            partner = less_data[np.random.choice(others)] if len(others) else base
            continuous_values = base[continue_index] + np.random.rand() * (
                partner[continue_index] - base[continue_index])
        synthetic[times, continue_index] = continuous_values
        # Categorical columns take the most frequent value of the group.
        for col in class_index:
            synthetic[times, col] = pd.Series(neighbor_group[:, col]).mode()[0]
        print('已经生成了%s条新数据,完成百分之%.2f' % (times, times * 100.0 / amount))

    return pd.DataFrame(np.vstack((more_data, synthetic)), columns=data.columns)
# Split the frame into features and label.
def reload(data, tag_index=3, excluded_index=(2,)):
    """Separate ``data`` into a feature frame and a label column.

    With the defaults the label is the column at position 3 and the features
    are the columns at positions 0, 1 and 4 onwards; the column at position
    2 is discarded.

    Args:
        data: pd.DataFrame holding features and label.
        tag_index: Position of the label column.
        excluded_index: Positions of columns that are neither feature nor
            label; positions outside the frame are ignored.

    Returns:
        tuple: ``(feature, tag)`` - a DataFrame with the feature columns in
        their original order, and the label as a Series.

    Raises:
        IndexError: If ``tag_index`` is outside the frame.
    """
    n_cols = data.shape[1]
    positions = list(range(n_cols))
    tag_position = positions[tag_index]
    dropped = {tag_position}
    dropped.update(positions[p] for p in excluded_index if -n_cols <= p < n_cols)
    feature = data.iloc[:, [p for p in positions if p not in dropped]]
    tag = data.iloc[:, tag_position]
    return feature, tag
# Train/test split.
def split_data(feature, tag, test_size=0.33, random_state=42):
    """Split features and labels into training and test subsets.

    Args:
        feature: Feature matrix (e.g. a pd.DataFrame).
        tag: Labels aligned with ``feature``.
        test_size: Share of the rows assigned to the test subset.
        random_state: Seed forwarded to ``train_test_split`` so the split is
            reproducible.

    Returns:
        tuple: ``(X_train, X_test, y_train, y_test)``.
    """
    # sklearn.cross_validation was removed in scikit-learn 0.20; prefer its
    # replacement and fall back only on old installations.
    try:
        from sklearn.model_selection import train_test_split
    except ImportError:
        from sklearn.cross_validation import train_test_split
    X_train, X_test, y_train, y_test = train_test_split(
        feature, tag, test_size=test_size, random_state=random_state)
    return X_train, X_test, y_train, y_test
if __name__ == '__main__':
    # sys.argv[0] is the path of this script itself; the data file is the
    # first real command-line argument.
    if len(sys.argv) < 2:
        sys.exit('usage: python %s <tab-separated data file>' % sys.argv[0])
    path = sys.argv[1]
    data_all = pd.read_table(str(path))
    print('数据读取完成!')
    # Make sure we work on a DataFrame.
    data_all = change_data_format(data_all)
    # Drop the first column (phone number).
    data_all = data_all.iloc[:, 1:]
    # Remove columns that are mostly missing.
    data_all, data_avaiable_index = nan_remove(data_all)
    print('空值列处理完毕!')
    # Cap outliers of the continuous columns.
    data_all, _ = outlier_remove(data_all)
    print('异常点处理完成!')
    # Fill / encode the remaining missing values.
    data_all = nan_fill(data_all)
    print('空值填充完成!')
    # One-hot encode the low-cardinality columns.
    data_all = ohe(data_all)
    print('onehotencoding 完成!')
    # Rebalance the classes.
    data_all = smote(data_all)
    print('smote过程完成!')
    # Separate features from the label and build the train/test split.
    feature, tag = reload(data_all)
    X_train, X_test, y_train, y_test = split_data(feature, tag)
    print('数据预处理完成!')
大家自取自用,这个也没啥好转载的,没啥干货,只是方便大家日常工作,就别转了,谢谢各位编辑大哥了。
最后,感谢大家阅读,谢谢。
欢迎大家关注我的个人blog,更多代码内容欢迎follow我的个人Github,如果有任何算法、代码疑问都欢迎通过公众号发消息给我哦。