aggs_num = {'num': ['mean', 'var', 'max', 'min', 'skew', 'median', 'Q1', 'Q2']}  # Q1 is the lower quartile (25%), Q2 the upper quartile (75%)
aggs_cat = {'cat': ['mean', 'var', 'max', 'min', 'median', 'count', 'nunique', 'Q1', 'Q2']}
The statistics above fit the general case. If a continuous variable has only a small number of distinct values (a dozen or a few dozen), indicators such as count and nunique can also be used for it; if a categorical variable has many levels (more than 5), skewness can also be used. Note that Q1 and Q2 are not built-in aggregation names in pandas' agg, so they have to be defined as custom functions.
To adapt to these different situations, the function takes continuous and categorical variables as separate inputs, and the statistics computed per group for each type of variable can also be customized.
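Before looking at the full function, here is a minimal sketch of that custom-quantile point (assuming the Telco dataset used in the test code at the end of this section is available): built-in statistics are passed to agg by name, while the quartiles have to be passed as callables.

import pandas as pd

def q1(x):
    return x.quantile(0.25)  # lower quartile; 'Q1' is not a built-in agg name

def q2(x):
    return x.quantile(0.75)  # upper quartile

df = pd.read_csv('WA_Fn-UseC_-Telco-Customer-Churn.csv')
# built-in statistics are referenced by name, quartiles as custom callables
print(df.groupby('tenure')['MonthlyCharges'].agg(['mean', 'var', q1, q2]).head())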
import pandas as pd


def Binary_Group_Statistics(keyCol,
                            features,
                            col_num=None,
                            col_cat=None,
                            num_stat=['mean', 'var', 'max', 'min', 'skew', 'median'],
                            cat_stat=['mean', 'var', 'max', 'min', 'median', 'count', 'nunique'],
                            quant=True):
    '''
    Two-variable group-statistics feature derivation.

    :param keyCol: key variable used for grouping, i.e. groupby(keyCol)
    :param features: original dataset, DataFrame
    :param col_num: continuous variables to derive from, list
    :param col_cat: categorical variables to derive from, list
    :param num_stat: group statistics for the continuous variables
    :param cat_stat: group statistics for the categorical variables
    :param quant: whether to compute the quartiles (custom statistics)
    :return: the derived features and the list of their column names
    (at least one of col_num / col_cat must be provided)
    '''
    # Case 1: continuous features are provided
    if col_num is not None:
        aggs_num = {}  # argument for agg() -> {'income': ['min', 'max', ...]}
        colNames = col_num
        # Build the dict required by the agg method
        for col in col_num:
            aggs_num[col] = num_stat
        # Build the list of derived-feature names for the continuous variables
        cols_num = [keyCol]
        for key in aggs_num.keys():
            cols_num.extend([key + '_' + keyCol + '_' + stat for stat in aggs_num[key]])
        # Build the DataFrame of derived continuous features
        features_num_new = features[col_num + [keyCol]].groupby(keyCol).agg(aggs_num).reset_index()
        features_num_new.columns = cols_num

        # Case 1a: both continuous and categorical features are provided
        if col_cat is not None:
            aggs_cat = {}
            colNames = col_num + col_cat
            # Build the dict required by the agg method
            for col in col_cat:
                aggs_cat[col] = cat_stat
            # Build the list of derived-feature names for the categorical variables
            cols_cat = [keyCol]
            for key in aggs_cat.keys():
                cols_cat.extend([key + '_' + keyCol + '_' + stat for stat in aggs_cat[key]])
            # Build the DataFrame of derived categorical features
            features_cat_new = features[col_cat + [keyCol]].groupby(keyCol).agg(aggs_cat).reset_index()
            features_cat_new.columns = cols_cat

            # Merge the continuous and categorical derivation results
            df_temp = pd.merge(features_num_new, features_cat_new, how='left', on=keyCol)
            features_new = pd.merge(features[[keyCol]], df_temp, how='left', on=keyCol)
            features_new = features_new.loc[:, ~features_new.columns.duplicated()]  # drop duplicated column names
            colNames_new = cols_num + cols_cat
            colNames_new.remove(keyCol)
            colNames_new.remove(keyCol)

        # Case 1b: only continuous features
        else:
            # Merge the continuous derivation result back onto the original rows, then drop duplicated columns
            features_new = pd.merge(features[[keyCol]], features_num_new, how='left', on=keyCol)
            features_new = features_new.loc[:, ~features_new.columns.duplicated()]
            colNames_new = cols_num
            colNames_new.remove(keyCol)

    # Case 2: no continuous features are provided
    else:
        # Only categorical features
        if col_cat is not None:
            aggs_cat = {}
            colNames = col_cat
            for col in col_cat:
                aggs_cat[col] = cat_stat
            cols_cat = [keyCol]
            for key in aggs_cat.keys():
                cols_cat.extend([key + '_' + keyCol + '_' + stat for stat in aggs_cat[key]])
            features_cat_new = features[col_cat + [keyCol]].groupby(keyCol).agg(aggs_cat).reset_index()
            features_cat_new.columns = cols_cat
            features_new = pd.merge(features[[keyCol]], features_cat_new, how='left', on=keyCol)
            features_new = features_new.loc[:, ~features_new.columns.duplicated()]
            colNames_new = cols_cat
            colNames_new.remove(keyCol)

    # Custom statistics (quartiles)
    if quant:
        # Quartile helper functions
        def q1(x):
            '''Lower quartile (25%)'''
            return x.quantile(0.25)

        def q2(x):
            '''Upper quartile (75%)'''
            return x.quantile(0.75)

        aggs = {}
        for col in colNames:
            aggs[col] = ['q1', 'q2']  # first pass: only to build the column names
        cols = [keyCol]
        for key in aggs.keys():
            cols.extend([key + '_' + keyCol + '_' + stat for stat in aggs[key]])

        aggs = {}
        for col in colNames:
            aggs[col] = [q1, q2]  # second pass: the actual groupby computation
        features_temp = features[colNames + [keyCol]].groupby(keyCol).agg(aggs).reset_index()
        features_temp.columns = cols

        features_new = pd.merge(features_new, features_temp, how='left', on=keyCol)
        features_new = features_new.loc[:, ~features_new.columns.duplicated()]
        colNames_new = colNames_new + cols
        colNames_new.remove(keyCol)

    features_new.drop([keyCol], axis=1, inplace=True)
    return features_new, colNames_new
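The core mechanism of the function is: groupby(keyCol).agg(...) yields one row per group, the columns are renamed to the col_keyCol_stat pattern, and a left merge on keyCol broadcasts each group's statistics back to every original row. A toy sketch of just that step, with made-up data:

import pandas as pd

toy = pd.DataFrame({'tenure': [1, 1, 2, 2, 2],
                    'MonthlyCharges': [20.0, 30.0, 50.0, 70.0, 90.0]})
# group-level statistics: one row per tenure value
stats = toy.groupby('tenure').agg({'MonthlyCharges': ['mean', 'max']}).reset_index()
stats.columns = ['tenure', 'MonthlyCharges_tenure_mean', 'MonthlyCharges_tenure_max']
# a left merge on the key broadcasts each group's statistics back to every row of that group
row_level = pd.merge(toy[['tenure']], stats, how='left', on='tenure')
print(row_level)

Merging back on the key (rather than using groupby().transform) makes it easy to attach many statistics for many columns in a single pass while keeping an explicit list of the new column names.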
Test code:
features = pd.read_csv('WA_Fn-UseC_-Telco-Customer-Churn.csv')  # 7043 rows x 21 columns, the original dataset
col_num = ['MonthlyCharges']  # list of continuous variables
col_cat = ['SeniorCitizen']   # list of categorical variables
keyCol = 'tenure'             # grouping variable
df, col = Binary_Group_Statistics(keyCol, features, col_num, col_cat)
df.head(5)
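A quick sanity check one can run on the result (assuming features, df and col from the test code above, where col[0] is the first derived column, 'MonthlyCharges_tenure_mean'): rows sharing the same tenure value should carry identical derived statistics, so the group-wise number of distinct values should be 1.

# Sanity check: derived statistics should be constant within each tenure group
check = pd.concat([features['tenure'], df], axis=1)
print(check.groupby('tenure')[col[0]].nunique().max())  # expected output: 1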