1.pandas中的列的分位数
# 查看列的分位数
import pandas as pd
# set columns type
my_df['col'] = my_df['col'].astype(np.float64)
# computations for 4 quantiles : quartiles
bins_col = pd.qcut(my_df['col'], 4)
bins_col_label = pd.qcut(my_df['col'], 4).labels
2.多重聚合(组函数)
# 多重聚合(组函数)
# columns settings
grouped_on = 'col_0' # ['col_0', 'col_2'] for multiple columns
aggregated_column = 'col_1'
### Choice of aggregate functions
## On non-NA values in the group
## - numeric choice :: mean, median, sum, std, var, min, max, prod
## - group choice :: first, last, count
# list of functions to compute
agg_funcs = ['mean', 'max']
# compute aggregate values
aggregated_values = my_df.groupby(grouped_on)[aggregated_columns].agg(agg_funcs)
# get the aggregate of group
aggregated_values.ix[group]
3.使用自定义函数进行聚合
# 使用自定义函数进行聚合
# columns settings
grouped_on = ['col_0']
aggregated_columns = ['col_1']
def my_func(my_group_array):
return my_group_array.min() * my_group_array.count()
## list of functions to compute
agg_funcs = [my_func] # could be many
# compute aggregate values
aggregated_values = my_df.groupby(grouped_on)[aggregated_columns].agg(agg_funcs)
4.在聚合的dataframe上使用apply
# 在聚合的dataframe上使用apply
# top n in aggregate dataframe
def top_n(group_df, col, n=2):
bests = group_df[col].value_counts()[:n]
return bests
# columns settings
grouped_on = 'col_0'
aggregated_column = 'col'
grouped = my_df.groupby(grouped_on)
groups_top_n = grouped.apply(top_n, aggregated_column, n=3)
5.移动平均
# 移动平均
import numpy as np
ret = np.cumsum(np.array(X), dtype=float)
ret[w:] = ret[w:] - ret[:-w]
result = ret[w - 1:] / w
# X: array-like
# window: int
6.组数据的基本信息
# 组数据的基本信息
# columns settings
grouped_on = 'col_0' # ['col_0', 'col_1'] for multiple columns
aggregated_column = 'col_1'
### Choice of aggregate functions
## On non-NA values in the group
## - numeric choice : mean, median, sum, std, var, min, max, prod
## - group choice : first, last, count
## On the group lines
## - size of the group : size
aggregated_values = my_df.groupby(grouped_on)[aggregated_column].mean()
aggregated_values.name = 'mean'
# get the aggregate of group
aggregated_values.ix[group]
7.数据组的遍历
# 数据组的遍历
# columns settings
grouped_on = 'col_0' # ['col_0', 'col_1'] for multiple columns
grouped = my_df.groupby(grouped_on)
i = 0
for group_name, group_dataframe in grouped:
if i > 10:
break
i += 1
print(i, group_name, group_dataframe.mean()) ## mean on all numerical columns
8.最大互信息数
# 最大互信息数
import numpy as np
matrix = np.transpose(np.array(X)).astype(float)
mine = MINE(alpha=0.6, c=15, est="mic_approx")
mic_result = []
for i in matrix[1:]:
mine.compute_score(t_matrix[0], i)
mic_result.append(mine.mic())
return mic_result
最大互信息数
9.pearson相关系数
import numpy as np
matrix = np.transpose(np.array(X))
np.corrcoef(matrix[0], matrix[1])[0, 1]
# X: array-like
# https://docs.scipy.org/doc/numpy-1.13.0/reference/generated/numpy.corrcoef.html
10.自定义聚合函数
# 自定义聚合函数
def zscore(x):
return (x - x.mean()) / x.std()
my_df['zscore_col'] = my_df.groupby(grouped_on)[aggregated_column].transform(zscore)
11.标准聚合使用groupby
# 标准聚合使用groupby
# columns settings
grouped_on = 'col_1'
aggregated_column = 'col_0'
### Choice of aggregate functions
## On non-NA values in the group
## - numeric choice : mean, median, sum, std, var, min, max, prod
## - group choice : first, last, count
my_df['aggregate_values_on_col'] = my_df.groupby(grouped_on)[aggregated_column].transform(lambda v: v.mean())
12.使用自定义函数设值
# 使用自定义函数设值
def to_log(v):
try:
return log(v)
except:
return np.nan
my_df['new_col'] = my_df['col_0'].map(to_log)
13.使用复杂函数设值
# 使用复杂的函数设值
import numpy as np
def complex_formula(col0_value, col1_value):
return "%s (%s)" % (col0_value, col1_value)
my_df['new_col'] = np.vectorize(complex_formula)(my_df['col_0'], my_df['col_1'])
使用复杂函数设值
14.使用字典dict设值
# 使用字典dict设值
gender_dict={'男':1,'女':2}
df['gender'] = df['gender'].map(gender_dict)
使用字典设值参考信息:https://www.kesci.com/