Pandas Note

  • 整型变量分组
# Bin edges: every pair of adjacent numbers forms one left-open,
# right-closed interval, e.g. (-1, 3], (3, 11], ..., (80, 100].
bins = [-1, 3, 11, 17, 29, 40, 55, 65, 80, 100]
# One label per interval (there is one fewer interval than edges).
labels = [f'age_group{i}' for i, _ in enumerate(bins[1:])]

# Bucket each age into its interval and keep the bucket name as a plain string
# rather than a categorical value.
df['age_group'] = pd.cut(df['age'], bins, labels=labels).astype(str)
# Append one indicator (one-hot) column per distinct bucket name.
df = df.join(pd.get_dummies(df['age_group']))
  • 分层采样
def stratified_sample_df(df, col, n_samples, random_state=None):
    """Draw the same number of random rows from every group of ``col``.

    The per-group sample size is ``n_samples`` capped at the size of the
    smallest group, so every group contributes equally and sampling without
    replacement never asks for more rows than a group holds.

    Args:
        df: DataFrame to sample from.
        col: Name of the column whose distinct values define the strata.
        n_samples: Desired number of rows per stratum (upper bound).
        random_state: Optional seed / random state forwarded to pandas'
            sampler to make the draw reproducible. Defaults to ``None``
            (non-deterministic), matching the previous behaviour.

    Returns:
        A DataFrame containing the sampled rows with all original columns,
        indexed by their original index labels. An empty frame (same columns)
        is returned when ``col`` has no non-null values.
    """
    group_sizes = df[col].value_counts()
    if group_sizes.empty:
        # With no groups, min() would be NaN and min(n_samples, NaN) would
        # silently fall back to n_samples; there is nothing to sample anyway.
        return df.iloc[0:0]

    n = min(n_samples, int(group_sizes.min()))
    # GroupBy.sample keeps the original index and every column, so there is no
    # need for groupby.apply (whose handling of the grouping column is
    # deprecated in recent pandas) followed by dropping the group-key level.
    return df.groupby(col).sample(n=n, random_state=random_state)

你可能感兴趣的:(Practice)