python按照某个字段,对目标按照给定的比例进行分层抽样

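    # data: DataFrame to sample from
    # df_col_partition: name of the stratification column
    # df_col_mark: name of the target-flag column (1 = target, 0 = non-target)
    # multiply: desired ratio of non-target rows to target rows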
    def stratifiedSampling(data, df_col_partition, df_col_mark, multiply,
                           random_state=None):
        """Down-sample non-target rows, stratified by a partition column.

        All rows whose ``df_col_mark`` equals 1 (targets) are kept.  Rows whose
        ``df_col_mark`` equals 0 (non-targets) are sampled without replacement
        so that their total is about ``multiply`` times the number of targets,
        with each stratum of ``df_col_partition`` contributing in proportion
        to its share of the non-target rows.  If there are fewer non-target
        rows than ``multiply * targets``, every non-target row is kept.

        Args:
            data: DataFrame to sample from.
            df_col_partition: Name of the column that defines the strata.
            df_col_mark: Name of the flag column (1 = target, 0 = non-target).
            multiply: Desired ratio of non-target rows to target rows.
            random_state: Optional seed / random state forwarded unchanged to
                ``DataFrame.sample`` for every stratum, for reproducibility.

        Returns:
            A new DataFrame holding the sampled non-target rows (ordered by
            stratum) followed by all target rows.  Original index labels are
            preserved.
        """
        df_0 = data[data[df_col_mark] == 0]
        df_1 = data[data[df_col_mark] == 1]

        # Cap the non-target quota at what is actually available.  Using
        # min() instead of a ratio test avoids dividing by zero when there
        # are no target rows.
        select_0 = min(multiply * len(df_1), len(df_0))

        parts = []
        if select_0 > 0:
            grouped = df_0.groupby(by=df_col_partition)
            sizes = grouped.size()
            # Proportional allocation; rounding means the final total can
            # differ from select_0 by a few rows.
            quotas = (sizes / sizes.sum() * select_0).round().fillna(0)
            for key, stratum in grouped:
                # Never request more rows than the stratum holds.
                quota = min(int(quotas.loc[key]), len(stratum))
                if quota > 0:
                    parts.append(
                        stratum.sample(
                            n=quota, replace=False, random_state=random_state
                        )
                    )

        parts.append(df_1)
        # pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
        return pd.concat(parts)

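该方法返回一个 DataFrame:其中包含全部目标样本(df_col_mark 为 1),以及按 df_col_partition 分层抽取的非目标样本(df_col_mark 为 0);非目标样本数约为目标样本数的 multiply 倍,非目标样本不足时则全部保留。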

你可能感兴趣的:(数据挖掘,数据挖掘,数据分析,人工智能)