友情提示(因为是实战嘛,直译就是自己实现,引用几乎没有介绍就是应付凑章节的可跳)。首先统计学中的Bootstrap就是一种获得样本的抽样方法。Bootstrap一般的抽样方式就是“有放回地全抽”。一次性全量获取数据来训练模型或抽样,那么我们就只能得到一个模型或一个分布结果。但是多次“有放回地全抽”就可以得到多个模型和分布结果。这样多个模型结果可以更好地实现评估,削弱过拟合的影响。多个分布结果可以帮助我们观察样本本身的分布是否均匀。
其实Why上面也已经说到了,为了对所有采样结果有更丰富的认知。机器学习上为了对单一训练数据集生成更多模型更好获得可泛化的模型。稍微多说说在机器学习上的影响。在集成学习(Ensemble learning)的范畴里 Bootstrap直接派生出了Bagging模型。其它方法或多或少都涉及重采样,只是在重新获取满足什么标准的数据上做文章。Boosting思想的核心虽然并非重采样,根据模型结果再次采样运算也是不可或缺。Stacking则是应用了不同采样集合和不同模型的排列组合。而Adaboost这种集成学习法已经在神经网络的梯度下降过程中有了广泛应用。
逼逼那么多,毕竟是实战,代码才是重点。注释都在里面了。
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Tue Jun 18 14:52:49 2019
@author: aixoum
"""
import time
import os
import sys
import itertools
# package path
sys.path.append("/")
import pandas
import random
from collections import Counter
# a function to help us get the in-group and out-group indexes out of realidx
def myboot(df, iter_num, iter_group_num):
    """Bootstrap-sample paired rows of *df* into in-group/out-group indexes.

    Rows are assumed to come in consecutive pairs that share a value in the
    ``realidx`` column (two samples per group).  For each of *iter_num*
    rounds, *iter_group_num* groups are drawn at random; one row of every
    drawn pair goes to the in-group, the other to the out-group.

    Parameters
    ----------
    df : pandas.DataFrame
        Data carrying a ``realidx`` column that maps each row to its group id.
    iter_num : int
        Number of bootstrap rounds.
    iter_group_num : int
        Number of groups (pairs) sampled per round.

    Returns
    -------
    (list, list)
        Concatenated in-group and out-group row indexes over all rounds;
        each list holds ``iter_num * iter_group_num`` entries.

    Raises
    ------
    ValueError
        If a round does not yield exactly *iter_group_num* in-group rows
        (i.e. the pair structure of *df* is broken).
    """
    in_month_list = []
    out_month_list = []
    print("start bootstrap sample iteration-------------------------------------")
    for _ in range(iter_num):
        # Draw iter_group_num distinct group ids; each group owns 2 rows,
        # so there are df.shape[0] // 2 groups overall.
        group_idx = random.sample(range(0, df.shape[0] // 2), iter_group_num)
        month_idx = df[df.realidx.isin(group_idx)].index.tolist()
        # From every consecutive pair pick one row at random for the
        # in-group.  (The original drew one extra random number before the
        # loop and one after the last pair; one draw per pair is
        # equivalent in distribution and simpler.)
        in_month_idx = [
            month_idx[pair_start + random.randint(0, 1)]
            for pair_start in range(0, len(month_idx), 2)
        ]
        # Explicit check instead of `assert`, which disappears under -O.
        if len(in_month_idx) != iter_group_num:
            raise ValueError(
                "expected %d in-group indexes, got %d"
                % (iter_group_num, len(in_month_idx))
            )
        # The untouched row of each pair forms the out-group.
        out_month_idx = list(set(month_idx).difference(set(in_month_idx)))
        in_month_list.extend(in_month_idx)
        out_month_list.extend(out_month_idx)
    print("end iteration")
    return in_month_list, out_month_list
def calculate(index_list, df, filed_list, sample_num):
    """Sum per-field mean/median/std over a weighted multiset of rows.

    Parameters
    ----------
    index_list : list
        Row indexes into *df*; duplicates count with multiplicity (they are
        bootstrap draws), hence the Counter below.
    df : pandas.DataFrame
        Source data.
    filed_list : list of str
        Column names to aggregate ("filed" [sic] is the original spelling;
        kept so keyword callers don't break).  BUGFIX: the original ignored
        this parameter and read the *global* ``field_list`` instead.
    sample_num : int
        Total number of (possibly repeated) samples, i.e. ``len(index_list)``.

    Returns
    -------
    (float, float, float)
        Sum over the fields of the mean, the median and the population
        standard deviation.
    """

    def _mean(index_count_map, field):
        # Weighted mean: each row's value counts as often as it was drawn.
        total = 0
        for row, mult in index_count_map.items():
            total += df[field][row] * mult
        return total / sample_num

    def _median(index_count_map, field):
        # Build value -> total multiplicity (BUGFIX: the original
        # overwrote the count when two rows shared a value), then walk the
        # sorted values to locate the elements at positions
        # (sample_num-1)//2 and sample_num//2 of the expanded multiset.
        # Averaging those two also fixes the original IndexError raised
        # when the median fell on the last distinct value.
        value_counts = {}
        for row, mult in index_count_map.items():
            val = df[field][row]
            value_counts[val] = value_counts.get(val, 0) + mult
        lo_pos = (sample_num - 1) // 2
        hi_pos = sample_num // 2
        lo_val = hi_val = None
        seen = 0
        for val in sorted(value_counts):
            seen += value_counts[val]
            if lo_val is None and seen > lo_pos:
                lo_val = val
            if seen > hi_pos:
                hi_val = val
                break
        return (lo_val + hi_val) / 2

    def _std(index_count_map, field, mean):
        # Population standard deviation around the field's own mean,
        # iterating rows directly so duplicate values keep their weight.
        variance = 0
        for row, mult in index_count_map.items():
            variance += (df[field][row] - mean) ** 2 * mult
        return (variance / sample_num) ** 0.5

    index_count_map = Counter(index_list)
    res_mean = 0
    res_median = 0
    res_std = 0
    for field in filed_list:  # BUGFIX: iterate the parameter, not a global
        field_mean = _mean(index_count_map, field)
        res_mean += field_mean
        res_median += _median(index_count_map, field)
        # BUGFIX: accumulate with += and centre the std on the field's
        # mean; the original overwrote res_std each pass and passed the
        # running *median* sum as the centre.
        res_std += _std(index_count_map, field, field_mean)
    return res_mean, res_median, res_std
if __name__ == "__main__":
    start = time.time()
    print("start to run program ==============================================")
    # change the path to your own directory
    os.chdir("/")
    # Command-line arguments:
    #   sys.argv[1] : number of bootstrap rounds (iteration number)
    #   sys.argv[2] : number of sample groups drawn per round
    #   sys.argv[3] : path of the Excel data file
    # (The original in-code docstring documented only argv[1]/argv[2],
    #  but the data path is actually read from argv[3].)
    path = sys.argv[3]
    df = pandas.read_excel(path)
    print("load data %s" % path)
    print("the shape of data is %s * %s" % (str(df.shape[0]), str(df.shape[1])))
    print("recreate index for the data")
    # Tag every consecutive pair of rows with a shared group id: 0,0,1,1,...
    real_idx = [i for i in range(df.shape[0] // 2) for _ in range(2)]
    df["realidx"] = real_idx
    # BUGFIX: the synthetic "realidx" column must not take part in the
    # statistics; the original included it in the field combinations.
    field_list = [col for col in df.columns if col != "realidx"]
    # Every non-empty combination of the data fields (sizes 1..len), so the
    # same maximum combination size is covered as before the fix.
    field_list_combs = [
        list(comb)
        for size in range(1, len(field_list) + 1)
        for comb in itertools.combinations(field_list, size)
    ]
    iter_num = int(sys.argv[1])
    iter_group_num = int(sys.argv[2])
    in_month_list, out_month_list = myboot(df, iter_num, iter_group_num)
    all_list = df.index.tolist()
    # Total number of bootstrap samples drawn into each group overall.
    sample_num = iter_num * iter_group_num
    # NOTE(review): append mode means a rerun writes a second header row
    # into res.csv — confirm whether accumulating across runs is intended.
    with open("res.csv", "a") as f:
        # csv head (tab separated; kept byte-identical to the original)
        f.write("filed_combinations\tin_group_mean\tin_group_median\tin_group_std\t")
        f.write("out_group_mean\tout_group_median\tout_group_std\t")
        f.write("all_group_mean\tall_group_median\tall_group_std\t\n")
        # Walk the combinations in the same LIFO order as the original
        # pop() loop.  (Assigning the loop variable `field_list` also keeps
        # the module-level name updated, which the original calculate()
        # implicitly read.)
        for field_list in reversed(field_list_combs):
            in_mean, in_median, in_std = calculate(in_month_list, df, field_list, sample_num)
            out_mean, out_median, out_std = calculate(out_month_list, df, field_list, sample_num)
            mean, median, std = calculate(all_list, df, field_list, df.shape[0])
            print("for field %s------------------------------------------------" % " ".join(field_list))
            print("for in_group samples:")
            print("the mean is %s\nthe median is %s\nthe std is %s\n" % (str(in_mean), str(in_median), str(in_std)))
            print("for out_group samples:")
            print("the mean is %s\nthe median is %s\nthe std is %s\n" % (str(out_mean), str(out_median), str(out_std)))
            print("for all samples:")
            print("the mean is %s\nthe median is %s\nthe std is %s\n" % (str(mean), str(median), str(std)))
            # One tab-separated result row, same layout as the header.
            f.write("\t".join([
                " ".join(field_list),
                str(in_mean), str(in_median), str(in_std),
                str(out_mean), str(out_median), str(out_std),
                str(mean), str(median), str(std),
            ]) + "\n")
    end = time.time()
    print('======================================running time: %s seconds' % (end - start))
# (Excerpt repeated from the main script for the article's explanation.)
# Tag every consecutive pair of rows with a shared group id: 0,0,1,1,2,2,...
real_idx = [[i]*2 for i in range(df.shape[0]//2)]
real_idx = [i for y in real_idx for i in y]
df["realidx"] = real_idx
# NOTE(review): this keeps the synthetic "realidx" column inside
# field_list, so it ends up in the statistical combinations below —
# presumably unintended; confirm against the main script.
field_list = list(df.columns)
# get iteration and combinations
field_list_combs = [ itertools.combinations(field_list,i) for i in range(1, len(field_list))]
field_list_combs = [list(i) for y in field_list_combs for i in y]
Expression1:List1 = {A:[0,1],B:[0,1],C:[0,1]}
再加上局部索引对应的原始索引[0,1,2,3,4,5]就是。
Expression2:List2 = {A:[0[0],1[1]],B:[0[2],1[3]],C:[0[4],1[5]]}
切换成为二维数组就是下面这样,A,B,C对应0,1,2。所以要实现的话肯定就是双指针啦。
Expression3:List3 = [[0,1],[2,3],[4,5]]
原始索引[0,1,2,3,4,5]中第一个索引值0对应的就是
List3[0][0]
原始索引我需要从左到右位移三次扫描覆盖所有的分组。每次位移在当下可能性区间[0,1]中取一个值然后移动到下一个区间。回到Expression1再break down。假设Index数组是我们回收索引值的数组。
运算开始
->>>Start
->>>>>>>>位置A:可能性区间[0,1] 随机取0
->>>>>>>>>>>Index[0] = List3[0][0] = 0
->>>>>>>>>>>>>>>位置B:可能性区间[0,1] 随机取0
->>>>>>>>>>>>>>>>>>>>> Index[1] = List3[1][0] = 2
->>>>>>>>>>>>>>>>>>>>>>>>>>位置C:可能性区间[0,1]随机取1
->>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> Index[2] = List3[2][1] = 5
那么ingroup局部索引就是[0,0,1]
对应全局索引就是[0,2,5]
在实现的过程中需要一个指针进行全局位置标识,另一个进行局部位置标识。但是从上面那个随手写的运行图可以看出,时间和空间成本都压缩得比较低。全局位置标识是idx,步长是2位移,因为每2个样本一组。walk决定[0,1]随机数,也就是局部位置标识。
# select in_group month indexes from random group indexes
# (Excerpt repeated from myboot() for the walkthrough above.)
# idx is the global cursor: it advances over the paired index list two
# entries at a time; walk is the local cursor: it picks one of the two
# rows (offset 0 or 1) of the current pair at random.
idx = 0
walk = random.randint(0,1)
in_month_idx = []
# use a window to traverse the list
for i in range(len(month_idx)//2):
in_month_idx.append(month_idx[idx+walk])
walk = random.randint(0,1)
idx += 2
# (Excerpt of calculate()'s inner helper, repeated for the walkthrough.)
# Median of column `filed` over a weighted multiset of rows:
# index_count_map maps row index -> draw multiplicity; new_map/sort_map
# map value -> multiplicity sorted by value; the cursor walks cumulative
# counts up to sample_num//2 and averages the value it stops on with the
# next distinct value.
# NOTE(review): duplicate values overwrite each other's counts in
# new_map, and median_num_right raises IndexError when the cursor stops
# on the last distinct value — confirm against the main implementation.
def median_count(index_count_map, df, filed, sample_num):
new_map = {}
for k in index_count_map:
new_map[df[filed][k]]=index_count_map[k]
sort_map=dict(sorted(new_map.items(),key=lambda x:x[0]))
median_idx = sample_num//2
index_count = 0
idx = 0
for k in sort_map.keys():
index_count += sort_map[k]
idx += 1
median_num_left = k
if index_count >= median_idx:
break
median_num_right = list(sort_map.keys())[idx]
return (median_num_left + median_num_right)/2
空缺,真的没有比这更干的货了。这篇文章里面的内容和代码除了词语和关键字是引用,其余都是风干。代码是帮别人写的,不知道会不会查重。使用请标注。