import random
# Demo data: N sample labels 0..N-1, each paired with a random weight in [0, 1).
# The weights are deliberately left un-normalized (they do not sum to 1).
N = 30
samples = list(range(N))
weights = [random.random() for _ in range(N)]
假设有 30 个样本,每个样本对应一个权重,可视化如下:
# Bar chart of the weights: one bar per sample, each annotated with its value.
plt.figure(figsize=(20, 10))
rects = plt.bar(
    x=samples,
    height=weights,
    width=0.4,
    alpha=0.8,
    color='red',
    label="权重",
)
# Write each bar's height above it, truncated (not rounded) to two decimals.
for rect in rects:
    height = rect.get_height()
    center_x = rect.get_x() + rect.get_width() / 2
    truncated = int(height * 100) / 100
    plt.text(center_x, height + 0.01, str(truncated), ha="center", va="bottom")
# Extend the y-axis to 110% of the largest weight to leave room above the bars.
plt.ylim(0, 1.1 * max(weights))
plt.xticks(samples)
plt.xlabel('类别')
plt.ylabel('权重')
plt.legend()
plt.show()
# Pie chart of the same weights: one wedge per label 0..N-1, i.e. the
# "roulette wheel" that the resampling below spins.
# NOTE(review): autopct='' is an empty format string; confirm matplotlib
# accepts it. autopct=None is the documented way to omit percentage labels.
plt.figure(figsize=(10, 10))
plt.pie(weights, labels=range(N), autopct='', shadow=False, startangle=0)
plt.axis('equal')  # equal axis scaling
plt.show()
注意到上面的分布甚至没有进行归一化,即所有权值的和不等于1。
那么有什么办法可以对上述分布进行采样呢?比如需要获得 1000 个新样本,并且每个样本被抽中的次数与它在上面分布中的权重成正比。
下面就来介绍采样方法!
其实这个方法大家都见过,就是轮盘赌!
任意离散分布都可以画在轮盘上,重采样只需要随机地旋转轮盘即可!
def sample(weights, labels, N_new=None):
    """Resample ``labels`` by spinning a roulette wheel sized by ``weights``.

    Each label owns a slot on a circular wheel whose size is its weight, so
    labels with larger weights are landed on more often. The weights do not
    need to be normalized: the pointer step is scaled by ``max(weights)``
    rather than by their sum. Weights are expected to be non-negative.

    Args:
        weights: One weight per label, same length as ``labels``.
        labels: The items to draw from (with replacement).
        N_new: Number of draws. Defaults to ``len(labels)``.

    Returns:
        A list of ``N_new`` labels. Empty if ``N_new`` is zero or negative.

    Raises:
        ValueError: If ``weights`` and ``labels`` differ in length, or if
            draws are requested from an empty set of labels.
    """
    N = len(labels)
    if len(weights) != N:
        raise ValueError("weights and labels must have the same length")
    if N_new is None:
        N_new = N
    if N_new <= 0:
        # Nothing to draw; also avoids max() on an empty weight list.
        return []
    if N == 0:
        raise ValueError("cannot sample from an empty set of labels")

    picked = []
    index = int(random.random() * N)  # uniformly random starting slot
    beta = 0.0  # wheel pointer, measured from the start of slot `index`
    max_weight = max(weights)
    for _ in range(N_new):
        # Advance the pointer by a random step in [0, 2 * max_weight).
        beta += random.random() * 2.0 * max_weight
        # Walk forward (wrapping around) until the pointer falls inside a slot.
        while beta > weights[index]:
            beta -= weights[index]
            index = (index + 1) % N
        picked.append(labels[index])
    return picked
下面用上面的函数进行 1000 次重采样:
# Draw 1000 labels with the `sample` function defined in this file (the
# original call used the undefined name `resample`). The label list is rebuilt
# as 0..N-1 so it always lines up index-by-index with `weights`, even when
# `samples` already holds draws from a previous run.
samples = sample(weights, list(range(N)), 1000)
# The new weight of label i is the number of times it was drawn.
drawn = np.array(samples)
weights = [np.sum(drawn == i) for i in range(N)]
现在,你也学会采样了吧!
上面的采样函数也可以用如下方式实现,原理是一样的!
def sample(weights, labels, N_new=None):
    """Resample ``labels`` in proportion to ``weights`` using cumulative sums.

    The cumulative sum of the weights marks the right edge of each label's
    slot on a wheel of total size ``sum(weights)``. Every draw picks a random
    angle in ``[0, sum(weights))`` and returns the label of the first slot
    whose right edge is at or beyond that angle. The weights do not need to
    be normalized; they are expected to be non-negative.

    Args:
        weights: One weight per label, same length as ``labels``.
        labels: The items to draw from (with replacement).
        N_new: Number of draws. Defaults to ``len(labels)``.

    Returns:
        A list of ``N_new`` labels.

    Raises:
        ValueError: If ``weights`` and ``labels`` differ in length, if
            ``N_new`` is negative, or if draws are requested from an empty
            set of labels.
    """
    N = len(labels)
    if len(weights) != N:
        raise ValueError("weights and labels must have the same length")
    if N_new is None:
        N_new = N
    if N_new < 0:
        raise ValueError("N_new must be non-negative")
    if N_new == 0:
        return []
    if N == 0:
        raise ValueError("cannot sample from an empty set of labels")

    cs = np.cumsum(weights)  # cs[i] is the right edge of slot i
    total = cs[-1]
    # One random.random() call per draw, in order, so a seeded run consumes
    # the random stream exactly as a per-draw loop would.
    angles = [random.random() * total for _ in range(N_new)]
    # side="left" returns the first i with angle <= cs[i], which is the slot a
    # left-to-right scan of the cumulative sums would stop at.
    slots = np.searchsorted(cs, angles, side="left")
    return [labels[i] for i in slots.tolist()]