This post is a reproduction based on Li Hang's 《统计学习方法》 (Statistical Learning Methods), following the binary-classification version of AdaBoost (multi-class extensions of the algorithm do exist by now).
Ensemble learning algorithms are usually divided into two families, Boosting and Bagging. The main difference is that Boosting builds its base classifiers sequentially, with strong dependence between successive classifiers, while Bagging trains its base classifiers in parallel.
As an entry-level ensemble method, AdaBoost is simple and effective, and is an important representative of the Boosting family.
Its idea is straightforward: each weak classifier searches for the optimal threshold to split the data, and the weights of misclassified samples are increased, so that in the next round the classifier, in order to achieve a lower weighted error rate, is more inclined to classify them correctly.
1. Initialize the weight distribution of the training data:
$D_1=(w_{11},\dots,w_{1i},\dots,w_{1N}),\quad w_{1i}=\frac{1}{N},\quad i=1,2,\dots,N$
2. For $m=1,2,\dots,M$:
(a) Learn a base classifier from the training data weighted by $D_m$:
$G_m(x):\mathcal{X}\rightarrow\{-1,+1\}$
(b) Compute the weighted classification error of $G_m(x)$ on the training set:
$e_m=P(G_m(x_i)\neq y_i)=\sum_{i=1}^N w_{mi}\,I(G_m(x_i)\neq y_i)$
(c) Compute the coefficient of $G_m(x)$:
$\alpha_m=\frac{1}{2}\ln\frac{1-e_m}{e_m}$
(d) Update the weight distribution of the training data:
$D_{m+1}=(w_{m+1,1},\dots,w_{m+1,i},\dots,w_{m+1,N})$
$w_{m+1,i}=\frac{w_{mi}}{Z_m}e^{-\alpha_m y_i G_m(x_i)}$
tip: $Z_m$ is the normalization factor, $Z_m=\sum_{i=1}^N w_{mi}e^{-\alpha_m y_i G_m(x_i)}$
3. Build the linear combination of the base classifiers
$f(x)=\sum_{m=1}^M\alpha_m G_m(x)$
and obtain the final classifier
$G(x)=\mathrm{sign}(f(x))=\mathrm{sign}\left(\sum_{m=1}^M\alpha_m G_m(x)\right)$
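To make steps (b)–(d) concrete, here is a minimal NumPy sketch of a single boosting round; the arrays `y` and `g` below are made-up toy values, not part of the original code:

```python
import numpy as np

# toy ground-truth labels and the predictions of one base classifier G_m (made-up values)
y = np.array([ 1,  1, -1, -1,  1, -1])
g = np.array([ 1, -1, -1, -1,  1,  1])   # two samples are misclassified

w = np.full(len(y), 1 / len(y))          # step 1: uniform initial weights D_1

e_m = np.sum(w * (g != y))               # step 2(b): weighted error rate
alpha_m = 0.5 * np.log((1 - e_m) / e_m)  # step 2(c): classifier coefficient

unnorm = w * np.exp(-alpha_m * y * g)    # step 2(d): exponential re-weighting
w_next = unnorm / unnorm.sum()           # divide by the normalization factor Z_m

print(e_m, alpha_m)                      # 0.333..., ~0.347
print(w_next)                            # misclassified samples now carry larger weight
```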
The functions are defined as follows:
import random
import numpy as np
import pandas as pd

def get_data(col, fold):  # split into training and test sets via stratified sampling
    fpath = r'your files'  ###
    df = pd.read_excel(fpath, header=None)
    dataSet = df.iloc[:, 0:col]
    trans_dataSet = dataSet.values.tolist()
    labels = []
    total_size = len(trans_dataSet)
    for i in range(0, total_size):
        labels.append(trans_dataSet[i][-1])
    labels = np.unique(labels)  # collect all distinct label values
    sampling = [[] for i in range(0, len(labels))]
    for i in range(0, total_size):
        for k in range(0, len(labels)):
            if trans_dataSet[i][-1] == labels[k]:
                sampling[k].append(trans_dataSet[i])
    train_data = []
    test_data = []
    for i in range(0, len(labels)):
        random.shuffle(sampling[i])  # shuffle the samples of this class once
        test_size = len(sampling[i]) // 10
        for j in range((fold - 1) * test_size, fold * test_size):
            test_data.append(sampling[i][j])
        for j in range(0, (fold - 1) * test_size):
            train_data.append(sampling[i][j])
        for j in range(fold * test_size, len(sampling[i])):
            train_data.append(sampling[i][j])
    train_size = len(train_data)
    test_size = len(test_data)
    return train_data, test_data, train_size, test_size
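As a usage sketch (assuming the spreadsheet holds 4 feature columns plus one label column, so 5 columns are read in total, and that `fold` runs from 1 to 10):

```python
# hypothetical call: read 5 columns (4 features + 1 label) and use fold 1 of 10 as the test split
train_data, test_data, n1, n2 = get_data(col=5, fold=1)
print(n1, n2)  # roughly 90% / 10% of the samples, stratified by class
```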
def divi(train_data, col):  # collect candidate split thresholds for every feature column
    a = [[] for i in range(col)]
    blist = [[] for i in range(col)]
    for x in train_data:
        for i in range(0, col):
            a[i].append(x[i])
    for i in range(0, col):
        a[i] = list(set(a[i]))  # keep distinct values only
        a[i].sort()             # sorted() alone would not sort in place
    for i in range(0, col):
        for k in range(0, len(a[i]) - 1):
            blist[i].append((a[i][k] + a[i][k + 1]) / 2)  # midpoints of adjacent distinct values
    return blist
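A quick illustration of what `divi` produces (the toy rows are made up for this example): for a feature column whose distinct sorted values are 1, 2 and 4, the candidate thresholds are the midpoints 1.5 and 3.0.

```python
# toy data: two feature columns followed by a label column (the label column is not scanned as a feature)
toy_train = [[1, 7, 1],
             [2, 7, 2],
             [4, 9, 1]]
print(divi(toy_train, col=2))  # [[1.5, 3.0], [8.0]]
```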
def train(train_data, test_data, y1, y2, blist, w1, w2, col, n1, n2):  # fit one decision stump under the current weight distribution
    cc = []      # best (lowest weighted error) threshold information for each feature column
    slist = []
    for i in range(0, col):
        box = []  # per-threshold information: weighted error, feature column, threshold index, orientation flag
        for k in range(0, len(blist[i])):  # first assume: left of the threshold -> -1, right -> +1
            s = blist[i][k]  # current threshold
            count = 0
            flag = 0
            for j in range(0, n1):
                if train_data[j][i] < s:
                    count += (y1[j] != -1) * w1[j]
                else:
                    count += (y1[j] != 1) * w1[j]
            if count > 0.5:  # error above 0.5 means the orientation is reversed (left -> +1, right -> -1); recompute the error
                count = 0
                flag = 1
                for j in range(0, n1):
                    if train_data[j][i] < s:
                        count += (y1[j] != 1) * w1[j]
                    else:
                        count += (y1[j] != -1) * w1[j]
            box.append([count, i, k, flag])
        cc.append(min(box))
    mark = min(cc)       # information of the globally best threshold under the current weight distribution
    error1 = mark[0]     # minimum weighted error under the current weight distribution
    # print(mark)        # mark[1], mark[2] are the i, k of the best threshold; mark[3] marks whether the orientation was flipped
    gx1 = []
    s = blist[mark[1]][mark[2]]
    slist.append(s)  # store the best threshold of this round
    if mark[3] == 0:
        for i in range(0, n1):
            if train_data[i][mark[1]] < s:
                gx1.append(-1)
            else:
                gx1.append(1)
    if mark[3] == 1:
        for i in range(0, n1):
            if train_data[i][mark[1]] < s:
                gx1.append(1)
            else:
                gx1.append(-1)
    alpha1 = 1/2 * np.log((1 - error1) / error1)  # coefficient of this classifier
    z_m_1 = 0
    for i in range(0, n1):
        z_m_1 += w1[i] * np.exp(-alpha1 * y1[i] * gx1[i])
    for m in range(0, n1):  # update the weight distribution of the training set
        w1[m] = w1[m] / z_m_1 * np.exp(-alpha1 * y1[m] * gx1[m])
    # classify the test set with the threshold found above and compute its weighted error
    gx2 = []
    error2 = 0
    for i in range(0, n2):
        if test_data[i][mark[1]] < s:
            gx2.append(-1)
            error2 += w2[i] * (y2[i] != gx2[i])
        else:
            gx2.append(1)
            error2 += w2[i] * (y2[i] != gx2[i])
    if error2 > 0.5:
        gx2 = []
        error2 = 0
        for i in range(0, n2):
            if test_data[i][mark[1]] < s:
                gx2.append(1)
                error2 += w2[i] * (y2[i] != gx2[i])
            else:
                gx2.append(-1)
                error2 += w2[i] * (y2[i] != gx2[i])
    if error2 == 0:  # avoid a division by zero (and an infinite log) below
        error2 += 0.00001
    alpha2 = 1/2 * np.log((1 - error2) / error2)  # coefficient of this classifier on the test side
    z_m_2 = 0
    for i in range(0, n2):
        z_m_2 += w2[i] * np.exp(-alpha2 * y2[i] * gx2[i])
    for m in range(0, n2):  # update the weight distribution of the test set
        w2[m] = w2[m] / z_m_2 * np.exp(-alpha2 * y2[m] * gx2[m])
    return w1, w2, gx2, alpha2
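The coefficient formula used for `alpha1` and `alpha2` gives a larger vote to more accurate stumps; a quick check of a few error rates:

```python
import numpy as np

for e in (0.1, 0.3, 0.45):
    print(e, 0.5 * np.log((1 - e) / e))
# 0.1  -> ~1.099  (accurate stump, large vote)
# 0.3  -> ~0.424
# 0.45 -> ~0.100  (barely better than chance, small vote)
```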
def adaboost(train_data, test_data, blist, w1, w2, col, multi_num):  # run AdaBoost ensemble training
    n1 = len(train_data)
    n2 = len(test_data)
    y1 = []
    y2 = []
    for i in range(0, n1):
        y1.append(2 * train_data[i][col] - 3)  ### maps raw labels {1, 2} to {-1, +1}; adjust for other encodings
    for i in range(0, n2):
        y2.append(2 * test_data[i][col] - 3)   ###
    fn2 = [0] * n2
    for i in range(multi_num):
        w1, w2, gx2, alpha2 = train(train_data, test_data, y1, y2, blist, w1, w2, col, n1, n2)
        for j in range(0, n2):
            fn2[j] += alpha2 * gx2[j]  # accumulate the weighted votes f(x) on the test set
    for j in range(0, n2):             # final classifier: G(x) = sign(f(x))
        if fn2[j] > 0:
            fn2[j] = 1
        else:
            fn2[j] = -1
    rati2 = 0
    for l in range(0, n2):
        if fn2[l] == y2[l]:
            rati2 += 1
    acc2 = rati2 / n2
    return acc2
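The functions above are not wired together in this section, so here is a hypothetical driver showing one way to call them; the column counts, the fold index, the label encoding {1, 2} and the number of rounds are assumptions about the data rather than part of the original code:

```python
# a hypothetical driver, assuming the spreadsheet holds 4 feature columns plus a label column
# whose values are 1 and 2 (so that 2*y - 3 maps them to -1/+1)
n_features = 4

train_data, test_data, n1, n2 = get_data(col=n_features + 1, fold=1)  # read features + label
blist = divi(train_data, n_features)          # candidate thresholds from the feature columns only

w1 = [1 / n1] * n1                            # uniform initial weights on the training set
w2 = [1 / n2] * n2                            # uniform initial weights on the test set

acc = adaboost(train_data, test_data, blist, w1, w2, n_features, multi_num=10)
print('test accuracy after 10 rounds:', acc)
```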