import numpy as np

# Training features and one-hot labels.
x = np.loadtxt("input/train.txt", dtype=np.float64)
y = np.loadtxt("label/train.txt", dtype=np.float64)
print('loaded')

n, dx = x.shape             # number of training samples, number of features
_, dy = y.shape             # number of classes (one score column per class)
output = np.zeros_like(y)   # raw boosting scores, updated tree by tree

from base.lib_grad_hess import func_grad_hess
# Per-sample, per-class first- and second-order loss derivatives at the current
# scores; they are indexed as grad[i][class] below. The loss itself is defined
# in base.lib_grad_hess.
grad, hess = func_grad_hess(output, y)
print('prep done')
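# -----------------------------------------------------------------------------
# Overview (explanatory sketch, not part of the original pipeline): this script
# trains gradient-boosted regression trees, one tree per class column per
# boosting round. Each tree is fit to the current grad/hess and assigns every
# sample in a leaf the weight
#     w = -G / (H + lam)          (then shrunk by lr)
# where G and H are the sums of grad and hess over the samples in that leaf.
# For example, with G = -1.5, H = 2.0 and lam = 1 the unshrunk weight is
# -(-1.5) / (2.0 + 1) = 0.5, and lr = 0.1 shrinks it to 0.05.
# A common choice for grad/hess in multi-class boosting is softmax
# cross-entropy (grad = p - y, hess = p * (1 - p)), but the exact definition
# used here lives in base.lib_grad_hess and may differ.
# -----------------------------------------------------------------------------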
class Tree(object):
    def __init__(self):
        self.root = Node()

    def train(self, n, dyi):
        # Grow a tree for class column dyi on all n training samples.
        self.root.id_list = list(range(n))
        self.root.dfs(dyi, list(range(dx)), 0)

    def test(self, n_v, dyi):
        # Route all n_v validation samples through the trained tree,
        # accumulating its leaf weights into output_v.
        self.root.id_list = list(range(n_v))
        self.root.test_dfs(dyi)
import random
import math
from base.lib_hypara import INF   # sentinel value marking a missing feature entry

lam = 1                  # L2 regularisation on leaf weights
gam = 0.1                # complexity penalty charged per split
max_depth = 5            # maximum tree depth
min_sample = 3           # minimum number of samples required to split a node
min_child_weight = 2     # declared for completeness; not used below
min_gain = 0.00          # minimum gain required to accept a split
lr = 0.1                 # shrinkage (learning rate) applied to leaf weights
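# -----------------------------------------------------------------------------
# Split scoring used in Node.split_finding below (explanatory note): a
# candidate split is scored by
#     gain = 1/2 * [ GL^2/(HL+lam) + GR^2/(HR+lam) - G^2/(H+lam) ] - gam
# where (GL, HL) and (GR, HR) are the gradient/hessian sums of the left and
# right children and (G, H) those of the parent. Worked example: with
# GL = -3, HL = 2, GR = 3, HR = 2 (so G = 0, H = 4) and lam = 1, gam = 0.1,
# the bracket is 9/3 + 9/3 - 0 = 6, so gain = 6/2 - 0.1 = 2.9.
# -----------------------------------------------------------------------------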
class Node(object):
    def __init__(self):
        self.is_leaf = None        # True for leaves, False for internal nodes
        self.leaf_weight = None    # shrunk leaf weight (leaves only)
        self.left = None
        self.right = None
        self.id_list = []          # indices of the samples routed to this node
        self.classify = None       # (feature, threshold, default_direction) for internal nodes
    def dfs(self, dyi, chra_list, depth):
        # Gradient/hessian sums over the samples in this node.
        G, H = 0., 0.
        for i in self.id_list:
            G += grad[i][dyi]
            H += hess[i][dyi]
        flag = False
        if depth >= max_depth or len(self.id_list) < min_sample:
            flag = True
            if len(self.id_list) < min_sample:
                print('m:', len(self.id_list))
            else:
                print('depth:', depth, len(self.id_list))
        else:
            gain, max_classify, G, H = self.split_finding(dyi, chra_list)
            if gain < min_gain:
                flag = True
                print('gain:', gain, len(self.id_list))
        if flag:
            # Turn this node into a leaf and apply its (shrunk) weight to the
            # running scores of all samples it contains.
            self.is_leaf = True
            self.leaf_weight = -G / (H + lam)
            self.leaf_weight *= lr
            for i in self.id_list:
                output[i][dyi] += self.leaf_weight
        else:
            self.is_leaf = False
            self.classify = max_classify
            self.func_classify(x)
            # The chosen feature is removed from the shared candidate list,
            # so each feature is split on at most once per tree.
            chra_list.remove(self.classify[0])
            self.left.dfs(dyi, chra_list, depth + 1)
            self.right.dfs(dyi, chra_list, depth + 1)
        # Sample indices are no longer needed once this node is trained.
        self.id_list = []
    def test_dfs(self, dyi):
        if self.is_leaf:
            # Add this leaf's weight to the validation scores of its samples.
            for i in self.id_list:
                output_v[i][dyi] += self.leaf_weight
            self.id_list = []
        else:
            self.func_classify(x_v)
            self.left.test_dfs(dyi)
            self.right.test_dfs(dyi)
    def func_classify(self, data):
        # Partition this node's samples between the two children according to
        # the stored split (feature k, threshold v, default direction b),
        # reading feature values from the given data matrix.
        if self.left is None: self.left = Node()
        if self.right is None: self.right = Node()
        k, v, b = self.classify
        for i in self.id_list:
            if data[i][k] == INF:
                # Missing value: follow the learned default direction.
                if b == 0:
                    self.left.id_list.append(i)
                else:
                    self.right.id_list.append(i)
            else:
                if data[i][k] <= v:
                    self.left.id_list.append(i)
                else:
                    self.right.id_list.append(i)
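    # --------------------------------------------------------------------------
    # split_finding below performs a sparsity-aware exact greedy search: for
    # each sampled feature, the node's samples are sorted by that feature,
    # samples with a missing entry (== INF) are set aside, and every distinct
    # threshold is scored twice -- once with the missing samples sent right
    # (default direction 1) and once with them sent left (default direction 0).
    # The best of all these candidates is returned. The sqrt-sized feature
    # subsample mirrors random-forest style column subsampling.
    # --------------------------------------------------------------------------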
    def split_finding(self, dyi, chra_list):
        # Column subsampling: consider roughly sqrt(#candidate features).
        select_chra_list = random.sample(chra_list, math.ceil(math.sqrt(len(chra_list))))
        max_score = 0
        max_classify = None
        G, H = 0, 0
        for i in self.id_list:
            G += grad[i][dyi]
            H += hess[i][dyi]
        for k in select_chra_list:
            def takek(elem):
                return x[elem][k]
            self.id_list.sort(key=takek)
            # Missing entries (INF) sort to the end; collect their grad/hess sums.
            Glack, Hlack = 0, 0
            p = len(self.id_list) - 1
            while p >= 0 and takek(self.id_list[p]) == INF:
                Glack += grad[self.id_list[p]][dyi]
                Hlack += hess[self.id_list[p]][dyi]
                p -= 1
            p += 1   # p = number of samples with a present value for feature k
            GL, HL = 0, 0
            for pj in range(p):
                GL += grad[self.id_list[pj]][dyi]
                HL += hess[self.id_list[pj]][dyi]
                # Only split between distinct feature values.
                if pj + 1 < p and takek(self.id_list[pj]) == takek(self.id_list[pj + 1]):
                    continue
                v = takek(self.id_list[pj])
                # Candidate 1: missing values follow the right child.
                GR, HR = G - GL, H - HL
                score = GL ** 2 / (HL + lam) + GR ** 2 / (HR + lam) - G ** 2 / (H + lam)
                if score > max_score:
                    max_score = score
                    max_classify = (k, v, 1)
                # Candidate 2: missing values follow the left child.
                GL += Glack
                HL += Hlack
                GR, HR = G - GL, H - HL
                score = GL ** 2 / (HL + lam) + GR ** 2 / (HR + lam) - G ** 2 / (H + lam)
                if score > max_score:
                    max_score = score
                    max_classify = (k, v, 0)
                GL -= Glack
                HL -= Hlack
        gain = max_score / 2 - gam
        return gain, max_classify, G, H
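# Example usage of Tree in isolation (commented out; assumes the globals x, y,
# grad, hess, output, output_v, n and n_v defined elsewhere in this script):
#     t = Tree()
#     t.train(n, 0)     # fit one tree to class column 0
#     t.test(n_v, 0)    # add its leaf weights to the validation scores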
classifier = [[] for i in range(dy)]   # one growing list of trees per class
from base.lib_evaluate import evaluate_acc
acc_results=[]
# Validation data; output_v accumulates validation scores as trees are added.
x_v = np.loadtxt("input/valid.txt", dtype=np.float64)
y_v = np.loadtxt("label/valid.txt", dtype=np.float64)
n_v = x_v.shape[0]
output_v = np.zeros((n_v, dy))
pred_result_v = []
acc_results_v=[]
B = 3   # number of boosting rounds
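# Each round (epoch) grows one new tree per class against the current
# grad/hess, then refreshes grad/hess from the updated training scores and
# scores the new trees on the validation set. Note that grad/hess are only
# recomputed after all dy trees of a round have been trained.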
for epoch in range(B):
    print('epoch:', epoch)
    for i in range(dy):
        print('tree:', i)
        classifier[i].append(Tree())
        classifier[i][-1].train(n, i)
    print('calculating grad hess')
    grad, hess = func_grad_hess(output, y)
    print('calculating acc')
    acc = evaluate_acc(y, output)
    print('epoch:{}\tacc:{}'.format(epoch, acc))
    acc_results.append(acc)
    # Score the trees trained in this round on the validation set.
    for j in range(dy):
        classifier[j][epoch].test(n_v, j)
    acc_v = evaluate_acc(y_v, output_v)
    print('valid:', acc_v)
    acc_results_v.append(acc_v)
    prediction_v = np.argmax(output_v, axis=1)
    pred_result_v.append(prediction_v)
print('over')
import matplotlib.pyplot as plt
plt.figure(figsize=(20, 10))
plt.plot(range(len(acc_results_v)), acc_results_v)
plt.gcf().savefig('figure/valid.png')   # save before show(), which may discard the figure
plt.show()
import pandas as pd
# Rows are validation samples, columns are boosting rounds (number of trees).
df = pd.DataFrame(pred_result_v).T
df.to_csv('output/valid_rows_samples_cols_trees.csv')
df2 = pd.DataFrame(pred_result_v[-1])
df2.to_csv('output/valid_last.csv', index=False)
prediction = np.argmax(output, axis=1)
np.savetxt("output/train.txt", prediction, fmt='%d')   # predicted class per training sample
plt.figure(figsize=(20, 10))
plt.plot(range(len(acc_results)), acc_results)
plt.gcf().savefig('figure/train.png')   # save before show(), which may discard the figure
plt.show()
from base.lib_store import serialize
print('storing')
serialize(classifier, 'store/classifier.bin')
print('done')