LightGBM(Light Gradient Boosting Machine)是微软亚洲研究院 DMTK 团队开源的一个算法,是一种基于决策树和 Gradient Boosting 的改进模型,可以用于常见的分类、回归等问题。LightGBM 和 XGBoost 算法被分别称为机器学习中的“倚天剑”和“屠龙刀”,都是非常优秀的算法。
LightGBM 有着很多的优点:使用基于直方图的算法,有着更快的训练速度和更高的效率;更少的内存占用;支持并行计算,并且由于在训练时间上的缩减而拥有处理大数据的能力。
LightGBM 有两个重要的创新点:直方图算法和带深度限制的 Leaf-wise 叶子生长策略。
(1)直方图算法。LightGBM 的一大创新点是基于直方图算法提出的。在计算的时候,模型会将浮点型的数值转化成离散数值,从而生成一个直方图;再将离散数值作为索引在直方图中累计统计量,之后只需遍历直方图即可找出最佳分割点,这样能极大地降低内存占用。
(2)带深度限制的 Leaf-wise 的叶子生长策略。大部分的决策树使用 Level-wise 策略,但是 Level-wise 是一种低效算法,因为它不加区分地对待同一层的叶子,带来了很多没必要的开销。Leaf-wise 则每次从当前所有叶子中选择分裂增益最大的叶子进行分裂,并通过最大深度限制来防止过拟合。
#导入所需要的主要的包
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import lightgbm as lgb
from sklearn.model_selection import train_test_split
# Print DataFrames in full: no row or column truncation.
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)

# Load the price table, skipping (with a warning) any malformed CSV row.
# `error_bad_lines` was deprecated in pandas 1.3 and removed in 2.0; its
# replacement is `on_bad_lines`.  'warn' mirrors the old
# `error_bad_lines=False` behaviour (bad rows are reported and dropped).
try:
    data = pd.read_csv('LBMA-GOLD.csv', on_bad_lines='warn')
except TypeError:
    # pandas < 1.3 does not know `on_bad_lines`; fall back to the legacy flag.
    data = pd.read_csv('LBMA-GOLD.csv', error_bad_lines=False)

# Summary statistics; the return value is not used by this script.
data.describe(include='all')

# Remove duplicate rows.
data.drop_duplicates(inplace=True)
# Keep only the rows that actually have a 'USD (PM)' price.
data = data[data['USD (PM)'].notnull()]
def LGB(x_train, y_train, x_test, y_test):
    """Train a LightGBM regressor and return its predictions for ``x_test``.

    The model is fitted on ``(x_train, y_train)`` with an L1 regression
    objective.  ``(x_test, y_test)`` is registered as the validation set
    that drives early stopping, and is also the set that is predicted.

    Args:
        x_train: 2-D array-like of training features.
        y_train: 1-D array-like of training targets.
        x_test: 2-D array-like of features to validate on and predict.
        y_test: 1-D array-like of targets matching ``x_test``.

    Returns:
        The array returned by ``Booster.predict(x_test)``.
    """
    lgb_train = lgb.Dataset(x_train, y_train)
    # Validation set; `reference` ties it to the training Dataset.
    lgb_eval = lgb.Dataset(x_test, y_test, reference=lgb_train)
    params = {
        'learning_rate': 0.1,
        'boosting_type': 'gbdt',
        'objective': 'regression_l1',
        'metric': 'mape',
        'min_child_samples': 46,
        'min_child_weight': 0.01,
        'feature_fraction': 1,
        'bagging_fraction': 1,
        'bagging_freq': 2,
        'num_leaves': 32,
        'max_depth': 8,
        'n_jobs': -1,
        'seed': 2019,
        'verbose': -1,
    }
    # Early stopping is requested through a callback: the
    # `early_stopping_rounds=` / `evals_result=` keyword arguments were
    # removed from `lgb.train` in LightGBM 4.0, while the callback form is
    # accepted by both old and new releases.  The evaluation history that
    # used to be recorded was never read, so it is no longer collected.
    gbm = lgb.train(
        params,
        lgb_train,
        num_boost_round=5000,
        valid_sets=[lgb_eval, lgb_train],
        valid_names=['validate', 'train'],
        callbacks=[lgb.early_stopping(stopping_rounds=2000)],
    )
    y_pre = gbm.predict(x_test)
    return y_pre
from sklearn.metrics import mean_absolute_error, r2_score, mean_squared_error
def Evaluate(y_test, y_pre):
    """Compute regression error metrics for a set of predictions.

    Args:
        y_test: Array of true target values.
        y_pre: Array of predicted values, same shape as ``y_test``.

    Returns:
        A list ``[MAPE, MAE, R2, MSE, RMSE]``.
    """
    # Mean absolute percentage error as a fraction (not multiplied by 100).
    # NOTE: any element of y_test equal to 0 makes this inf or nan.
    relative_error = (y_pre - y_test) / y_test
    mape = np.mean(np.abs(relative_error))

    mae = mean_absolute_error(y_test, y_pre)
    r2 = r2_score(y_test, y_pre)
    mse = mean_squared_error(y_test, y_pre)
    # RMSE is the square root of the MSE computed just above.
    rmse = np.sqrt(mse)
    return [mape, mae, r2, mse, rmse]
# Rebuild a contiguous 0..n-1 row index, discarding the old one.
data = data.reset_index(drop=True)

# Min-max scale the column at position 1 into [0, 1].  xmin / xmax are kept
# at module level so scaled values can be mapped back to the original range.
X = data.iloc[:, 1]
xmin, xmax = min(X), max(X)
X = (X - xmin) / (xmax - xmin)
data.iloc[:, 1] = X

print(data)
# Build the explanatory variables (sliding windows of past prices) and the
# dependent variable (the price that follows each window), then fit and
# evaluate one model per window length.
M = []
for i in range(8, 9):
    datai = data.copy()
    # Target aligned with the window that starts at row z: the price i rows
    # later, i.e. the first price after the window.
    datai['label'] = data['USD (PM)'].shift(-i)
    # `label` stays the unshifted price series so that code after this loop,
    # which indexes it by row position, is not affected.
    label = datai['USD (PM)']

    prices = label.to_numpy()
    n_windows = max(len(prices) - i, 0)
    # attitude[z] == [price[z], ..., price[z + i - 1]]; slicing the array
    # once per window replaces one DataFrame row lookup per element.
    attitude = [prices[z:z + i].tolist() for z in range(n_windows)]
    # Use the shifted column as the target.  The unshifted price used before
    # is identical to the first feature of each window, so the model was
    # given its own answer.
    target = datai['label'].to_numpy()[:n_windows]

    # Create the training and test sets.
    x_train, x_test, y_train, y_test = train_test_split(
        np.array(attitude), target, test_size=0.2)
    y_pre = LGB(x_train, y_train, x_test, y_test)
    M.append(Evaluate(y_test, y_pre))

    # Scatter predictions ('p') against true values ('t') by sample number.
    num = np.linspace(1, len(y_pre), len(y_pre))
    plt.scatter(num, y_pre, label='p')
    plt.scatter(num, y_test, label='t')
    plt.legend()
    plt.show()
# Walk-forward prediction: for every window z, train on windows 0..z-1 and
# predict the price that follows window z.  One tab-separated line is
# appended to the output file per date:
#   date, true price, predicted price, true price in normalised units
filename = "lgoldPre.txt"

# Number of prices per feature window; the target of window z is therefore
# the price at row `window + z`.
window = len(attitude[0])
prices = np.array(label)

# `with` guarantees the file is closed even if training raises part-way.
with open("D:\\" + filename, encoding='utf8', mode='a+') as fw:
    # The first window + 1 rows have no prediction written for them, so the
    # true price is written in both the "true" and "predicted" columns.
    for i in range(window + 1):
        pri = data.iloc[i, 1] * (xmax - xmin) + xmin
        fw.write(str(data.iloc[i, 0]) + '\t' + str(pri) + '\t' + str(pri) + '\t' + str(prices[i]) + '\n')

    for z in range(1, len(attitude)):
        print('---------------------------------------', z, '--------------------------------------------')
        x_train = np.array(attitude[:z])
        x_test = np.array([attitude[z]])
        # Targets are the prices that follow each window.  Previously
        # label[:z] / label[z] were used, i.e. the first price inside the
        # window, which is also one of the model's inputs and does not
        # belong to the date written below.
        y_train = prices[window:window + z]
        y_test = prices[window + z:window + z + 1]
        y_pre = LGB(x_train, y_train, x_test, y_test)
        # Undo the min-max scaling to report values in the original units.
        y_pre = y_pre[0] * (xmax - xmin) + xmin
        y_test = y_test[0] * (xmax - xmin) + xmin
        fw.write(str(data.iloc[window + z, 0]) + '\t' + str(y_test) + '\t' + str(y_pre) + '\t' + str(prices[window + z]) + '\n')
# for i in range(len(M)):
# print('-----------------------',i+3,'-----------------------')
# print('MAPE: ',M[i][0])
# print('MAE: ',M[i][1])
# print('R2: ',M[i][2])
# print('MSE: ',M[i][3])
# print('RMSE: ',M[i][4])