该部分用于文本分类任务的基础实现,主要包括机器学习(ml)和深度学习(dl)两大部分:机器学习部分基于sklearn/lightgbm包实现,深度学习部分使用pytorch深度学习框架。
机器学习部分主要包含特征工程和分类器两大部分:
特征工程部分主要针对文本分类任务的hash/lsa/lda/doc2vec特征提取/特征选择/特征组合/特征构造进行了实现;
分类器部分主要有逻辑回归/SVM/随机森林/Bagging/AdaBoost/GBDT/XGBoost/LightGBM等。
本文主要接上文介绍分类器的使用,两者联系起来可跑通,完整项目代码在上篇博客有链接。
# -*- coding: utf-8 -*-
"""
@brief : 配置文件,主要用于配置机器学习模型使用哪种特征和机器学习算法
@How to use : 修改features_path用于选择使用哪种特征;
修改clf_name用于选择使用哪种学习算法;
可在clfs_dict中对学习算法的超参数进行修改;
"""
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.ensemble import BaggingClassifier, RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
import xgboost as xgb
import lightgbm as lgb
"""是否开启验证集模式"""
status_vali = True
"""特征存储的路径"""
features_path = '../features/data_ensemble_spar.pkl'
"""修改clf_name可对学习算法进行选择;
修改base_clf改变集成学习的基分类器"""
clf_name = 'svm'
base_clf = LinearSVC()
clfs = {
'lr': LogisticRegression(penalty='l2', C=1.0),
'svm': LinearSVC(penalty='l2', dual=True),
'bagging': BaggingClassifier(base_estimator=base_clf, n_estimators=60, max_samples=1.0, max_features=1.0, random_state=1,
n_jobs=1, verbose=1),
'rf': RandomForestClassifier(n_estimators=10, criterion='gini'),
'adaboost': AdaBoostClassifier(base_estimator=base_clf, n_estimators=50),
'gbdt': GradientBoostingClassifier(),
'xgb': xgb.XGBClassifier(max_depth=6, learning_rate=0.1, n_estimators=100, silent=True, objective='multi:softmax',
nthread=1, gamma=0, min_child_weight=1, max_delta_step=0, subsample=1, colsample_bytree=1,
colsample_bylevel=1, reg_alpha=0, reg_lambda=1, scale_pos_weight=1, base_score=0.5, seed=0,
missing=None),
'lgb': lgb.LGBMClassifier(boosting_type='gbdt', num_leaves=31, max_depth=-1, learning_rate=0.1, n_estimators=250,
max_bin=255, subsample_for_bin=200000, objective=None, min_split_gain=0.0, min_child_weight=0.001,
min_child_samples=20, subsample=1.0, subsample_freq=1, colsample_bytree=1.0, reg_alpha=0.0,
reg_lambda=0.5, random_state=None, n_jobs=-1, silent=True)
}
clf = clfs[clf_name]
# -*- coding: utf-8 -*-
"""
@brief : 根据features_path中的数据,对机器学习模型进行训练,并对测试集进行预测,并将结果保存至本地
@How to use:使用前,先对sklearn_config文件进行参数配置,然后才能运行此文件进行学习训练
"""
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
import time
import pickle
from sklearn_config import features_path, clf_name, clf, status_vali
# Wall-clock start time for the end-of-run duration report.
t_start = time.time()
"""=====================================================================================================================
1 读取数据
"""
# The pickle is expected to hold the tuple (x_train, y_train, x_test)
# produced by the feature-engineering step (path set in sklearn_config).
data_fp = open(features_path, 'rb')
x_train, y_train, x_test = pickle.load(data_fp)
data_fp.close()
"""划分训练集和验证集,验证集比例为test_size"""
# When validation mode is on, hold out 10% of the training data
# (fixed random_state for reproducible splits).
if status_vali:
    x_train, x_vali, y_train, y_vali = train_test_split(x_train, y_train, test_size=0.1, random_state=0)
"""=====================================================================================================================
2 训练分类器
"""
# Fit the classifier selected in sklearn_config.
clf.fit(x_train, y_train)
"""=====================================================================================================================
3 在验证集上评估模型
"""
# Report macro-averaged F1 on the held-out validation split.
if status_vali:
    pre_vali = clf.predict(x_vali)
    score_vali = f1_score(y_true=y_vali, y_pred=pre_vali, average='macro')
    print("验证集分数:{}".format(score_vali))
"""=====================================================================================================================
4 对测试集进行预测;将预测结果转换为官方标准格式;并将结果保存至本地
"""
# +1 shifts predictions from the 0-based labels used in training to the
# 1-based label ids of the submission format.
y_test = clf.predict(x_test) + 1
# NOTE(review): 102277 is assumed to be the fixed test-set size — confirm against the data.
df_result = pd.DataFrame(data={'id':range(102277), 'class': y_test.tolist()})
# Result filename encodes both the feature file and the classifier used.
result_path = '../results/' + features_path.split('/')[-1] + '_sklearn_' + clf_name + '.csv'
df_result.to_csv(result_path, index=False)
t_end = time.time()
print("训练结束,耗时:{}min".format((t_end - t_start) / 60))
# -*- coding: utf-8 -*-
"""
@brief : lgb算法
"""
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
import time
import pickle
import lightgbm as lgb
# Wall-clock start time for the end-of-run duration report.
t_start = time.time()
"""=====================================================================================================================
0 自定义验证集的评价函数
"""
def f1_score_vali(preds, data_vali):
    """Custom LightGBM eval function: macro-F1 on the validation set.

    `preds` arrives as a flat array which the code treats as class-major:
    it is reshaped to (num_class=20, n_samples) and the argmax is taken
    over the class axis to recover hard class predictions.

    Returns the (name, value, is_higher_better) triple LightGBM expects.
    """
    y_true = data_vali.get_label()
    per_class = preds.reshape(20, -1)
    y_pred = per_class.argmax(axis=0)
    macro_f1 = f1_score(y_true=y_true, y_pred=y_pred, average='macro')
    return 'f1_score', macro_f1, True
"""=====================================================================================================================
1 读取数据,并转换到lgb的标准数据格式
"""
features_path = '../features/data_ensemble.pkl'
data_fp = open(features_path, 'rb')
x_train, y_train, x_test = pickle.load(data_fp)
data_fp.close()
"""划分训练集和验证集,验证集比例为test_size"""
x_train, x_vali, y_train, y_vali = train_test_split(x_train, y_train, test_size=0.1, random_state=0)
d_train = lgb.Dataset(data=x_train, label=y_train)
d_vali = lgb.Dataset(data=x_vali, label=y_vali)
"""=====================================================================================================================
2 训练lgb分类器
"""
params = {
'boosting': 'gbdt',
'application': 'multiclass',
'num_class': 20,
'learning_rate': 0.1,
'num_leaves':31,
'max_depth':-1,
'lambda_l1': 0,
'lambda_l2': 0.5,
'bagging_fraction' :1.0,
'feature_fraction': 1.0
}
bst = lgb.train(params, d_train, num_boost_round=800, valid_sets=d_vali,feval=f1_score_vali, early_stopping_rounds=None,
verbose_eval=True)
"""=====================================================================================================================
3 对测试集进行预测;将预测结果转换为官方标准格式;并将结果保存至本地
"""
y_test = np.argmax(bst.predict(x_test), axis=1) + 1
df_result = pd.DataFrame(data={'id':range(102277), 'class': y_test.tolist()})
result_path = '../results/' + features_path.split('/')[-1] + '_lgb' + '.csv'
df_result.to_csv(result_path, index=False)
t_end = time.time()
print("训练结束,耗时:{}min".format((t_end - t_start) / 60))