# tfidf+xgb
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score
import xgboost as xgb
!pip install xgboost --user
Looking in indexes: http://yum.tbsite.net/pypi/simple/
Requirement already satisfied: xgboost in /data/nas/workspace/envs/python3.6/site-packages (1.1.1)
Requirement already satisfied: scipy in /opt/conda/lib/python3.6/site-packages (from xgboost) (1.3.3)
Requirement already satisfied: numpy in /opt/conda/lib/python3.6/site-packages (from xgboost) (1.16.0)
train_data = pd.read_csv('./data/train_set.csv',sep='\t')
test_data = pd.read_csv('./data/test_a.csv',sep='\t')
all_text = pd.concat([train_data['text'],test_data['text']],axis = 0)
%%time
# Reuse the parameters from last time; they can be tuned later with cross-validation.
tfidf = TfidfVectorizer(ngram_range=(1,3),max_features = 3000,analyzer='word')
word_vectorizer = tfidf.fit(all_text)
train_word_features = word_vectorizer.transform(train_data['text'])
test_word_features = word_vectorizer.transform(test_data['text'])
CPU times: user 20min 50s, sys: 10.3 s, total: 21min
Wall time: 21min
# Quick look at the training data: a sparse matrix with max_features columns
print(train_word_features.shape,test_word_features.shape)
(200000, 3000) (50000, 3000)
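The comment above mentions that the TF-IDF parameters could later be tuned with cross-validation. A minimal sketch of what that might look like (the parameter grid below is purely illustrative, and wrapping the vectorizer in a Pipeline means it is refit in every fold, which is slow on this dataset):
from sklearn.pipeline import Pipeline
pipe = Pipeline([('tfidf', TfidfVectorizer(analyzer='word')),
                 ('clf', LogisticRegression())])
param_grid = {'tfidf__ngram_range': [(1, 2), (1, 3)],   # hypothetical grid
              'tfidf__max_features': [3000, 5000]}
search = GridSearchCV(pipe, param_grid, cv=3, scoring='f1_macro', n_jobs=-1)
# search.fit(train_data['text'], train_data['label'])   # commented out: very slow on 200k documents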
import warnings
warnings.filterwarnings('ignore')
x_trn,x_valid,y_trn,y_valid = train_test_split(train_word_features,train_data['label'],test_size= 0.1,random_state=2020)
%%time
clf = LogisticRegression()
clf.fit(x_trn,y_trn)
y_pred = clf.predict(x_valid)
print('valid_score',f1_score(y_pred,y_valid,average='macro'))
valid_score 0.9158974462371058
CPU times: user 5min 19s, sys: 2min 10s, total: 7min 29s
Wall time: 4min 44s
df = pd.DataFrame()
df['label'] = clf.predict(test_word_features)
df.to_csv('./sub_lr.csv',index=False)
Online (leaderboard) score: 0.9161
LogisticRegression(penalty='l2', dual=False,
tol=0.0001, C=1.0, fit_intercept=True,
intercept_scaling=1, class_weight=None,
random_state=None, solver='warn', max_iter=100,
multi_class='warn', verbose=0,
warm_start=False, n_jobs=None)
- penalty: the regularization term, 'l1' or 'l2'. The 'newton-cg', 'sag' and 'lbfgs' solvers only support 'l2', because the 'l1'-regularized loss is not continuously differentiable, while these three algorithms require the first or second derivatives of the loss to be continuous.
  - If tuning is mainly meant to reduce overfitting, 'l2' is usually enough; if the model still overfits with 'l2', consider 'l1'.
  - If the model has a very large number of features and you want the coefficients of unimportant features driven to zero (a sparse model), use 'l1'.
- dual: whether to optimize the primal or the dual form of the objective. The primal objective is transformed into an equivalent new function, called the dual, which can be easier to optimize.
- tol: the stopping tolerance of the optimizer; iteration stops when the change in the objective between iterations is no larger than tol.
- C: the regularization coefficient; the smaller C is, the stronger the regularization.
- fit_intercept: whether the model includes a constant (intercept) term \(b\).
- intercept_scaling: scaling applied to the intercept term; only relevant for the 'liblinear' solver.
- class_weight: the weights of the classes, given as {class_label: weight} or 'balanced'.
  - 'balanced': the library computes the weights from the class frequencies in the training set; the more samples a class has, the lower its weight.
  - If misclassification is very costly, e.g. when separating legitimate users from fraudulent ones, the weight of the fraudulent class can be raised.
  - For highly imbalanced data, e.g. 9995 legitimate users and 5 fraudulent ones, 'balanced' lets the library raise the weight of the minority class automatically.
- random_state: the random seed.
- solver: the optimization method for the logistic-regression loss.
  - 'liblinear': coordinate descent.
  - 'lbfgs': a quasi-Newton method that uses the matrix of second derivatives of the loss (the Hessian) to iterate.
  - 'newton-cg': a Newton-type method; same idea as above.
  - 'sag': stochastic average gradient descent; each iteration uses only a subset of the samples to compute the gradient, which suits large datasets.
  - Multiclass logistic regression can be fit as OvR (one-vs-rest) or MvM (many-vs-many); MvM is generally somewhat more accurate than OvR, but 'liblinear' only supports OvR.
- max_iter: the maximum number of optimizer iterations.
- multi_class: 'ovr' or 'multinomial'; 'multinomial' corresponds to MvM.
  - For binary logistic regression the two make little difference.
  - For MvM with T classes, every pair of classes is selected in turn, the samples of those two classes are pooled, and a binary regression is fit on them, so T(T-1)/2 classifiers are trained in total (e.g. for the 14 classes here, 14*13/2 = 91).
- verbose: whether to print the training process.
- warm_start: whether to reuse the solution of the previous fit as the initialization for the next call.
- n_jobs: how many CPU cores to use.
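To make the descriptions above concrete, here is a minimal sketch that spells a few of them out explicitly (the values are illustrative only, not the settings used in this notebook):
clf_example = LogisticRegression(
    penalty='l2',             # supported by all solvers; 'l1' would give sparse coefficients
    C=1.0,                    # inverse regularization strength: smaller C = stronger regularization
    class_weight='balanced',  # re-weight classes inversely to their frequency
    solver='liblinear',       # coordinate descent; supports OvR only
    max_iter=100)             # cap on optimizer iterations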
# C: regularization coefficient; the smaller C is, the stronger the regularization
clf2 = LogisticRegression(C=4)
clf2.fit(x_trn,y_trn)
y_pred = clf2.predict(x_valid)
print('the valid_score of clf2(C=4)',f1_score(y_pred,y_valid,average='macro'))
the valid_score of clf2(C=4) 0.9217946583149691
param_grid = {'C':range(1,5)}
lgGS = LogisticRegression()
grid = GridSearchCV(lgGS,param_grid,cv = 3,n_jobs=5,scoring='f1_macro')
grid.fit(x_trn,y_trn)
GridSearchCV(cv=3, estimator=LogisticRegression(), n_jobs=5,
param_grid={'C': range(1, 5)}, scoring='f1_macro')
# By-products of the grid search
print("Best parameters:",grid.best_params_)
print("Best CV score:",grid.best_score_)
print("Best estimator:",grid.best_estimator_)
Best parameters: {'C': 4}
Best CV score: 0.9174856470258348
Best estimator: LogisticRegression(C=4)
clf3 = grid.best_estimator_
y_pred = clf3.predict(x_valid)
print('the valid_score of clf3(C=4)',f1_score(y_pred,y_valid,average='macro'))
the valid_score of clf3(C=4) 0.9217946583149691
df = pd.DataFrame()
df['label'] = clf3.predict(test_word_features)
df.to_csv('./sub_lr_c4.csv',index=False)
Online (leaderboard) score: 0.9192
def single_model(clf,x_trn,y_trn,x_valid,y_valid,x_test,class_num):
    # `clf` is the xgboost module itself; build DMatrix objects for train/valid/test
    train_matrix = clf.DMatrix(x_trn,label = y_trn)
    valid_matrix = clf.DMatrix(x_valid,label = y_valid)
    test_matrix = clf.DMatrix(x_test)
    params = {'booster': 'gbtree',
              'objective':'multi:softmax',
              'num_class':class_num,
              'min_child_weight': 5,
              'max_depth': 8,
              'subsample': 0.5,
              'colsample_bytree': 0.5,
              'eta': 0.001,
              'seed': 2020,
              'nthread': 36,
              'silent': True,
              'tree_method':'gpu_hist'
              }
    # train with early stopping on the validation set, then predict with the best iteration
    watchlist = [(train_matrix,'train'),(valid_matrix,'eval')]
    model = clf.train(params,train_matrix,num_boost_round = 50000,evals = watchlist,verbose_eval=500, early_stopping_rounds=1000)
    val_pred = model.predict(valid_matrix,ntree_limit = model.best_ntree_limit)
    test_pred = model.predict(test_matrix,ntree_limit = model.best_ntree_limit)
    return val_pred,test_pred
val_pred,test_pred = single_model(xgb,x_trn,y_trn,x_valid,y_valid,test_word_features,14)
[21:37:22] WARNING: /workspace/src/learner.cc:480:
Parameters: { silent } might not be used.
This may not be accurate due to some parameters are only used in language bindings but
passed down to XGBoost core. Or some parameters are not used but slip through this
verification. Please open an issue if you find above cases.
[0] train-merror:0.98998 eval-merror:0.38230
Multiple eval metrics have been passed: 'eval-merror' will be used for early stopping.
Will train until eval-merror hasn't improved in 1000 rounds.
[500] train-merror:0.98363 eval-merror:0.98440
[1000] train-merror:0.99518 eval-merror:0.99525
Stopping. Best iteration:
[0] train-merror:0.98998 eval-merror:0.38230
val_pred
array([12., 12., 12., ..., 12., 12., 12.], dtype=float32)
print('val_f1score',f1_score(val_pred,y_valid,average='macro'))
val_f1score 0.0026352845214108475
df = pd.DataFrame()
df['label'] = test_pred
df.to_csv('./sub_xgb.csv',index=False)
1. Count Vectors + RidgeClassifier
2. TF-IDF + RidgeClassifier
3. TF-IDF + LogisticRegression
4. TF-IDF + XGBoost
Based on online validation (Tianchi NLP text classification learning competition: https://tianchi.aliyun.com/competition/gameList/coupleList),
we can see which of these approaches works best; the score can of course be improved later with more careful feature engineering and parameter tuning.
Unlike traditional machine learning, deep learning provides both feature extraction and classification.
The previous sections used several text-representation methods from traditional machine learning.
All of these methods have problems to some degree: the resulting vectors are very high-dimensional, so training takes a long time, and they only count words without modeling the relationships between them.
Unlike these representations, deep learning can also be used for text representation, and it can map text into a low-dimensional space. Typical examples are FastText, Word2Vec and BERT. This chapter introduces FastText; Word2Vec and BERT are covered later.
FastText is a typical deep-learning word-vector representation. It is very simple: an Embedding layer maps each word into a dense space, the embeddings of all words in a sentence are averaged, and the average is used for classification.
FastText is therefore a three-layer neural network: an input layer, a hidden layer and an output layer.
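As a rough illustration of that idea, here is a minimal sketch with random weights (my own toy example, not the actual fastText implementation): look up each word in an embedding table, average the word vectors, and feed the average to a softmax classifier.
import numpy as np
vocab_size, embed_dim, num_classes = 10000, 128, 14   # hypothetical sizes
rng = np.random.RandomState(2020)
embedding = rng.normal(size=(vocab_size, embed_dim))   # input -> embedding (hidden) layer
W = rng.normal(size=(embed_dim, num_classes))          # hidden -> output layer
b = np.zeros(num_classes)
def classify(token_ids):
    # average the word embeddings of the sentence, then apply a softmax classifier
    sentence_vec = embedding[token_ids].mean(axis=0)
    logits = sentence_vec @ W + b
    probs = np.exp(logits - logits.max())
    return probs / probs.sum()
print(classify([3, 57, 924]).argmax())   # predicted class for a toy "sentence"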
import fasttext
model = fasttext.train_unsupervised('data.txt', model='skipgram')
model = fasttext.train_unsupervised('data.txt', model='cbow')
where data.txt is a training file containing UTF-8 encoded text.
The returned model object represents the learned model, and you can use it to retrieve information.
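For example (continuing with the hypothetical model trained on data.txt above), you can inspect the vocabulary or look up a word vector; both calls exist in the fastText Python API, and "king" is just a placeholder word:
print(model.words[:10])               # first ten entries of the vocabulary
vec = model.get_word_vector("king")   # numpy array of length `dim`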
Saving and loading a model object
You can save the trained model object by calling the save_model function.
model.save_model("model_filename.bin")
and retrieve it later with load_model:
model = fasttext.load_model("model_filename.bin")
For more information on using fastText for word representations, see the official word-representation tutorial.
Text classification model
To train a text classifier with the method described here, we can use the fasttext.train_supervised function as follows:
import fasttext
model = fasttext.train_supervised('data.train.txt')
where data.train.txt is a text file containing one training sentence per line together with its labels. By default, labels are assumed to be words prefixed with the string __label__.
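For example, a single line of data.train.txt could look like the following (an illustrative line in the docs' own style, not taken from this competition's data):
__label__baking Which baking dish is best to bake a banana bread ?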
Once the model is trained, we can retrieve the lists of words and labels:
print(model.words)
print(model.labels)
To evaluate the model by computing precision at 1 (P@1) and recall on a test set, we use the test function:
def print_results(N, p, r):
print("N\t" + str(N))
print("P@{}\t{:.3f}".format(1, p))
print("R@{}\t{:.3f}".format(1, r))
print_results(*model.test('test.txt'))
We can also predict labels for a specific text:
model.predict("Which baking dish is best to bake a banana bread ?")
By default, predict returns only one label: the one with the highest probability. You can also predict more than one label by specifying the parameter k:
model.predict("Which baking dish is best to bake a banana bread ?", k=3)
If you want to predict more than one sentence, you can pass an array of strings:
model.predict(["Which baking dish is best to bake a banana bread ?", "Why not put knives in the dishwasher?"], k=3)
Of course, you can also save the model to a file and load it back, just as in the word-representation usage.
For more information on using fastText for text classification, see the official text-classification tutorial.
Quantization: compressing model files
When you want to save a supervised model file, fastText can compress it so that the model file is much smaller, at the cost of only a little performance.
With the model object, call:
model.quantize(input='data.train.txt', retrain=True)
print_results(*model.test(valid_data))
model.save_model("model_filename.ftz")
model_filename.ftz will be much smaller than model_filename.bin.
Important note: preprocessing data / encoding conventions
In general, it is important to preprocess the data properly; in particular, the example scripts in the root folder of the fastText repository do this.
fastText assumes UTF-8 encoded text. All text must be unicode for Python 2 and str for Python 3. The text passed in is encoded as UTF-8 by pybind11 before being handed to the fastText C++ library, so it is important to use UTF-8 encoded text when building a model. On Unix-like systems you can convert text with iconv.
fastText tokenizes (splits text into pieces) based on the following ASCII characters (bytes). In particular, it is not aware of UTF-8 whitespace, so it is advisable to convert UTF-8 whitespace / word boundaries into one of the following symbols where appropriate:
space
tab
vertical tab
carriage return
formfeed
the null character
The newline character is used to delimit lines of text. In particular, the EOS token is appended to a line of text if a newline character is encountered. The only exception is when the number of tokens exceeds the MAX_LINE_SIZE constant defined in the Dictionary header. This means that if you have text that is not separated by newlines, such as the fil9 dataset, it will be split into chunks of MAX_LINE_SIZE tokens and the EOS token will not be appended.
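As a minimal sketch of such preprocessing (my own illustration, not code from the fastText docs), you could replace any Unicode whitespace with a plain ASCII space before writing the training file:
import re
def normalize_whitespace(text):
    # on Python 3 str, \s also matches UTF-8 whitespace such as NO-BREAK SPACE (U+00A0)
    return re.sub(r'\s+', ' ', text).strip()
print(normalize_whitespace('Which\u00a0baking dish\tis best ?'))   # -> 'Which baking dish is best ?'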
import fasttext
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
!pip install fasttext --user
Looking in indexes: http://yum.tbsite.net/pypi/simple/
Requirement already satisfied: fasttext in /data/nas/workspace/envs/python3.6/site-packages (0.9.2)
Requirement already satisfied: numpy in /opt/conda/lib/python3.6/site-packages (from fasttext) (1.16.0)
Requirement already satisfied: pybind11>=2.2 in /opt/conda/lib/python3.6/site-packages (from fasttext) (2.4.3)
Requirement already satisfied: setuptools>=0.7.0 in /opt/conda/lib/python3.6/site-packages (from fasttext) (36.5.0.post20170921)
# generate train_data,valid_data,test_data
train_df = pd.read_csv('./data/train_set.csv',sep='\t')
test_df = pd.read_csv('./data/test_a.csv',sep='\t')
train_df['label_ft'] = '__label__'+train_df['label'].astype(str)
train_df[['text','label_ft']].iloc[:-5000].to_csv('train.csv',index = None,header = None,sep='\t')
input # training file path (required)
lr # learning rate [0.1]
dim # size of word vectors [100]
ws # size of the context window [5]
epoch # number of epochs [5]
minCount # minimal number of word occurrences [1]
minCountLabel # minimal number of label occurrences [1]
minn # min length of char ngram [0]
maxn # max length of char ngram [0]
neg # number of negatives sampled [5]
wordNgrams # max length of word ngram [1]
loss # loss function {ns, hs, softmax, ova} [softmax]
bucket # number of buckets [2000000]
thread # number of threads [number of cpus]
lrUpdateRate # change the rate of updates for the learning rate [100]
t # sampling threshold [0.0001]
label # label prefix ['__label__']
verbose # verbose [2]
pretrainedVectors # pretrained word vectors (.vec file) for supervised learning []
%%time
model = fasttext.train_supervised('fasttext_train.csv',lr = 1,dim = 128,epoch = 30,wordNgrams = 2,verbose = 2,minCount = 1,loss = 'hs')
CPU times: user 34min 1s, sys: 7.82 s, total: 34min 9s
Wall time: 5min 9s
for lr_ in range(1, 10, 2):
    print("Learning rate is:", lr_/10)
    for wordGram in [1, 2, 3]:
        print("wordNgrams is:", wordGram)
        model = fasttext.train_supervised("train.csv", lr=lr_/10, wordNgrams=wordGram, verbose=2, minCount=1, epoch=25, loss="hs")
        val_pred = [model.predict(x)[0][0].split("__")[-1] for x in train_df.iloc[-5000:]["text"]]
        print("f1_score is:", f1_score(train_df["label"].values[-5000:].astype(str), val_pred, average="macro"))
Learning rate is: 0.1
wordNgrams is: 1
f1_score is: 0.02086129071611565
wordNgrams is: 2
f1_score is: 0.02086129071611565
wordNgrams is: 3
f1_score is: 0.02086129071611565
Learning rate is: 0.3
wordNgrams is: 1
f1_score is: 0.06993473752659386
wordNgrams is: 2
f1_score is: 0.02086129071611565
wordNgrams is: 3
f1_score is: 0.02086129071611565
Learning rate is: 0.5
wordNgrams is: 1
f1_score is: 0.13512015797277438
wordNgrams is: 2
f1_score is: 0.0673949602061529
wordNgrams is: 3
f1_score is: 0.02086129071611565
Learning rate is: 0.7
wordNgrams is: 1
f1_score is: 0.21569654221769838
wordNgrams is: 2
f1_score is: 0.0811164143040247
wordNgrams is: 3
f1_score is: 0.06573324260153178
Learning rate is: 0.9
wordNgrams is: 1
f1_score is: 0.285284616967827
wordNgrams is: 2
f1_score is: 0.13446968595149617
wordNgrams is: 3
f1_score is: 0.07088819390612389