Dataset download: https://pan.baidu.com/s/1zxrKtTYli2iQgK1iNVP9PQ (extraction code: la3w)
In fact, fastText is very similar to the CBOW model: it is essentially just a simple, shallow neural network.
N-gram subwords
Training on whole words as independent units causes several problems.
Low-frequency and rare words do not receive enough training, so their vectors are poor;
Out-of-vocabulary words (words never seen before, or misspelled words) are entirely beyond traditional models.
Instead, each word is broken down to the character level, and character-level n-gram information is used to capture the ordering relationships between characters.
Rationale: Western languages often build words from prefixes, suffixes and roots, and Chinese likewise has a tradition of single characters carrying meaning.
Take the word "google" as an example: to mark the word's boundaries we add the two characters < and >, turning it into "<google>". If we extract all tri-gram information, we obtain the set G = {<go, goo, oog, ogl, gle, le>}.
During training, each n-gram gets its own vector, and the vector of the original full word is obtained by summing the vectors of all of its n-grams. The word vectors and the character-level n-gram vectors are then summed and averaged together to form the model's input.
Experimentally, adding subword n-gram information not only solves the representation problem for low-frequency and out-of-vocabulary words, but also usually brings a few percentage points of improvement on the final task. The only drawback is that more parameters have to be estimated, so the model can become rather bloated.
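As a small illustration of the idea (not the library's actual implementation), the sketch below extracts boundary-marked character tri-grams for a word and builds its word vector by summing hypothetical n-gram vectors; the vector size and the random embedding table are made up for the example.

import numpy as np

def char_ngrams(word, n=3):
    # Add the boundary markers < and >, then slide a window of size n over the characters
    marked = f"<{word}>"
    return [marked[i:i + n] for i in range(len(marked) - n + 1)]

print(char_ngrams("google"))  # ['<go', 'goo', 'oog', 'ogl', 'gle', 'le>']

# Hypothetical n-gram embedding table: each tri-gram maps to a small random vector
rng = np.random.default_rng(0)
ngram_vectors = {g: rng.normal(size=8) for g in char_ngrams("google")}

# The full word's vector is the sum of the vectors of all of its n-grams
word_vector = np.sum([ngram_vectors[g] for g in char_ngrams("google")], axis=0)
print(word_vector.shape)  # (8,)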
import os
import re
import jieba
import pandas as pd
import tensorflow as tf
import numpy as np
tf.__version__
root='data/百度题库/高中_历史/origin'
ancient_his_df=pd.read_csv(os.path.join(root,'古代史.csv'))
contemporary_his_df=pd.read_csv(os.path.join(root,'现代史.csv'))
modern_his_df=pd.read_csv(os.path.join(root,'近代史.csv'))
ancient_his_df['label']='__label__古代史'
contemporary_his_df['label']='__label__现代史'
modern_his_df['label']='__label__近代史'
def load_stop_words(stop_word_path):
    # Read the stop-word list, one word per line
    with open(stop_word_path,'r',encoding='utf-8') as file:
        stop_words=file.readlines()
    stop_words=[stop_word.strip() for stop_word in stop_words]
    return stop_words
stopwords_path='data/stopwords/哈工大停用词表.txt'
def clean_sentence(line):
    # Strip letters, digits, half- and full-width punctuation and the word "题目", then segment with jieba
    line = re.sub(
        "[a-zA-Z0-9]|[\s+\-\|\!\/\[\]\{\}_,.$%^*(+\"\')]+|[::+——()?【】《》“”!,。?、~@#¥%……&*()]+|题目", '',line)
    tokens = jieba.cut(line, cut_all=False)
    return tokens
stop_words=load_stop_words(stopwords_path)
def sentence_proc(sentence):
    # Clean, tokenise and drop stop words, then join back into a space-separated string
    words=clean_sentence(sentence)
    words=[word for word in words if word not in stop_words]
    return ' '.join(words)
ancient_his_df['item']=ancient_his_df['item'].apply(sentence_proc)
contemporary_his_df['item']=contemporary_his_df['item'].apply(sentence_proc)
modern_his_df['item']=modern_his_df['item'].apply(sentence_proc)
dataset_df=pd.concat([ancient_his_df,contemporary_his_df,modern_his_df])
# fastText section
max_features=20000
class_num=3
ngram_range=2
from tensorflow.keras import preprocessing
from tensorflow.keras.preprocessing import sequence
from sklearn.preprocessing import OneHotEncoder,LabelBinarizer
from tensorflow.keras import Input,Model
from tensorflow.keras.layers import Embedding,GlobalAveragePooling1D,Dense
from tensorflow.keras.callbacks import EarlyStopping
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
corpus=dataset_df['item']
text_preprocesser=preprocessing.text.Tokenizer(num_words=max_features,oov_token='' )
text_preprocesser.fit_on_texts(corpus)
x=text_preprocesser.texts_to_sequences(corpus)
word_dict=text_preprocesser.word_index
# Save the vocabulary
with open('data/vocab.txt','w',encoding='UTF8') as f:
    for k,v in word_dict.items():
        f.write(f'{k}\t{str(v)}\n')
lb=LabelBinarizer()
lb.fit(dataset_df['label'])
print(lb.classes_)
y=lb.transform(dataset_df['label'])  # convert labels to one-hot encoding
# Split into training and test sets
X_train, X_test, y_train, y_test=train_test_split(x,y,test_size=0.2,random_state=42)
# The two functions below add n-gram features. In practice the n-grams should be filtered, otherwise results can suffer (some combinations are very low-frequency and do not help classification); a possible filtering sketch follows the two functions.
def create_ngram_set(input_list,ngram_value=2):
    # Collect all contiguous n-grams (as tuples of token ids) from one sequence
    return set(zip(*[input_list[i:] for i in range(ngram_value)]))
def add_ngram(sequences,token_indice,ngram_range=2):
    # Append the id of every known n-gram to the end of each sequence
    new_sequences=[]
    for input_list in sequences:
        new_list=input_list[:]
        for ngram_value in range(2,ngram_range+1):
            for i in range(len(new_list)-ngram_value+1):
                ngram=tuple(new_list[i:i+ngram_value])
                if ngram in token_indice:
                    new_list.append(token_indice[ngram])
        new_sequences.append(new_list)
    # Return a plain list: sequences have different lengths, so a ragged ndarray would fail on recent numpy
    return new_sequences
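The filtering mentioned above is not implemented in this post. As a rough sketch of how it could be done (the min_count threshold and the Counter-based counting are assumptions, not part of the original pipeline), low-frequency n-grams could be dropped before building token_indice:

from collections import Counter

def filter_ngrams(sequences, ngram_range=2, min_count=5):
    # Count every n-gram in the training sequences and keep only those
    # that appear at least min_count times (min_count is an assumed threshold)
    counts = Counter()
    for input_list in sequences:
        for n in range(2, ngram_range + 1):
            counts.update(zip(*[input_list[i:] for i in range(n)]))
    return {ngram for ngram, c in counts.items() if c >= min_count}

# Example usage: build token_indice only from frequent n-grams
# ngram_set = filter_ngrams(X_train, ngram_range, min_count=5)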
if ngram_range>1:
    print('adding {}-gram features'.format(ngram_range))
    # Build the set of all n-grams that appear in the training data
    ngram_set=set()
    for input_list in X_train:
        for i in range(2,ngram_range+1):
            set_of_ngram=create_ngram_set(input_list,ngram_value=i)
            ngram_set.update(set_of_ngram)
    # Map each n-gram to a new integer id placed after the existing vocabulary
    start_index=max_features+1
    token_indice={v:k+start_index for k,v in enumerate(ngram_set)}
    indice_token={token_indice[k]:k for k in token_indice}
    # max_features becomes the largest id so the Embedding layer is large enough
    max_features=np.max(list(indice_token.keys()))+1
    X_train=add_ngram(X_train,token_indice,ngram_range)
    X_test=add_ngram(X_test,token_indice,ngram_range)
print('average train sequence length:{}'.format(np.mean(list(map(len,X_train)),dtype=int)))
print('average test sequence length:{}'.format(np.mean(list(map(len,X_test)),dtype=int)))
maxlen=int(np.ceil(max(np.mean(list(map(len,X_train))),np.mean(list(map(len,X_test))))))
print('average length:{}'.format(maxlen))
#average train sequence length:256
#average test sequence length:220
#average length:257
X_train=sequence.pad_sequences(X_train,maxlen=maxlen,padding='post',truncating='post')
X_test=sequence.pad_sequences(X_test,maxlen=maxlen,padding='post',truncating='post')
print('x_train shape:',X_train.shape)
print('x_test shape:',X_test.shape)
#x_train shape: (3976, 257)
#x_test shape: (994, 257)
#fastText classifier:
#1. The first layer is an Embedding layer (batch, sequence, embedding)
#2. A GlobalAveragePooling1D layer averages the embeddings over the sequence dimension
#3. The last layer is a Dense output layer (class_num nodes here) with a sigmoid activation
# Build the model
class FastText(object):
    def __init__(self,maxlen,max_features,embedding_dims,class_num=1,
                 last_activation='sigmoid'):
        self.maxlen=maxlen
        self.max_features=max_features
        self.embedding_dims=embedding_dims
        self.class_num=class_num
        self.last_activation=last_activation

    def get_model(self):
        inputs=Input((self.maxlen,))
        embedding=Embedding(self.max_features,self.embedding_dims,input_length=self.maxlen)(inputs)
        x=GlobalAveragePooling1D()(embedding)
        output=Dense(self.class_num,activation=self.last_activation)(x)
        model=Model(inputs=inputs,outputs=output)
        return model
# Train and evaluate
batch_size=128
embedding_dims=300
epochs=20
model=FastText(maxlen,max_features,embedding_dims,class_num).get_model()
model.compile('adam','binary_crossentropy',metrics=['accuracy'])
model.summary()
print('Train...')
early_stopping=EarlyStopping(monitor='val_accuracy',patience=5,mode='max')
history=model.fit(X_train,y_train,
                  batch_size=batch_size,
                  epochs=epochs,
                  workers=32,
                  use_multiprocessing=True,
                  callbacks=[early_stopping],
                  validation_data=(X_test,y_test))
from sklearn.metrics import classification_report,multilabel_confusion_matrix,confusion_matrix
results=model.evaluate(X_test,y_test,verbose=2)
print(results)
#994/1 - 4s - loss: 0.2895 - accuracy: 0.8115
#[0.3930790154506983, 0.8115359]
import matplotlib.pyplot as plt
history_dict=history.history
history_dict.keys()
acc=history_dict['accuracy']
val_acc=history_dict['val_accuracy']
loss=history_dict['loss']
val_loss=history_dict['val_loss']
epochs=range(1,len(acc)+1)
plt.plot(epochs,loss,'bo',label='Train loss')
plt.plot(epochs,val_loss,'b',label='Validation loss')
plt.legend()
plt.show()
print('test...')
y_pred=model.predict(X_test)
y_pred=y_pred.argmax(axis=1)
y_true=y_test.argmax(axis=1)
print(classification_report(y_true,y_pred))
confusion_matrix(y_true,y_pred)
#test...
# precision recall f1-score support
#
# 0 0.91 0.84 0.87 213
# 1 0.67 0.75 0.71 451
# 2 0.65 0.58 0.61 330
#
# accuracy 0.71 994
# macro avg 0.74 0.72 0.73 994
#weighted avg 0.72 0.71 0.71 994
#The figures below show the model structure and the loss curves over training.