To run a series of deep-learning experiments, the following building blocks are needed: 1) stopword removal, tokenization, and lemmatization with the nltk toolkit; 2) removing extra punctuation with regular expressions; 3) counting tokens with the collections module and building a vocabulary; 4) randomly shuffling the data to generate batches (data_batch); 5) persisting data with the pickle module; 6) computing evaluation metrics with the sklearn package; plus some basic text-feature extraction methods: 1) unigram + TF-IDF, 2) word2vec, and 3) GloVe features.
from nltk.tokenize import WordPunctTokenizer, sent_tokenize
text = 'I love China'
tokenizer = WordPunctTokenizer()
split_text = tokenizer.tokenize(text)
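For this input, split_text comes back as ['I', 'love', 'China'].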
# Split into sentences first, then tokenize each sentence
sent = 'The store has only a few seats as it rather encourages customers to pre-order via a mobile app and come to the store just for the pick up. The new model will also benefit the coffee shop’s delivery service.'
split_sent = sent_tokenize(sent)
text = []
for sent in split_sent:
    split_text = tokenizer.tokenize(sent)
    text.append(split_text)
from nltk.stem import WordNetLemmatizer

def word_lemmatizer(text):
    text_lemmatized = []
    lemmatizer = WordNetLemmatizer()
    for word in text:
        # without a POS tag, lemmatize() treats every word as a noun
        text_lemmatized.append(lemmatizer.lemmatize(word))
    return text_lemmatized
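The introduction also mentions stopword removal; a minimal sketch using nltk's English stopword list (assuming the corpus has been fetched beforehand with nltk.download('stopwords')):

from nltk.corpus import stopwords

def remove_stopwords(tokens):
    # drop common English stopwords from a list of tokens
    stop_set = set(stopwords.words('english'))
    return [w for w in tokens if w.lower() not in stop_set]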
The re module provides re.sub for finding and replacing matches in a string:
re.sub(pattern, repl, string, count=0, flags=0)
Here pattern is the regular-expression pattern, repl is the replacement string (or a function that computes it), string is the original string to search, and count is the maximum number of replacements; the default of 0 means replace every match.
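For instance, count limits how many matches are replaced (assuming the re module has been imported):

re.sub(r'o', '0', 'foo boo')            # -> 'f00 b00' (every match replaced)
re.sub(r'o', '0', 'foo boo', count=1)   # -> 'f0o boo' (only the first match replaced)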
import re

def clean_str(string):
    """
    Tokenization/string cleaning: keep letters, digits and a few punctuation marks,
    then replace the remaining punctuation with spaces and collapse extra whitespace.
    """
    string = re.sub(r"[^A-Za-z0-9(),!?\'\`]", " ", string)
    string = re.sub(r",", " ", string)
    string = re.sub(r"!", " ", string)
    string = re.sub(r"\(", " ", string)
    string = re.sub(r"\)", " ", string)
    string = re.sub(r"\?", " ", string)
    string = re.sub(r"\s{2,}", " ", string)
    return string.strip().lower()
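For example:

clean_str("Hello, world!!")   # -> 'hello world'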
import collections

# count token frequencies and sort in descending order
count = collections.Counter(x_data)
x_count = sorted(count.items(), key=lambda x: -x[1])
words, _ = zip(*x_count)
# keep only the top-k most frequent words in the dictionary
words = words[:1000]
# reserve index 0 for padding and index 1 for unknown words
word_to_index = {'PAD': 0, 'UNK': 1}
word_to_index.update(zip(words, range(2, len(words) + 2)))
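Each tokenized sentence can then be mapped to an index sequence, with out-of-vocabulary words falling back to UNK (sent_tokens below stands for any list of tokens):

sent_ids = [word_to_index.get(w, word_to_index['UNK']) for w in sent_tokens]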
import pickle
# Note: the file must be opened in binary mode for both reading and writing
obj = pickle.load(p_file)
pickle.dump(obj, p_file)   # an optional third argument selects the pickle protocol
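A minimal round trip (the file name data.pkl is only an illustration):

with open('data.pkl', 'wb') as p_file:
    pickle.dump(obj, p_file)
with open('data.pkl', 'rb') as p_file:
    obj = pickle.load(p_file)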
The pickle module can raise a MemoryError when dumping large data files, so joblib is an alternative storage option:
from sklearn.externals import joblib   # in scikit-learn >= 0.23 use `import joblib` directly
# the file can use a .pkl suffix
joblib.load(file_path)
joblib.dump(obj, file_path)
import random

def generate_batch(x_data, y_data, batch_size):
    print('generating batches')
    x_batches = []
    y_batches = []
    # shuffle the indices so each batch draws samples in random order
    p = list(range(len(x_data)))
    random.shuffle(p)
    for i in range(int(len(y_data) / batch_size)):
        x_batch = []
        y_batch = []
        for j in range(batch_size):
            pos = i * batch_size + j
            x_batch.append(x_data[p[pos]])
            y_batch.append(y_data[p[pos]])
        x_batches.append(x_batch)
        y_batches.append(y_batch)
    return x_batches, y_batches
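For example (a batch size of 32 is just an illustration):

x_batches, y_batches = generate_batch(x_data, y_data, batch_size=32)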
# An alternative: shuffle the data itself before generating batches
import random
# seeding with the same value before each shuffle keeps x_data and y_data aligned
rand = random.randint(0, 100)
random.seed(rand)
random.shuffle(x_data)
random.seed(rand)
random.shuffle(y_data)
import sklearn.metrics as metrics
acc = metrics.accuracy_score(y_data, y_pred)
print("accuracy {:g}".format(acc))
# calculate precision
prec = metrics.precision_score(y_data, y_pred, average="macro")
print("precision {:g}".format(prec))
# calculate recall
recal = metrics.recall_score(y_data, y_pred, average="macro")
print("recall {:g}".format(recal))
# calculate F1
F1 = metrics.f1_score(y_data, y_pred, average="macro")
print("F1 {:g}".format(F1))
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer

def unigram_process(data):
    # fit a bag-of-words (unigram) count vectorizer on the corpus
    vectorizer = CountVectorizer()
    vector = vectorizer.fit(data)
    return vector

def tfidf_process(data):
    # fit a TF-IDF transformer on the count matrix
    transformer = TfidfTransformer()
    transform = transformer.fit(data)
    return transform
# x is the corpus: an iterable of raw document strings (placeholder examples reused from above)
x = ['I love China', 'The store has only a few seats.']
uni_vectorizer = unigram_process(x)
x_uni = uni_vectorizer.transform(x)
tf_uni_vectorizer = tfidf_process(x_uni)
x_tf_uni = tf_uni_vectorizer.transform(x_uni)
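The introduction also lists word2vec features; a minimal sketch using gensim (the hyperparameters are placeholders, and in gensim >= 4.0 the dimensionality argument is vector_size rather than size):

from gensim.models import Word2Vec

# `text` is the list of tokenized sentences produced by the nltk section above
w2v_model = Word2Vec(sentences=text, vector_size=100, window=5, min_count=1)
store_vec = w2v_model.wv['store']   # the 100-dimensional vector for the word 'store'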
First, download the GloVe vectors:
http://nlp.stanford.edu/data/glove.6B.zip
Then, combining the GloVe vectors with the vocabulary built from your own data (see the section above), build a GloVe embedding matrix:
import numpy as np

def load_glove(glove_file, emb_size, vocab):
    print('Loading Glove pre-trained word embeddings ...')
    embedding_weights = {}
    f = open(glove_file, encoding='utf-8')
    for line in f:
        values = line.split()
        word = values[0]
        vector = np.asarray(values[1:], dtype='float32')
        embedding_weights[word] = vector
    f.close()
    print('Total {} word vectors in {}'.format(len(embedding_weights), glove_file))
    # start from small random values; rows for in-vocabulary words are overwritten below
    embedding_matrix = np.random.uniform(-0.5, 0.5, (len(vocab), emb_size)) / emb_size
    oov_count = 0
    for word, i in vocab.items():
        embedding_vector = embedding_weights.get(word)
        if embedding_vector is not None:
            embedding_matrix[i] = embedding_vector
        else:
            oov_count += 1
    print('Number of OOV words = %d' % oov_count)
    return embedding_matrix
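For example, with the 100-dimensional vectors from the archive above and the word_to_index dictionary built earlier (the file name assumes the zip has been extracted in place):

glove_embed = load_glove('glove.6B.100d.txt', 100, word_to_index)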
Finally, use the GloVe embedding matrix as the initializer of the model's word-embedding variable:
# self.pretrained_embs holds the matrix returned by load_glove()
self.embedding_matrix = tf.get_variable(name='embedding_matrix',
                                        shape=[self.vocab_size, self.emb_size],
                                        initializer=tf.constant_initializer(self.pretrained_embs),
                                        dtype=tf.float32)
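The variable can then be fed into an embedding lookup over the input token ids (self.input_ids is an assumed placeholder holding word indices):

self.embedded_inputs = tf.nn.embedding_lookup(self.embedding_matrix, self.input_ids)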