Data preprocessing in Python

A series of deep-learning experiments requires several preprocessing steps: 1) stopword removal, tokenization, and lemmatization with the nltk toolkit; 2) removing extra punctuation with regular expressions; 3) counting tokens and building a vocabulary with the collections module; 4) shuffling the data and generating data batches; 5) storing data with the pickle module; 6) computing evaluation metrics with the sklearn package; plus a few basic text feature extraction methods: 1) unigram + TF-IDF, 2) word2vec, 3) GloVe features.

The nltk toolkit

Tokenization and sentence splitting
from nltk.tokenize import WordPunctTokenizer, sent_tokenize
text = 'I love China'
tokenizer = WordPunctTokenizer()
split_text = tokenizer.tokenize(text)
# split into sentences first, then tokenize each sentence
sent = 'The store has only a few seats as it rather encourages customers to pre-order via a mobile app and come to the store just for the pick up. The new model will also benefit the coffee shop’s delivery service.'
split_sent = sent_tokenize(sent)
text = []
for sent in split_sent:
    split_text = tokenizer.tokenize(sent)
    text.append(split_text)
Lemmatization
from nltk.stem import WordNetLemmatizer
def word_lemmatizer(text):
    text_lemmatized = []
    lemmatizer = WordNetLemmatizer()
    for word in text:
        text_lemmatized.append(lemmatizer.lemmatize(word))
    return text_lemmatized
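By default, WordNetLemmatizer treats every word as a noun, so verbs such as 'were' or 'running' come back unchanged. A minimal sketch that passes POS tags along (the helper get_wordnet_pos is illustrative; the wordnet and averaged_perceptron_tagger data may need to be downloaded once via nltk.download):

from nltk import pos_tag
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer

def get_wordnet_pos(tag):
    # map a Penn Treebank tag to the WordNet POS constants (default: noun)
    if tag.startswith('J'):
        return wordnet.ADJ
    if tag.startswith('V'):
        return wordnet.VERB
    if tag.startswith('R'):
        return wordnet.ADV
    return wordnet.NOUN

lemmatizer = WordNetLemmatizer()
tagged = pos_tag(['the', 'cats', 'were', 'running'])
lemmas = [lemmatizer.lemmatize(w, get_wordnet_pos(t)) for w, t in tagged]
# -> ['the', 'cat', 'be', 'run']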
Stopword removal
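A minimal sketch using nltk's built-in English stopword list (the corpus may need to be downloaded once with nltk.download('stopwords')):

from nltk.corpus import stopwords

def remove_stopwords(words):
    # filter out common function words such as 'the', 'is', 'of'
    stop_set = set(stopwords.words('english'))
    return [w for w in words if w.lower() not in stop_set]

print(remove_stopwords(['I', 'love', 'the', 'city', 'of', 'Beijing']))
# -> ['love', 'city', 'Beijing']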

The re regex module

The re module provides the re.sub function for finding and replacing matches in a string.

re.sub(pattern, repl, string, count=0, flags=0)

Here, pattern is the regular-expression pattern string; repl is the replacement string (or a function that returns one); string is the original string to be searched and replaced; count is the maximum number of replacements, where the default of 0 means every match is replaced.

import re

def clean_str(string):
    """
    Tokenization/string cleaning: keep letters, digits and a few punctuation
    marks, then strip the remaining punctuation and collapse extra whitespace.
    """
    string = re.sub(r"[^A-Za-z0-9(),!?\'\`]", " ", string)
    string = re.sub(r",", " ", string)
    string = re.sub(r"!", " ", string)
    string = re.sub(r"\(", " ", string)
    string = re.sub(r"\)", " ", string)
    string = re.sub(r"\?", " ", string)
    string = re.sub(r"\s{2,}", " ", string)
    return string.strip().lower()
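For example:

print(clean_str("Hello,   World! (test?)"))  # -> "hello world test"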

Building a vocabulary with collections

import collections

# count word frequencies and sort in descending order
count = collections.Counter(x_data)
x_count = sorted(count.items(), key=lambda x: -x[1])
words, _ = zip(*x_count)
# keep only the top-k most frequent words in the vocabulary
words = words[:1000]
# reserve index 0 for padding and index 1 for out-of-vocabulary words
word_to_index = {'PAD': 0, 'UNK': 1}
word_to_index.update(dict(zip(words, range(2, len(words) + 2))))
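With this dictionary a tokenized sentence can be mapped to integer ids, falling back to UNK for unseen words and padding to a fixed length (the helper and the max_len value below are illustrative):

def to_ids(tokens, word_to_index, max_len=50):
    # map tokens to ids, using UNK (1) for out-of-vocabulary words
    ids = [word_to_index.get(w, word_to_index['UNK']) for w in tokens]
    # truncate, then pad with PAD (0) up to a fixed length
    ids = ids[:max_len]
    ids += [word_to_index['PAD']] * (max_len - len(ids))
    return ids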

Data storage modules

import pickle
# Note: the file must be opened in binary mode for both reading and writing
pickle.load(p_file)
pickle.dump(obj, p_file)  # an optional protocol argument can also be passed

pickle can sometimes raise a MemoryError when writing large data files, in which case joblib can be used instead:

import joblib  # older scikit-learn versions exposed this as sklearn.externals.joblib
# the file can use a .pkl extension
joblib.load(file_path)
joblib.dump(obj, file_path)
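A minimal round trip with joblib (the object and file name below are placeholders):

word_dict = {'PAD': 0, 'UNK': 1, 'china': 2}
joblib.dump(word_dict, 'word_dict.pkl')   # write the object to disk
restored = joblib.load('word_dict.pkl')   # read it back
assert restored == word_dict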

A function that generates data batches

import random

def generate_batch(x_data, y_data, batch_size):
    print('generating batches')
    x_batches = []
    y_batches = []
    # shuffle the sample indices so batches are drawn in random order
    p = list(range(len(x_data)))
    random.shuffle(p)
    for i in range(int(len(y_data) / batch_size)):
        x_batch = []
        y_batch = []
        for j in range(batch_size):
            pos = i * batch_size + j
            x_batch.append(x_data[p[pos]])
            y_batch.append(y_data[p[pos]])
        x_batches.append(x_batch)
        y_batches.append(y_batch)

    return x_batches, y_batches
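For example (a batch_size of 32 is an arbitrary choice):

x_batches, y_batches = generate_batch(x_data, y_data, batch_size=32)
for x_batch, y_batch in zip(x_batches, y_batches):
    pass  # feed each x_batch / y_batch pair to the model here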
# An alternative: shuffle the data before batching; re-seeding with the same
# value makes both shuffles apply the identical permutation to x and y
import random
rand = random.randint(0, 100)
random.seed(rand)
random.shuffle(x_data)
random.seed(rand)
random.shuffle(y_data)

Evaluation metrics

import sklearn.metrics as metrics
acc = metrics.accuracy_score(y_data, y_pred)
print("accuracy {:g}".format(acc))
# calculate precision (macro averaging for consistency with recall and F1 below)
prec = metrics.precision_score(y_data, y_pred, average="macro")
print("precision {:g}".format(prec))
# calculate recall
recal = metrics.recall_score(y_data, y_pred, average="macro")
print("recall {:g}".format(recal))
# calculate F1
F1 = metrics.f1_score(y_data, y_pred, average="macro")
print("F1 {:g}".format(F1))

Basic text feature extraction methods

unigram+TF-IDF

from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
def unigram_process(data):
    vectorizer = CountVectorizer()
    vector = vectorizer.fit(data)
    return vector

def tfidf_process(data):
    transformer = TfidfTransformer()
    transform = transformer.fit(data)
    return transform
# x should be an iterable of raw document strings
x = ['I love China', 'I love Beijing']
uni_vectorizer = unigram_process(x)
x_uni = uni_vectorizer.transform(x)
tf_uni_vectorizer = tfidf_process(x_uni)
x_tf_uni = tf_uni_vectorizer.transform(x_uni)
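
word2vec features

Word vectors can be trained on one's own corpus with gensim. A minimal sketch, assuming gensim 4.x where the dimension parameter is called vector_size (older versions use size):

from gensim.models import Word2Vec

# sentences: a list of tokenized sentences, e.g. the output of the nltk step above
sentences = [['i', 'love', 'china'], ['i', 'love', 'beijing']]
model = Word2Vec(sentences, vector_size=100, window=5, min_count=1, workers=4)

vec = model.wv['china']                    # the 100-dimensional vector for a word
model.wv.save_word2vec_format('w2v.txt')   # save in the plain-text word2vec format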

GloVe features

First download the pretrained GloVe vectors:
http://nlp.stanford.edu/data/glove.6B.zip
Then build an embedding matrix that maps the vocabulary constructed above (see the previous section) to GloVe vectors:

import numpy as np

def load_glove(glove_file, emb_size, vocab):
  print('Loading Glove pre-trained word embeddings ...')
  embedding_weights = {}
  f = open(glove_file, encoding='utf-8')
  for line in f:
    values = line.split()
    word = values[0]
    vector = np.asarray(values[1:], dtype='float32')
    embedding_weights[word] = vector
  f.close()
  print('Total {} word vectors in {}'.format(len(embedding_weights), glove_file))

  # start from small random values; words not found in GloVe keep this initialization
  embedding_matrix = np.random.uniform(-0.5, 0.5, (len(vocab), emb_size)) / emb_size

  oov_count = 0
  for word, i in vocab.items():
    embedding_vector = embedding_weights.get(word)
    if embedding_vector is not None:
      embedding_matrix[i] = embedding_vector
    else:
      oov_count += 1
  print('Number of OOV words = %d' % oov_count)

  return embedding_matrix
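For example, with the word_to_index dictionary built earlier and the 100-dimensional vectors (assuming the glove.6B archive has been unzipped locally):

glove_embed = load_glove('glove.6B.100d.txt', 100, word_to_index)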

Use the GloVe embedding matrix as the initializer of the model's word_embedding variable (TensorFlow 1.x):

  self.embedding_matrix = tf.get_variable(name='embedding_matrix',
                                          shape=[self.vocab_size, self.emb_size],
                                          # self.pretrained_embs = glove_embed
                                          initializer=tf.constant_initializer(self.pretrained_embs),
                                          dtype=tf.float32)
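
The matrix can then be used to look up vectors for a batch of word ids (input_ids below is a placeholder for an integer tensor of word indices):

  # input_ids: an int32 tensor of shape [batch_size, max_len] holding word indices
  embedded_inputs = tf.nn.embedding_lookup(self.embedding_matrix, input_ids)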
