代码记录(中文NLP任务)

预处理代码

分词

输入:一句话(字符串)
输出:去除标点后的词列表(list)

from pyltp import Segmentor
from zhon.hanzi import punctuation
def word_cut(sentence):
    """Segment one sentence with LTP and drop punctuation tokens.

    A new ``Segmentor`` is created, loaded and released on every call.

    Args:
        sentence: The text to segment.

    Returns:
        list: The tokens returned by the segmentor, without the tokens that
        occur in ``zhon.hanzi.punctuation``.
    """
    # Local import: the import lines that accompany this snippet do not
    # bring ``os`` into scope.
    import os

    LTP_DATA_DIR = 'C:\\Users\\d84105613\\ltp_data'
    cws_model_path = os.path.join(LTP_DATA_DIR, 'cws.model')
    segmentor = Segmentor()  # create the segmentor instance
    # NOTE(review): 'lexicon' is presumably the path of a user lexicon file,
    # resolved relative to the working directory -- confirm.
    segmentor.load_with_lexicon(cws_model_path, 'lexicon')  # load the model
    try:
        # Copy the result into a plain list before the model is released.
        words = list(segmentor.segment(sentence))
    finally:
        # Release the loaded model even when segmentation raises.
        segmentor.release()
    return [c for c in words if c not in punctuation]

加载词向量

输入:词向量文本文件,第一行是 voc_size 和 emb_size(以空格分隔),
其余每行是一个词及其词向量(以空格分隔)
输出:词表(首位额外加入 "unk")、词向量矩阵(首行为 "unk" 对应的全零向量)

def loadWord2Vec(filename):
    """Load word vectors from a space-separated text file.

    The first line of the file holds two integers, the vocabulary size and
    the embedding size. Every following line holds a word followed by its
    vector components. An extra ``"unk"`` entry with an all-zero vector is
    placed at index 0.

    Args:
        filename: Path of the UTF-8 encoded word-vector file.

    Returns:
        tuple: ``(vocab, embd)`` where ``vocab`` is a list of words starting
        with ``"unk"`` and ``embd`` is a float NumPy array with one row per
        entry of ``vocab``.
    """
    with open(filename, 'r', encoding='utf-8') as fr:
        header = fr.readline().strip()
        # The second header field is the embedding dimension.
        word_dim = int(header.split(' ')[1])
        vocab = ["unk"]
        embd = [[0] * word_dim]
        for line in fr:
            stripped = line.strip()
            if not stripped:
                # A blank line would add an empty row and break the
                # conversion to a rectangular array below.
                continue
            row = stripped.split(' ')
            vocab.append(row[0])
            embd.append(row[1:])
    print("loaded word2vec")
    # The builtin float is used because the np.float alias no longer exists
    # in current NumPy releases.
    embd = np.asarray(embd).astype(float)
    return vocab, embd

读取文本数据

输入:txt文件或者其他类型文本,每行格式为 sentence1 \t sentence2 \t label
输出:sentence1 列表、sentence2 列表、label 列表(均为 numpy 数组)

def getTsvData(filepath):
    """Read tab-separated sentence pairs and their integer labels.

    Each usable line has three tab-separated columns: the two sentences
    whose similarity is to be computed, followed by the class label. Lines
    with fewer than three columns are skipped.

    Args:
        filepath: Path of the UTF-8 encoded input file.

    Returns:
        tuple: ``(x1, x2, y)`` as NumPy arrays of equal length. For every
        line the two sentences are assigned to ``x1`` and ``x2`` in random
        order; ``y`` holds the labels converted with ``int``.

    Raises:
        ValueError: If the third column of a line is not an integer.
    """
    print("Loading training data from " + filepath)
    x1 = []
    x2 = []
    y = []
    with open(filepath, encoding='utf-8') as data_file:
        for line in data_file:
            fields = line.strip().split("\t")
            # The label is read from fields[2], so three columns are needed.
            if len(fields) < 3:
                continue
            # Randomly swap the positions of the two sentences.
            if random() > 0.5:
                x1.append(fields[0])
                x2.append(fields[1])
            else:
                x1.append(fields[1])
                x2.append(fields[0])
            y.append(int(fields[2]))
    return np.asarray(x1), np.asarray(x2), np.asarray(y)

一个完整的使用tf转换句子成词id的实例

from random import random
import numpy as np
import os
from pyltp import Segmentor
from zhon.hanzi import punctuation
from tensorflow.contrib import learn

# Directory that holds the LTP model files (machine-specific absolute path).
LTP_DATA_DIR = 'C:\\Users\\d84105613\\ltp_data'
# Path of the model file handed to the segmentor below.
cws_model_path = os.path.join(LTP_DATA_DIR, 'cws.model')
# One module-level segmentor is created and loaded at import time so that
# word_cut() can reuse it for every sentence.
segmentor = Segmentor()  # create the segmentor instance
# NOTE(review): 'lexicon' is presumably the path of a user lexicon file,
# resolved relative to the working directory -- confirm.
segmentor.load_with_lexicon(cws_model_path, 'lexicon')  # load the model
def getTsvData(filepath):
    """Read tab-separated sentence pairs and their integer labels.

    Each usable line has three tab-separated columns: the two sentences
    whose similarity is to be computed, followed by the class label. Lines
    with fewer than three columns are skipped.

    Args:
        filepath: Path of the UTF-8 encoded input file.

    Returns:
        tuple: ``(x1, x2, y)`` as NumPy arrays of equal length. For every
        line the two sentences are assigned to ``x1`` and ``x2`` in random
        order; ``y`` holds the labels converted with ``int``.

    Raises:
        ValueError: If the third column of a line is not an integer.
    """
    print("Loading training data from " + filepath)
    x1 = []
    x2 = []
    y = []
    with open(filepath, encoding='utf-8') as data_file:
        for line in data_file:
            fields = line.strip().split("\t")
            # The label is read from fields[2], so three columns are needed.
            if len(fields) < 3:
                continue
            # Randomly swap the positions of the two sentences.
            if random() > 0.5:
                x1.append(fields[0])
                x2.append(fields[1])
            else:
                x1.append(fields[1])
                x2.append(fields[0])
            y.append(int(fields[2]))
    return np.asarray(x1), np.asarray(x2), np.asarray(y)


def word_cut(sentences):
    """Lazily tokenize each sentence with the module-level segmentor.

    Args:
        sentences: Iterable of sentences to segment.

    Yields:
        list: The tokens of one sentence, without the tokens that occur in
        ``punctuation``.
    """
    for text in sentences:
        # The segmentor is not released here; it is reused for every sentence.
        tokens = list(segmentor.segment(text))
        yield [tok for tok in tokens if tok not in punctuation]

if __name__ == '__main__':
    # End-to-end example: read sentence pairs, build a vocabulary over all
    # sentences and convert every sentence into a row of word ids.
    x1_text, x2_text, y = getTsvData("atec_nlp_sim_data.txt")
    vocab = learn.preprocessing.VocabularyProcessor(
        max_document_length=15, min_frequency=2, tokenizer_fn=word_cut)
    a = vocab.fit_transform(np.concatenate((x2_text, x1_text), axis=0))
    # fit_transform yields its rows lazily and can only be consumed once, so
    # it is materialized a single time and the list is reused below.
    doc = list(a)
    # Shows the converted word ids.
    print(doc)
    # Shows the segmented sentences mapped back from the ids; shorter
    # sentences are padded automatically.
    for i in vocab.reverse(doc):
        print(i)

使用训练好的词向量去生成embedding层

# Build the initial embedding matrix: one row per vocabulary entry, filled
# with small random values so that words without a pre-trained vector still
# get a row. (Alternative kept from the original: start from np.zeros with
# the same shape.)
initW = np.random.uniform(-0.25, 0.25, (len(vocab_processor.vocabulary_), FLAGS.embedding_dim))
# Overwrite the rows of the words that have a pre-trained word2vec vector.
print("initializing initW with pre-trained word2vec embeddings")
for w in vocab_processor.vocabulary_._mapping:
    if w not in inpH.pre_emb:
        continue
    arr = inpH.pre_emb[w]
    if len(arr) > 0:
        # Row index of the word inside the vocabulary.
        idx = vocab_processor.vocabulary_.get(w)
        initW[idx] = np.asarray(arr).astype(np.float32)
print("Done assigning intiW. len=" + str(len(initW)))

你可能感兴趣的:(编码,NLP,文本处理)