Dive into Deep Learning notes: text preprocessing and language models

Text Preprocessing


## Reading the text
import collections
import re

def read_time_machine():
    with open('/home/kesci/input/timemachine7163/timemachine.txt', 'r') as f:
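        # lowercase each line and replace every run of non-letter characters with a single space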
        lines = [re.sub('[^a-z]+', ' ', line.strip().lower()) for line in f]
    return lines

lines = read_time_machine()
print('# sentences %d' % len(lines))

## Tokenization
def tokenize(sentences, token='word'):
    """Split each sentence into word or character tokens."""
    if token == 'word':
        return [sentence.split(' ') for sentence in sentences]
    elif token == 'char':
        return [list(sentence) for sentence in sentences]
    else:
        print('ERROR: unknown token type ' + token)

tokens = tokenize(lines)
tokens[0:2]
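As a quick sanity check (a small example of my own, not from the course code), the same function with token='char' splits a sentence into individual characters:

print(tokenize(['the time machine'], token='char')[0][:8])
# ['t', 'h', 'e', ' ', 't', 'i', 'm', 'e']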

## Building a vocabulary
class Vocab():
    def __init__(self, tokens, min_freq=0, use_special_tokens=False):
        counter = count_corpus(tokens)  # token -> frequency
        self.token_freqs = list(counter.items())
        self.idx_to_token = []
        if use_special_tokens:
            # padding, beginning of sentence, end of sentence, unknown
            self.pad, self.bos, self.eos, self.unk = (0, 1, 2, 3)
            self.idx_to_token += ['<pad>', '<bos>', '<eos>', '<unk>']
        else:
            self.unk = 0
            self.idx_to_token += ['<unk>']
        self.idx_to_token += [token for token, freq in self.token_freqs
                              if freq >= min_freq and token not in self.idx_to_token]
        self.token_to_idx = dict()
        for idx, token in enumerate(self.idx_to_token):
            self.token_to_idx[token] = idx
    def __len__(self):
        return len(self.idx_to_token)
    
    def __getitem__(self, tokens):
        if not isinstance(tokens, (list, tuple)):
            return self.token_to_idx.get(tokens, self.unk)
        return [self.__getitem__(token) for token in tokens]
    
    def to_tokens(self, indices):
        if not isinstance(indices, (list, tuple)):
            return self.idx_to_token[indices]
        return [self.idx_to_token[index] for index in indices]

def count_corpus(sentences):
    # flatten the list of tokenized sentences into one list of tokens and count them
    tokens = [tk for st in sentences for tk in st]
    return collections.Counter(tokens)

vocab = Vocab(tokens)
print(list(vocab.token_to_idx.items())[0:10])

## Converting words to indices
for i in range(8,10):
    print("words:",tokens[i])
    print("indices:",vocab[tokens[i]])

## Example: tokenization with existing tools
The simple split-based tokenizer above handles punctuation and contractions poorly (e.g. "Mr." and "doesn't"), so in practice it is common to use an off-the-shelf tokenizer such as spaCy or NLTK.
text = "Mr. Chen doesn't agree with my suggestion."
import spacy
nlp = spacy.load('en_core_web_sm')
doc = nlp(text)
print([token.text for token in doc])

## NLTK
from nltk.tokenize import word_tokenize
from nltk import data
data.path.append('/home/kesci/input/nltk_data3784/nltk_data')
print(word_tokenize(text))
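Both tokenizers keep "Mr." together as a single token and split the contraction "doesn't" into "does" and "n't", which a plain split on spaces cannot do.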

Language Models

import torch
import random
## Reading the dataset
with open('/home/kesci/input/jaychou_lyrics4703/jaychou_lyrics.txt') as f:
    corpus_chars = f.read()
print(len(corpus_chars))
print(corpus_chars[: 40])
corpus_chars = corpus_chars.replace('\n', ' ').replace('\r', ' ')
corpus_chars = corpus_chars[: 10000]  # keep only the first 10000 characters to speed things up

## Building the character index
idx_to_char = list(set(corpus_chars))
char_to_idx = {char: i for i,char in enumerate(idx_to_char)}
vocab_size = len(char_to_idx)
print(vocab_size)

corpus_indices = [char_to_idx[char] for char in corpus_chars]
sample = corpus_indices[:20]
print('chars:', ''.join([idx_to_char[idx] for idx in sample]))
print('indices:', sample)

def load_data_jay_lyrics():
    with open('/home/kesci/input/jaychou_lyrics4703/jaychou_lyrics.txt') as f:
        corpus_chars = f.read()
    corpus_chars = corpus_chars.replace('\n', ' ').replace('\r', ' ')
    corpus_chars = corpus_chars[0:10000]
    idx_to_char = list(set(corpus_chars))
    char_to_idx = dict([(char, i) for i, char in enumerate(idx_to_char)])
    vocab_size = len(char_to_idx)
    corpus_indices = [char_to_idx[char] for char in corpus_chars]
    return corpus_indices, char_to_idx, idx_to_char, vocab_size
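A one-line usage sketch of the helper above (it simply packages the preprocessing steps already shown):

corpus_indices, char_to_idx, idx_to_char, vocab_size = load_data_jay_lyrics()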

## Random sampling
def data_iter_random(corpus_indices, batch_size, num_steps, device=None):
    # subtract 1 because the label Y is X shifted one character to the right
    num_examples = (len(corpus_indices) - 1) // num_steps
    example_indices = [i * num_steps for i in range(num_examples)]  # index in corpus_indices of the first character of each sample
    random.shuffle(example_indices)

    def _data(i):
        return corpus_indices[i:i + num_steps]

    if device is None:
        device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    for i in range(0, num_examples, batch_size):
        batch_indices = example_indices[i:i + batch_size]
        X = [_data(j) for j in batch_indices]
        Y = [_data(j + 1) for j in batch_indices]
        yield torch.tensor(X, device=device), torch.tensor(Y, device=device)
my_seq = list(range(30))
for X, Y in data_iter_random(my_seq, batch_size=2, num_steps=6):
    print('X:', X, '\nY:', Y, '\n')
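Each minibatch is a pair of (batch_size, num_steps) tensors, with Y equal to X shifted one position to the right; a quick check (my own sketch):

X, Y = next(data_iter_random(my_seq, batch_size=2, num_steps=6))
print(X.shape, Y.shape)  # torch.Size([2, 6]) torch.Size([2, 6])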

## Consecutive sampling

def data_iter_consecutive(corpus_indices, batch_size, num_steps, device=None):
    if device is None:
        device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    # drop the tail so the corpus length is a multiple of batch_size
    corpus_len = len(corpus_indices) // batch_size * batch_size
    corpus_indices = corpus_indices[:corpus_len]
    indices = torch.tensor(corpus_indices, device=device)
    indices = indices.view(batch_size, -1)  # one contiguous stream of characters per row
    batch_num = (indices.shape[1] - 1) // num_steps
    for i in range(batch_num):
        i = i * num_steps
        X = indices[:, i:i + num_steps]
        Y = indices[:, i + 1:i + num_steps + 1]
        yield X, Y

for X, Y in data_iter_consecutive(my_seq, batch_size=2, num_steps=6):
    print('X:', X, '\nY:', Y, '\n')
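Note the practical difference between the two schemes: with random sampling, adjacent minibatches come from arbitrary positions in the corpus, so an RNN's hidden state has to be re-initialized for every batch, whereas with consecutive sampling, batch t+1 continues exactly where batch t ended, so the hidden state can be carried over between batches (detached from the computation graph to limit backpropagation).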
