Text Preprocessing
import collections
import re

def read_time_machine():
    # Read the book, lowercase each line, and replace every run of
    # non-letter characters with a single space.
    with open('/home/kesci/input/timemachine7163/timemachine.txt', 'r') as f:
        lines = [re.sub('[^a-z]+', ' ', line.strip().lower()) for line in f]
    return lines

lines = read_time_machine()
print('# sentences %d' % len(lines))
def tokenize(sentences, token='word'):
    # Split each sentence into word tokens or character tokens.
    if token == 'word':
        return [sentence.split(' ') for sentence in sentences]
    elif token == 'char':
        return [list(sentence) for sentence in sentences]
    else:
        print('ERROR: unknown token type ' + token)

tokens = tokenize(lines)
tokens[0:2]
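As a minimal illustration of the 'char' branch (an illustrative call, not in the original), each sentence becomes a list of single characters, spaces included:

tokenize(['the time machine'], token='char')
# -> [['t', 'h', 'e', ' ', 't', 'i', 'm', 'e', ' ', 'm', 'a', 'c', 'h', 'i', 'n', 'e']]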
class Vocab:
    def __init__(self, tokens, min_freq=0, use_special_tokens=False):
        # Count token frequencies over the whole corpus.
        counter = count_corpus(tokens)
        self.token_freqs = list(counter.items())
        self.idx_to_token = []
        if use_special_tokens:
            # padding, begin-of-sequence, end-of-sequence, unknown
            self.pad, self.bos, self.eos, self.unk = (0, 1, 2, 3)
            self.idx_to_token += ['<pad>', '<bos>', '<eos>', '<unk>']
        else:
            self.unk = 0
            self.idx_to_token += ['<unk>']
        self.idx_to_token += [token for token, freq in self.token_freqs
                              if freq >= min_freq and token not in self.idx_to_token]
        self.token_to_idx = dict()
        for idx, token in enumerate(self.idx_to_token):
            self.token_to_idx[token] = idx

    def __len__(self):
        return len(self.idx_to_token)

    def __getitem__(self, tokens):
        # Map a token (or a list/tuple of tokens) to indices;
        # out-of-vocabulary tokens map to unk.
        if not isinstance(tokens, (list, tuple)):
            return self.token_to_idx.get(tokens, self.unk)
        return [self.__getitem__(token) for token in tokens]

    def to_tokens(self, indices):
        if not isinstance(indices, (list, tuple)):
            return self.idx_to_token[indices]
        return [self.idx_to_token[index] for index in indices]

def count_corpus(sentences):
    # Flatten the list of token lists and count occurrences of each token.
    tokens = [tk for st in sentences for tk in st]
    return collections.Counter(tokens)

vocab = Vocab(tokens)
print(list(vocab.token_to_idx.items())[0:10])
for i in range(8, 10):
    print('words:', tokens[i])
    print('indices:', vocab[tokens[i]])
text = "Mr. Chen doesn't agree with my suggestion."
import spacy
nlp = spacy.load('en_core_web_sm')
doc = nlp(text)
print([token.text for token in doc])
from nltk.tokenize import word_tokenize
from nltk import data
data.path.append('/home/kesci/input/nltk_data3784/nltk_data')
print(word_tokenize(text))
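For comparison, a naive split on spaces leaves punctuation and the contraction attached, which is exactly what these tokenizers handle; spaCy and NLTK typically keep 'Mr.' intact while splitting off "n't" and the final period.

print(text.split(' '))
# -> ['Mr.', 'Chen', "doesn't", 'agree', 'with', 'my', 'suggestion.']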
Language Model
import torch
import random

with open('/home/kesci/input/jaychou_lyrics4703/jaychou_lyrics.txt') as f:
    corpus_chars = f.read()
print(len(corpus_chars))
print(corpus_chars[:40])

# Replace line breaks with spaces and keep only the first 10000 characters.
corpus_chars = corpus_chars.replace('\n', ' ').replace('\r', ' ')
corpus_chars = corpus_chars[:10000]

# Build the character-level vocabulary: index -> char and char -> index.
idx_to_char = list(set(corpus_chars))
char_to_idx = {char: i for i, char in enumerate(idx_to_char)}
vocab_size = len(char_to_idx)
print(vocab_size)

# Convert every character in the corpus to its index.
corpus_indices = [char_to_idx[char] for char in corpus_chars]
sample = corpus_indices[:20]
print('chars:', ''.join([idx_to_char[idx] for idx in sample]))
print('indices:', sample)
def load_data_jay_lyrics():
    # Bundle the preprocessing above into one reusable loader.
    with open('/home/kesci/input/jaychou_lyrics4703/jaychou_lyrics.txt') as f:
        corpus_chars = f.read()
    corpus_chars = corpus_chars.replace('\n', ' ').replace('\r', ' ')
    corpus_chars = corpus_chars[0:10000]
    idx_to_char = list(set(corpus_chars))
    char_to_idx = dict([(char, i) for i, char in enumerate(idx_to_char)])
    vocab_size = len(char_to_idx)
    corpus_indices = [char_to_idx[char] for char in corpus_chars]
    return corpus_indices, char_to_idx, idx_to_char, vocab_size
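A typical call to the loader (same preprocessing as the inline code above):

corpus_indices, char_to_idx, idx_to_char, vocab_size = load_data_jay_lyrics()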
def data_iter_random(corpus_indices, batch_size, num_steps, device=None):
    # Number of non-overlapping length-num_steps examples; subtract 1 because
    # the label sequence Y is X shifted one step to the right.
    num_examples = (len(corpus_indices) - 1) // num_steps
    example_indices = [i * num_steps for i in range(num_examples)]
    random.shuffle(example_indices)

    def _data(i):
        return corpus_indices[i: i + num_steps]

    if device is None:
        device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    for i in range(0, num_examples, batch_size):
        batch_indices = example_indices[i: i + batch_size]
        X = [_data(j) for j in batch_indices]
        Y = [_data(j + 1) for j in batch_indices]
        yield torch.tensor(X, device=device), torch.tensor(Y, device=device)
my_seq = list(range(30))
for X, Y in data_iter_random(my_seq, batch_size=2, num_steps=6):
    print('X:', X, '\nY:', Y, '\n')
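Under random sampling, each example is a length-num_steps slice starting at a random position, and Y is X shifted one step to the right. Adjacent minibatches are generally not adjacent in the corpus, so an RNN's hidden state cannot be carried over between batches and has to be reinitialized for every batch.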
def data_iter_consecutive(corpus_indices, batch_size, num_steps, device=None):
    if device is None:
        device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    # Trim the corpus so it divides evenly into batch_size rows.
    corpus_len = len(corpus_indices) // batch_size * batch_size
    corpus_indices = corpus_indices[:corpus_len]
    indices = torch.tensor(corpus_indices, device=device)
    indices = indices.view(batch_size, -1)
    batch_num = (indices.shape[1] - 1) // num_steps
    for i in range(batch_num):
        i = i * num_steps
        X = indices[:, i: i + num_steps]
        Y = indices[:, i + 1: i + num_steps + 1]
        yield X, Y
for X, Y in data_iter_consecutive(my_seq, batch_size=2, num_steps=6):
    print('X: ', X, '\nY:', Y, '\n')
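Under consecutive sampling, the example at row i of one minibatch continues at row i of the next, so the final hidden state of a batch can initialize the next one; during training it is usually detached from the computation graph so that backpropagation stays within a single batch.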