The data used in this experiment are book reviews from Dangdang (dangdang.com); the expected file layout is sketched below:
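Positive and negative reviews live in two plain-text files, one review per line. The lines below are hypothetical illustrations of that layout, not taken from the actual corpus:

# ./data/chinese/pos.txt -- positive reviews, one per line (hypothetical)
这本书内容充实,值得一读
# ./data/chinese/neg.txt -- negative reviews, one per line (hypothetical)
印刷质量太差,不推荐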
(1) Load data and labels
import codecs

import numpy as np


def load_data_and_labels():
    """
    Loads polarity data from files, splits the data into words and generates labels.
    Returns split sentences and labels.
    """
    # Load data from files
    positive_examples = list(codecs.open("./data/chinese/pos.txt", "r", "utf-8").readlines())
    positive_examples = [s.strip() for s in positive_examples]
    negative_examples = list(codecs.open("./data/chinese/neg.txt", "r", "utf-8").readlines())
    negative_examples = [s.strip() for s in negative_examples]
    # Split each review into characters (character-level tokenization)
    x_text = positive_examples + negative_examples
    # x_text = [clean_str(sent) for sent in x_text]
    x_text = [list(s) for s in x_text]
    # Generate one-hot labels: positive -> [0, 1], negative -> [1, 0]
    positive_labels = [[0, 1] for _ in positive_examples]
    negative_labels = [[1, 0] for _ in negative_examples]
    y = np.concatenate([positive_labels, negative_labels], 0)
    return [x_text, y]
This function loads the positive and negative examples from file and concatenates them, splitting each review into individual tokens, so x_text is a two-dimensional list holding every word of every review. The corresponding labels are concatenated in the same order; since each label drives the two neurons of the binary output layer, it is one-hot encoded as [0, 1] or [1, 0], and the resulting y is returned alongside x_text.
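As a quick illustration of the character-level split and the one-hot labels, here is a minimal sketch on two hypothetical reviews (not from the real corpus; expected values shown in the comments):

import numpy as np

positive_examples = ["内容很好"]  # hypothetical positive review
negative_examples = ["太差了"]    # hypothetical negative review
x_text = [list(s) for s in positive_examples + negative_examples]
print(x_text)  # [['内', '容', '很', '好'], ['太', '差', '了']]
y = np.concatenate([[[0, 1] for _ in positive_examples],
                    [[1, 0] for _ in negative_examples]], 0)
print(y)       # [[0 1]
               #  [1 0]]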
def pad_sentences(sentences, padding_word=" "):
    """
    Pads all sentences to the same length. The length is defined by the longest sentence.
    Returns padded sentences.
    """
    sequence_length = max(len(x) for x in sentences)
    padded_sentences = []
    for i in range(len(sentences)):
        sentence = sentences[i]
        num_padding = sequence_length - len(sentence)
        new_sentence = sentence + [padding_word] * num_padding
        padded_sentences.append(new_sentence)
    return padded_sentences
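For example, padding two character lists of unequal length (a minimal sketch; note that the padding space later becomes an ordinary entry in the vocabulary):

sentences = [["好", "书"], ["不", "推", "荐"]]
print(pad_sentences(sentences))
# [['好', '书', ' '], ['不', '推', '荐']]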
import itertools
from collections import Counter


def build_vocab(sentences):
    """
    Builds a vocabulary mapping from word to index based on the sentences.
    Returns vocabulary mapping and inverse vocabulary mapping.
    """
    # Build vocabulary
    word_counts = Counter(itertools.chain(*sentences))
    # Mapping from index to word, most frequent first
    vocabulary_inv = [x[0] for x in word_counts.most_common()]
    # Mapping from word to index
    vocabulary = {x: i for i, x in enumerate(vocabulary_inv)}
    return [vocabulary, vocabulary_inv]
To make this concrete, here is a small standalone example (expected output shown in the comments, assuming Python 3.7+, where Counter preserves insertion order for ties). Note that this snippet additionally sorts vocabulary_inv alphabetically, whereas build_vocab above keeps pure frequency order:
from collections import Counter

sentence = ["i", "love", "mom", "mom", "mom", "me", "loves", "me"]
word_counts = Counter(sentence)
print(word_counts)
# Counter({'mom': 3, 'me': 2, 'i': 1, 'love': 1, 'loves': 1})
print(word_counts.most_common())
# [('mom', 3), ('me', 2), ('i', 1), ('love', 1), ('loves', 1)]
vocabulary_inv = [x[0] for x in word_counts.most_common()]
print(vocabulary_inv)
# ['mom', 'me', 'i', 'love', 'loves']
vocabulary_inv = list(sorted(vocabulary_inv))
print(vocabulary_inv)
# ['i', 'love', 'loves', 'me', 'mom']
vocabulary = {x: i for i, x in enumerate(vocabulary_inv)}
print(vocabulary)
# {'i': 0, 'love': 1, 'loves': 2, 'me': 3, 'mom': 4}
print([vocabulary, vocabulary_inv])  # the pair returned by build_vocab
def build_input_data(sentences, labels, vocabulary):
    """
    Maps sentences and labels to vectors based on a vocabulary.
    """
    # x is the index matrix: look up vocabulary[word] to get each word's index
    x = np.array([[vocabulary[word] for word in sentence] for sentence in sentences])
    y = np.array(labels)
    return [x, y]
The functions above give us the two-dimensional list of all tokenized sentences, the labels corresponding to those sentences, and the vocabulary dictionary used to look up each word's index; build_input_data combines them into the final numeric arrays.
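Continuing the English toy example above (with vocabulary = {'i': 0, 'love': 1, 'loves': 2, 'me': 3, 'mom': 4}), a minimal sketch of the lookup that build_input_data performs:

sentences = [["mom", "loves", "me"], ["i", "love", "mom"]]
labels = [[0, 1], [1, 0]]
x, y = build_input_data(sentences, labels, vocabulary)
print(x)  # [[4 2 3]
          #  [0 1 4]]
print(y)  # [[0 1]
          #  [1 0]]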
def load_data():
    """
    Loads and preprocesses data for the dataset.
    Returns input vectors, labels, vocabulary, and inverse vocabulary.
    """
    # Load and preprocess data
    sentences, labels = load_data_and_labels()
    sentences_padded = pad_sentences(sentences)
    vocabulary, vocabulary_inv = build_vocab(sentences_padded)
    x, y = build_input_data(sentences_padded, labels, vocabulary)
    return [x, y, vocabulary, vocabulary_inv]
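A typical call then looks like this (the shapes in the comments are schematic, since they depend on the corpus):

x, y, vocabulary, vocabulary_inv = load_data()
print(x.shape)          # (num_reviews, max_review_length)
print(y.shape)          # (num_reviews, 2)
print(len(vocabulary))  # number of distinct characters, padding space included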
def batch_iter(data, batch_size, num_epochs):
    """
    Generates a batch iterator for a dataset.
    """
    data = np.array(data)
    data_size = len(data)
    # Ceiling division: keeps the last (possibly smaller) batch and avoids
    # yielding an empty batch when data_size divides batch_size evenly
    num_batches_per_epoch = int((data_size - 1) / batch_size) + 1
    for epoch in range(num_epochs):
        # Shuffle the data at each epoch
        shuffle_indices = np.random.permutation(np.arange(data_size))
        shuffled_data = data[shuffle_indices]
        for batch_num in range(num_batches_per_epoch):
            start_index = batch_num * batch_size
            end_index = min((batch_num + 1) * batch_size, data_size)
            yield shuffled_data[start_index:end_index]
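Finally, a minimal sketch of how the iterator is usually consumed during training: x and y are zipped so that each shuffled batch keeps inputs and labels aligned. (On recent NumPy versions the np.array(data) call inside batch_iter may need dtype=object, because the zipped pairs are ragged.)

batches = batch_iter(list(zip(x, y)), batch_size=64, num_epochs=10)
for batch in batches:
    x_batch, y_batch = zip(*batch)  # unzip back into inputs and labels
    # run one training step on x_batch / y_batch here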