For a Chinese ad recommendation system, the first step is loading the Chinese dataset. The source code is available on GitHub; anyone interested can study it there.
Code analysis:
Step 1, the entry point: x, y, vocabulary, vocabulary_inv = load_data()
Step 2: sentences, labels = load_data_and_labels()
This loads the dataset, concatenates the positive and negative examples, and attaches the corresponding labels.
Step 3: sentences_padded = pad_sentences(sentences)
This pads every sentence to the same length, so that the inputs to the embedding all have a uniform shape.
Step 4: vocabulary, vocabulary_inv = build_vocab(sentences_padded)
Each word is indexed by its frequency rank: the most frequent word gets index 0, the next index 1, and so on. Note the index is the rank, not the raw count (a small sketch follows the snippet below).
Step 5: build_input_data() converts the data into numpy arrays.
x = np.array([[vocabulary[word] for word in sentence] for sentence in sentences])
# The nested comprehension above is equivalent to:
x_tmp = []
for sentence in sentences:
    row = []
    for word in sentence:
        row.append(vocabulary[word])
    x_tmp.append(row)
x_tmp = np.array(x_tmp)
print("**************x_tmp == x************", type(x_tmp), type(x))
print("**************x_tmp == x************", np.array_equal(x_tmp, x))
# -*- coding: utf-8 -*-
# @Time : 2019/2/28 23:56
# @Author : YYLin
# @Email : [email protected]
# @File : chinese-dataload.py
import numpy as np
import re
import itertools
import codecs
from collections import Counter
def clean_str(string):
    """
    Tokenization/string cleaning for all datasets except for SST.
    Original taken from https://github.com/yoonkim/CNN_sentence/blob/master/process_data.py
    """
    string = re.sub(r"[^A-Za-z0-9(),!?\'\`]", " ", string)
    string = re.sub(r"\'s", " \'s", string)
    string = re.sub(r"\'ve", " \'ve", string)
    string = re.sub(r"n\'t", " n\'t", string)
    string = re.sub(r"\'re", " \'re", string)
    string = re.sub(r"\'d", " \'d", string)
    string = re.sub(r"\'ll", " \'ll", string)
    # Pad punctuation with spaces so it becomes separate tokens
    string = re.sub(r",", " , ", string)
    string = re.sub(r"!", " ! ", string)
    string = re.sub(r"\(", " ( ", string)
    string = re.sub(r"\)", " ) ", string)
    string = re.sub(r"\?", " ? ", string)
    string = re.sub(r"\s{2,}", " ", string)
    return string.strip().lower()
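# Note: clean_str() is inherited from the original English pipeline but is never
# called in the Chinese loading path below; its regex [^A-Za-z0-9(),!?\'\`]
# would delete every Chinese character. A quick check (my own example):
# >>> clean_str("广告 Ad 2019!")
# 'ad 2019 !'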
def load_data_and_labels():
    """
    Loads polarity data from files, splits the data into characters and generates labels.
    Returns split sentences and labels.
    """
    # Load data from files
    positive_examples = list(codecs.open("./data/chinese/pos.txt", "r", "utf-8").readlines())
    for i in range(2):
        print("********positive_examples***********", positive_examples[i])
    # s.strip() removes leading/trailing whitespace, including the newline
    positive_examples = [s.strip() for s in positive_examples]
    for i in range(2):
        print("********positive_examples***********", positive_examples[i])
    negative_examples = list(codecs.open("./data/chinese/neg.txt", "r", "utf-8").readlines())
    negative_examples = [s.strip() for s in negative_examples]
    # Concatenate the two datasets directly
    x_text = positive_examples + negative_examples
    # Turn every example into a list of single characters
    print("before:", x_text[0], type(x_text[0]))
    x_text = [list(s) for s in x_text]
    print("after:", x_text[0], type(x_text[0]))
    # Positive examples are labeled [0, 1], negative ones [1, 0]; one label per example
    positive_labels = [[0, 1] for _ in positive_examples]
    negative_labels = [[1, 0] for _ in negative_examples]
    y = np.concatenate([positive_labels, negative_labels], 0)
    return [x_text, y]
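# Why list(s) is enough here: it splits a string into single characters, which
# acts as character-level tokenization for Chinese (no word segmenter needed).
# >>> list("天气不错")
# ['天', '气', '不', '错']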
def pad_sentences(sentences, padding_word=" "):
    """
    Pads all sentences to the same length. The length is defined by the longest sentence.
    Returns padded sentences.
    """
    # Use the longest sentence as the target length for the embedding input
    sequence_length = max(len(x) for x in sentences)
    padded_sentences = []
    for i in range(len(sentences)):
        sentence = sentences[i]
        num_padding = sequence_length - len(sentence)
        new_sentence = sentence + [padding_word] * num_padding
        padded_sentences.append(new_sentence)
    return padded_sentences
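# Toy check (my own example): every sentence is right-padded with ' ' up to the
# longest length.
# >>> pad_sentences([list("好"), list("不好")])
# [['好', ' '], ['不', '好']]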
def build_vocab(sentences):
    """
    Builds a vocabulary mapping from word to index based on the sentences.
    Returns vocabulary mapping and inverse vocabulary mapping.
    """
    # Count how often every token appears across all sentences
    word_counts = Counter(itertools.chain(*sentences))
    # Mapping from index to word, ordered from most to least frequent
    vocabulary_inv = [x[0] for x in word_counts.most_common()]
    print("vocabulary_inv", vocabulary_inv, len(vocabulary_inv))
    # Mapping from word to index: each token's index is its frequency rank
    vocabulary = {x: i for i, x in enumerate(vocabulary_inv)}
    print("vocabulary", vocabulary, len(vocabulary))
    return [vocabulary, vocabulary_inv]
def build_input_data(sentences, labels, vocabulary):
    """
    Maps sentences and labels to vectors based on a vocabulary.
    """
    x = np.array([[vocabulary[word] for word in sentence] for sentence in sentences])
    y = np.array(labels)
    return [x, y]
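# Because all sentences were padded to the same length, x is a true 2-D array
# of shape (num_sentences, sequence_length) and y has shape (num_sentences, 2).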
def load_data():
    """
    Loads and preprocesses the dataset.
    Returns input vectors, labels, vocabulary, and inverse vocabulary.
    """
    # After load_data_and_labels() every example is a list of single characters
    sentences, labels = load_data_and_labels()
    # Pad the data so every example has the same length
    sentences_padded = pad_sentences(sentences)
    vocabulary, vocabulary_inv = build_vocab(sentences_padded)
    # Convert to numpy arrays, finishing the dataset loading
    x, y = build_input_data(sentences_padded, labels, vocabulary)
    return [x, y, vocabulary, vocabulary_inv]
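# vocabulary and vocabulary_inv are inverse mappings:
# vocabulary[vocabulary_inv[i]] == i holds for every valid index i.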
def batch_iter(data, batch_size, num_epochs):
    """
    Generates a batch iterator for a dataset.
    """
    # dtype=object lets data hold (x, y) pairs whose elements differ in length
    data = np.array(data, dtype=object)
    data_size = len(data)
    # Ceiling division, so there is no empty trailing batch when
    # data_size is an exact multiple of batch_size
    num_batches_per_epoch = (data_size + batch_size - 1) // batch_size
    for epoch in range(num_epochs):
        # Shuffle the data at each epoch
        shuffle_indices = np.random.permutation(np.arange(data_size))
        shuffled_data = data[shuffle_indices]
        for batch_num in range(num_batches_per_epoch):
            start_index = batch_num * batch_size
            end_index = min((batch_num + 1) * batch_size, data_size)
            yield shuffled_data[start_index:end_index]
if __name__ == "__main__":
    x, y, vocabulary, vocabulary_inv = load_data()
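How the iterator is typically consumed (a sketch of my own, not part of the original post): zip the inputs and labels so each shuffled batch keeps them aligned, then unzip inside the loop.

# Mini-batch usage sketch; assumes ./data/chinese/pos.txt and neg.txt exist.
x, y, vocabulary, vocabulary_inv = load_data()
batches = batch_iter(list(zip(x, y)), batch_size=64, num_epochs=1)
for batch in batches:
    x_batch, y_batch = zip(*batch)
    print(np.array(x_batch).shape, np.array(y_batch).shape)
    break  # inspect only the first batch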