CNN for Chinese Text Classification in TensorFlow

For a Chinese ad recommendation system, the first step is loading the Chinese dataset. The full source code is on GitHub; anyone interested can study it there.

Code analysis:

Step 1, the entry point:        x, y, vocabulary, vocabulary_inv = load_data()

Step 2:                         sentences, labels = load_data_and_labels()

                                Loads the dataset, concatenates the positive and negative examples, and attaches the corresponding labels.
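
For intuition, a toy run with made-up lines (not the real pos.txt / neg.txt contents) would produce something like this:

  import numpy as np
  positive_examples = ["商品很好", "质量不错"]
  negative_examples = ["太差了"]
  x_text = [list(s) for s in positive_examples + negative_examples]
  # x_text[0] == ['商', '品', '很', '好']  -- each sentence becomes a list of characters
  y = np.concatenate([[[0, 1]] * 2, [[1, 0]] * 1], 0)
  # y.shape == (3, 2)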

 

 

Step 3:                         sentences_padded = pad_sentences(sentences)

                                Pads every sentence to the length of the longest one, so the embedding input has a uniform size.
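
For example, given two toy character lists of different lengths, pad_sentences (defined in the code below) extends the shorter one with the padding word:

  sentences = [['好', '评'], ['非', '常', '好']]
  padded = pad_sentences(sentences)   # padding_word defaults to ""
  # padded == [['好', '评', ''], ['非', '常', '好']]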

 

 

Step 4:                         vocabulary, vocabulary_inv = build_vocab(sentences_padded)

                                A word's index is its frequency rank: the more often a word appears, the smaller its index.
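
A quick toy illustration of that idea (the characters and counts are made up):

  from collections import Counter
  import itertools
  sentences = [['好', '好', '差'], ['好', '评', '差']]
  word_counts = Counter(itertools.chain(*sentences))           # 好: 3, 差: 2, 评: 1
  vocabulary_inv = [w for w, _ in word_counts.most_common()]   # ['好', '差', '评']
  vocabulary = {w: i for i, w in enumerate(vocabulary_inv)}    # {'好': 0, '差': 1, '评': 2}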

 

Step 5:                         build_input_data() converts the data into numpy arrays:

 x = np.array([[vocabulary[word] for word in sentence] for sentence in sentences])
 

 # The comprehension above is equivalent to the nested loops below
 x_tmp = []
 for sentence in sentences:
     row = [vocabulary[word] for word in sentence]
     x_tmp.append(row)
 x_tmp = np.array(x_tmp)
 print("**************x_tmp == x************", type(x_tmp), type(x))
 print("**************x_tmp == x************", (x_tmp == x).all())
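
After build_input_data the whole dataset is two dense numpy arrays: x has shape (num_sentences, sequence_length) and holds character indices, while y has shape (num_sentences, 2) and holds the one-hot labels. A quick check:

  print(x.shape, x.dtype)   # (num_sentences, sequence_length), integer indices
  print(y.shape, y.dtype)   # (num_sentences, 2), one-hot labels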

 

# -*- coding: utf-8 -*-
# @Time    : 2019/2/28 23:56
# @Author  : YYLin
# @Email   : [email protected]
# @File    : chinese-dataload.py
import numpy as np
import re
import itertools
import codecs
from collections import Counter


def clean_str(string):
  """
  Tokenization/string cleaning for all datasets except for SST.
  Original taken from https://github.com/yoonkim/CNN_sentence/blob/master/process_data.py
  """
  string = re.sub(r"[^A-Za-z0-9(),!?\'\`]", " ", string)
  string = re.sub(r"\'s", " \'s", string)
  string = re.sub(r"\'ve", " \'ve", string)
  string = re.sub(r"n\'t", " n\'t", string)
  string = re.sub(r"\'re", " \'re", string)
  string = re.sub(r"\'d", " \'d", string)
  string = re.sub(r"\'ll", " \'ll", string)
  string = re.sub(r",", " , ", string)
  string = re.sub(r"!", " ! ", string)
  string = re.sub(r"\(", " \( ", string)
  string = re.sub(r"\)", " \) ", string)
  string = re.sub(r"\?", " \? ", string)
  string = re.sub(r"\s{2,}", " ", string)
  return string.strip().lower()
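
# Note: clean_str is kept from the English MR-data loader; e.g. clean_str("Hello, world!")
# returns "hello , world !". The Chinese loader below never calls it -- the Chinese text
# is tokenized character by character via list(s) instead.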


def load_data_and_labels():
  """
  Loads the Chinese polarity data from files, splits each sentence into characters and generates labels.
  Returns split sentences and labels.
  """
  # Load data from files
  positive_examples = list(codecs.open("./data/chinese/pos.txt", "r", "utf-8").readlines())
  for i in range(2):
    print("********positive_examples***********", positive_examples[i])
  # s.strip() removes leading/trailing whitespace (including the newline) from each line
  positive_examples = [s.strip() for s in positive_examples]
  print("********positive_examples after strip***********", positive_examples[1])

  negative_examples = list(codecs.open("./data/chinese/neg.txt", "r", "utf-8").readlines())
  negative_examples = [s.strip() for s in negative_examples]

  # Concatenate the positive and negative examples directly
  x_text = positive_examples + negative_examples

  # Convert each sentence into a list of characters (character-level tokens for Chinese)
  print("before:", x_text[0], type(x_text[0]))
  x_text = [list(s) for s in x_text]
  print("after:", x_text[0], type(x_text[0]))

  # Positive examples are labeled [0, 1] and negative examples [1, 0]; one label per sentence
  positive_labels = [[0, 1] for _ in positive_examples]
  negative_labels = [[1, 0] for _ in negative_examples]
  y = np.concatenate([positive_labels, negative_labels], 0)
  # print(y[1])
  return [x_text, y]


def pad_sentences(sentences, padding_word=""):
  """
  Pads all sentences to the same length. The length is defined by the longest sentence.
  Returns padded sentences.
  """
  # The longest sentence in the corpus defines the padded sequence length
  sequence_length = max(len(x) for x in sentences)

  padded_sentences = []
  for i in range(len(sentences)):
    sentence = sentences[i]
    num_padding = sequence_length - len(sentence)
    new_sentence = sentence + [padding_word] * num_padding
    padded_sentences.append(new_sentence)
  return padded_sentences


def build_vocab(sentences):
  """
  Builds a vocabulary mapping from word to index based on the sentences.
  Returns vocabulary mapping and inverse vocabulary mapping.
  """
  # Count how many times each word (character) appears across all sentences
  word_counts = Counter(itertools.chain(*sentences))
  # Mapping from index to word, sorted by descending frequency
  vocabulary_inv = [x[0] for x in word_counts.most_common()]
  print("vocabulary_inv", vocabulary_inv, len(vocabulary_inv))

  # Mapping from word to index (the inverse of the mapping above)
  vocabulary = {x: i for i, x in enumerate(vocabulary_inv)}
  print("vocabulary", vocabulary, len(vocabulary))
  return [vocabulary, vocabulary_inv]


def build_input_data(sentences, labels, vocabulary):
  """
  Maps sentences and labels to vectors based on a vocabulary.
  """
  x = np.array([[vocabulary[word] for word in sentence] for sentence in sentences])
  y = np.array(labels)
  return [x, y]


def load_data():
  """
  Loads and preprocesses the Chinese polarity dataset.
  Returns input vectors, labels, vocabulary, and inverse vocabulary.
  """
  # After load_data_and_labels, each sentence has been split into a list of individual characters
  sentences, labels = load_data_and_labels()

  # Pad the data so every sentence has the same length
  sentences_padded = pad_sentences(sentences)

  vocabulary, vocabulary_inv = build_vocab(sentences_padded)

  # Convert the data to numpy arrays; dataset loading is finished
  x, y = build_input_data(sentences_padded, labels, vocabulary)
  return [x, y, vocabulary, vocabulary_inv]


def batch_iter(data, batch_size, num_epochs):
  """
  Generates a batch iterator for a dataset.
  """
  data = np.array(data)
  data_size = len(data)
  num_batches_per_epoch = int(np.ceil(data_size / batch_size))
  for epoch in range(num_epochs):
    # Shuffle the data at each epoch
    shuffle_indices = np.random.permutation(np.arange(data_size))
    shuffled_data = data[shuffle_indices]
    for batch_num in range(num_batches_per_epoch):
      start_index = batch_num * batch_size
      end_index = min((batch_num + 1) * batch_size, data_size)
      yield shuffled_data[start_index:end_index]


x, y, vocabulary, vocabulary_inv = load_data()
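
batch_iter is defined above but never called in this script. A minimal usage sketch (the batch size and epoch count are arbitrary choices, and iterating over shuffled index batches is just one convenient way to drive it):

  # Slice index batches, then index into x and y with them
  for batch_indices in batch_iter(np.arange(len(y)), batch_size=64, num_epochs=1):
      x_batch, y_batch = x[batch_indices], y[batch_indices]
      # x_batch / y_batch would be fed to the CNN training step here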

 
