Data preprocessing:
import pandas as pd
import numpy as np
import tensorflow as tf
import re
from nltk.corpus import stopwords
import time
from tensorflow.python.layers.core import Dense
from tensorflow.python.ops.rnn_cell_impl import _zero_state_tensors
print('TensorFlow Version: {}'.format(tf.__version__))
import nltk
nltk.download('stopwords')
Output:
TensorFlow Version: 1.8.0
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Administrator\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
reviews = pd.read_csv("Reviews.csv")
# Use only the first 10,000 reviews
reviews = reviews.iloc[0:10000, :]
# Remove null values and unneeded features
reviews = reviews.dropna()
reviews = reviews.drop(['Id', 'ProductId', 'UserId', 'ProfileName',
                        'HelpfulnessNumerator', 'HelpfulnessDenominator',
                        'Score', 'Time'], axis=1)
reviews = reviews.reset_index(drop=True)
reviews.head()
Output: the first five rows of the two remaining columns, Summary and Text.
# Inspecting some of the reviews
for i in range(2):
    print("Review #", i + 1)
    print(reviews.Summary[i])
    print(reviews.Text[i])
    print()
Output:
Review # 1
Good Quality Dog Food
I have bought several of the Vitality canned dog food products and have found them all to be of good quality. The product looks more like a stew than a processed meat and it smells better. My Labrador is finicky and she appreciates this product better than most.

Review # 2
Not as Advertised
Product arrived labeled as Jumbo Salted Peanuts...the peanuts were actually small sized unsalted. Not sure if this was an error or if the vendor intended to represent the product as "Jumbo".
# Dictionary mapping English contractions to their expanded forms
contractions = {
    "ain't": "am not",
    "aren't": "are not",
    "can't": "cannot",
    "can't've": "cannot have",
    "'cause": "because",
    "could've": "could have",
    "couldn't": "could not",
    "couldn't've": "could not have",
    "didn't": "did not",
    "doesn't": "does not",
    "don't": "do not",
    "hadn't": "had not",
    "hadn't've": "had not have",
    "hasn't": "has not",
    "haven't": "have not",
    "he'd": "he would",
    "he'd've": "he would have",
    "he'll": "he will",
    "he's": "he is",
    "how'd": "how did",
    "how'll": "how will",
    "how's": "how is",
    "i'd": "i would",
    "i'll": "i will",
    "i'm": "i am",
    "i've": "i have",
    "isn't": "is not",
    "it'd": "it would",
    "it'll": "it will",
    "it's": "it is",
    "let's": "let us",
    "ma'am": "madam",
    "mayn't": "may not",
    "might've": "might have",
    "mightn't": "might not",
    "must've": "must have",
    "mustn't": "must not",
    "needn't": "need not",
    "oughtn't": "ought not",
    "shan't": "shall not",
    "sha'n't": "shall not",
    "she'd": "she would",
    "she'll": "she will",
    "she's": "she is",
    "should've": "should have",
    "shouldn't": "should not",
    "that'd": "that would",
    "that's": "that is",
    "there'd": "there had",
    "there's": "there is",
    "they'd": "they would",
    "they'll": "they will",
    "they're": "they are",
    "they've": "they have",
    "wasn't": "was not",
    "we'd": "we would",
    "we'll": "we will",
    "we're": "we are",
    "we've": "we have",
    "weren't": "were not",
    "what'll": "what will",
    "what're": "what are",
    "what's": "what is",
    "what've": "what have",
    "where'd": "where did",
    "where's": "where is",
    "who'll": "who will",
    "who's": "who is",
    "won't": "will not",
    "wouldn't": "would not",
    "you'd": "you would",
    "you'll": "you will",
    "you're": "you are"
}
def clean_text(text, remove_stopwords=True):
    '''Remove unwanted characters and stopwords, and format the text to leave fewer words without embeddings'''
    # Convert words to lower case
    text = text.lower()

    # Replace contractions with their longer forms
    text = text.split()
    new_text = []
    for word in text:
        if word in contractions:
            new_text.append(contractions[word])
        else:
            new_text.append(word)
    text = " ".join(new_text)

    # Format words and remove unwanted characters (URLs, HTML remnants, punctuation)
    text = re.sub(r'https?:\/\/.*[\r\n]*', '', text, flags=re.MULTILINE)
    text = re.sub(r'\<a href', ' ', text)
    text = re.sub(r'&amp;', '', text)
    text = re.sub(r'[_"\-;%()|+&=*%.,!?:#$@\[\]/]', ' ', text)
    text = re.sub(r'<br />', ' ', text)
    text = re.sub(r'\'', ' ', text)

    # Optionally, remove stop words
    if remove_stopwords:
        text = text.split()
        stops = set(stopwords.words("english"))
        text = [w for w in text if w not in stops]
        text = " ".join(text)

    return text
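To make the cleaning behavior concrete, here is a quick sanity check on a made-up review (the sample string and the shown result are illustrative, not from the original post):

sample = "I'd buy this again! See https://example.com"
print(clean_text(sample, remove_stopwords=False))
# -> roughly "i would buy this again  see": the contraction is expanded,
#    and the URL and punctuation are stripped

With remove_stopwords=True (the default, used below for the review texts but not the summaries), common words such as "i" and "this" are dropped as well.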
# Clean the summaries and texts
clean_summaries = []
for summary in reviews.Summary:
    clean_summaries.append(clean_text(summary, remove_stopwords=False))
print("Summaries are complete.")

clean_texts = []
for text in reviews.Text:
    clean_texts.append(clean_text(text))
print("Texts are complete.")
# Inspect the cleaned summaries and texts to ensure they have been cleaned well
for i in range(2):
    print("Clean Review #", i + 1)
    print(clean_summaries[i])
    print(clean_texts[i])
    print()
Output:
Clean Review # 1
good quality dog food
bought several vitality canned dog food products found good quality product looks like stew processed meat smells better labrador finicky appreciates product better

Clean Review # 2
not as advertised
product arrived labeled jumbo salted peanuts peanuts actually small sized unsalted sure error vendor intended represent product jumbo
def count_words(count_dict, text):
    '''Count the number of times each word appears in a set of texts'''
    for sentence in text:
        for word in sentence.split():
            if word not in count_dict:
                count_dict[word] = 1
            else:
                count_dict[word] += 1
# Find the number of times each word was used and the size of the vocabulary
word_counts = {}
count_words(word_counts, clean_summaries)
count_words(word_counts, clean_texts)
print("Size of Vocabulary:", len(word_counts))
# Load Conceptnet Numberbatch's (CN) embeddings, similar to GloVe, but probably better
# (https://github.com/commonsense/conceptnet-numberbatch)
embeddings_index = {}
with open('numberbatch-en-17.04b.txt', encoding='utf-8') as f:
    for line in f:
        values = line.split(' ')
        word = values[0]
        embedding = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = embedding

print('Word embeddings:', len(embeddings_index))  # total number of word vectors loaded
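As a quick sanity check (assuming a common word such as 'dog' is present in the vectors file), every Numberbatch vector should be 300-dimensional, which is why embedding_dim is set to 300 further below:

print(embeddings_index['dog'].shape)  # expected: (300,)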
# Find the number of words that are missing from CN, and are used more than our threshold.
missing_words = 0
threshold = 20

for word, count in word_counts.items():
    if count > threshold:
        if word not in embeddings_index:
            missing_words += 1

missing_ratio = round(missing_words / len(word_counts), 4) * 100

print("Number of words missing from CN:", missing_words)
print("Percent of words that are missing from vocabulary: {}%".format(missing_ratio))
# Need to use 300 for embedding dimensions to match CN's vectors.
embedding_dim = 300
nb_words = len(vocab_to_int)

# Create matrix with default values of zero
word_embedding_matrix = np.zeros((nb_words, embedding_dim), dtype=np.float32)
for word, i in vocab_to_int.items():
    if word in embeddings_index:
        word_embedding_matrix[i] = embeddings_index[word]
    else:
        # If word not in CN, create a random embedding for it
        new_embedding = np.array(np.random.uniform(-1.0, 1.0, embedding_dim))
        embeddings_index[word] = new_embedding
        word_embedding_matrix[i] = new_embedding

# Check if value matches len(vocab_to_int)
print(len(word_embedding_matrix))
def convert_to_ints(text, word_count, unk_count, eos=False):
    '''Convert words in text to an integer.
       If word is not in vocab_to_int, use <UNK>'s integer.
       Total the number of words and UNKs.
       Add <EOS> token to the end of texts'''
    ints = []
    for sentence in text:
        sentence_ints = []
        for word in sentence.split():
            word_count += 1
            if word in vocab_to_int:
                sentence_ints.append(vocab_to_int[word])
            else:
                sentence_ints.append(vocab_to_int["<UNK>"])
                unk_count += 1
        if eos:
            # Mark the end of each text with the <EOS> token
            sentence_ints.append(vocab_to_int["<EOS>"])
        ints.append(sentence_ints)
    return ints, word_count, unk_count
# Apply convert_to_ints to clean_summaries and clean_texts
word_count = 0
unk_count = 0
int_summaries, word_count, unk_count = convert_to_ints(clean_summaries, word_count, unk_count)
int_texts, word_count, unk_count = convert_to_ints(clean_texts, word_count, unk_count, eos=True)
unk_percent = round(unk_count / word_count, 4) * 100

print("Total number of words in reviews:", word_count)
print("Total number of UNKs in reviews:", unk_count)
print("Percent of words that are UNK: {}%".format(unk_percent))
def create_lengths(text):
    '''Create a data frame of the sentence lengths from a text'''
    lengths = []
    for sentence in text:
        # Record each sentence's length so the maximum length is known when training the RNN
        lengths.append(len(sentence))
    return pd.DataFrame(lengths, columns=['counts'])
lengths_summaries = create_lengths(int_summaries)
lengths_texts = create_lengths(int_texts)
print("Summaries:")
print(lengths_summaries.describe())
print()
print("Texts:")
print(lengths_texts.describe())
Output: the describe() summary statistics (count, mean, std, min, quartiles, max) of the summary and text lengths.
def unk_counter(sentence):
    '''Counts the number of times <UNK> appears in a sentence.'''
    unk_count = 0
    for word in sentence:
        if word == vocab_to_int["<UNK>"]:
            unk_count += 1
    return unk_count
# Sort the summaries and texts by the length of the texts, shortest to longest
# Limit the length of summaries and texts based on the min and max ranges.
# Remove reviews that include too many UNKs
sorted_summaries = []  # these loops do the sorting and filtering
sorted_texts = []
max_text_length = 84
max_summary_length = 13
min_length = 2
unk_text_limit = 1
unk_summary_limit = 0

for length in range(min(lengths_texts.counts), max_text_length):
    for count, words in enumerate(int_summaries):
        if (len(int_summaries[count]) >= min_length and
                len(int_summaries[count]) <= max_summary_length and
                len(int_texts[count]) >= min_length and
                unk_counter(int_summaries[count]) <= unk_summary_limit and
                unk_counter(int_texts[count]) <= unk_text_limit and
                length == len(int_texts[count])):
            sorted_summaries.append(int_summaries[count])
            sorted_texts.append(int_texts[count])
# Compare lengths to ensure they match
print(len(sorted_summaries))
print(len(sorted_texts))
Output:
6814
6814
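Sorting by text length means each later minibatch only needs to be padded to the longest sequence it actually contains. The double loop above rescans every review once per candidate length, though. A sketch of an equivalent single-pass version (same bounds as the loop, including the exclusive upper limit of max_text_length) filters once and then sorts the survivors, relying on Python's stable sort to preserve the original order within each length:

# Sketch: filter once, then sort the surviving indices by text length
def keep(i):
    return (min_length <= len(int_summaries[i]) <= max_summary_length and
            min_length <= len(int_texts[i]) < max_text_length and
            unk_counter(int_summaries[i]) <= unk_summary_limit and
            unk_counter(int_texts[i]) <= unk_text_limit)

kept = sorted((i for i in range(len(int_texts)) if keep(i)),
              key=lambda i: len(int_texts[i]))
sorted_summaries = [int_summaries[i] for i in kept]
sorted_texts = [int_texts[i] for i in kept]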