项目1并没有直接开始搭建网络，而是先从直观上对我们的数据进行分析，并且掌握一些有用的库，具体包括：
import numpy as np
import pandas as pd
# Load the review texts. header=None tells pandas not to treat the first line
# as column names, so the single column is labelled 0.
reviews = pd.read_csv('reviews.txt', header=None)
# Preview the first rows of the frame.
reviews.head()
| | 0 |
---|---|
0 | bromwell high is a cartoon comedy . it ran at … |
1 | story of a man who has unnatural feelings for … |
2 | homelessness or houselessness as george carli… |
3 | airport starts as a brand new luxury pla… |
4 | brilliant over acting by lesley ann warren . … |
# Load the sentiment labels. header=None tells pandas not to treat the first
# line as column names, so the single column is labelled 0.
labels = pd.read_csv('labels.txt', header=None)
# Preview the first rows of the frame.
labels.head()
| | 0 |
---|---|
0 | positive |
1 | negative |
2 | positive |
3 | negative |
4 | positive |
from collections import Counter

# Token frequencies: one counter per sentiment class plus one for the corpus.
positive_counts = Counter()
negative_counts = Counter()
total_counts = Counter()

# Both frames were read with header=None, so their only column is labelled 0.
# Iterating the two columns in lockstep yields plain (review, label) strings
# instead of 1-element numpy rows.
for review, label in zip(reviews[0], labels[0]):
    # Splitting on a single space keeps '' tokens wherever spaces are
    # adjacent; this matches the '' entries in the frequency listings.
    words = review.split(' ')
    total_counts.update(words)
    if label == 'positive':
        positive_counts.update(words)
    else:
        # Every label other than 'positive' is counted as negative.
        negative_counts.update(words)

# Number of distinct tokens seen across all reviews.
len(total_counts)
74074
# The 20 most frequent tokens in positive reviews, as (token, count) pairs.
positive_counts.most_common(20)
[('', 550468),
('the', 173324),
('.', 159654),
('and', 89722),
('a', 83688),
('of', 76855),
('to', 66746),
('is', 57245),
('in', 50215),
('br', 49235),
('it', 48025),
('i', 40743),
('that', 35630),
('this', 35080),
('s', 33815),
('as', 26308),
('with', 23247),
('for', 22416),
('was', 21917),
('film', 20937)]
# The 20 most frequent tokens in non-positive reviews, as (token, count) pairs.
negative_counts.most_common(20)
[('', 561462),
('.', 167538),
('the', 163389),
('a', 79321),
('and', 74385),
('of', 69009),
('to', 68974),
('br', 52637),
('is', 50083),
('it', 48327),
('i', 46880),
('in', 43753),
('this', 40920),
('that', 37615),
('s', 31546),
('was', 26291),
('movie', 24965),
('for', 21927),
('but', 21781),
('with', 20878)]
# Assign each distinct token an integer index, following the iteration order
# of total_counts (iterating a Counter yields its keys).
word2index = {word: idx for idx, word in enumerate(total_counts)}
# Look up the index assigned to one sample word.
word2index['apple']
25600