The dataset comes from Kaggle's sentiment classification task on IMDB movie reviews (the "Bag of Words Meets Bags of Popcorn" competition).
1. Load the files
import pandas as pd
# quoting=3 (csv.QUOTE_NONE) tells pandas to ignore double quotes inside the reviews
train = pd.read_csv(r"labeledTrainData\labeledTrainData.tsv", header=0, delimiter="\t", quoting=3)
unlabeled = pd.read_csv(r"unlabeledTrainData\unlabeledTrainData.tsv", header=0, delimiter="\t", quoting=3)
test = pd.read_csv(r"testData\testData.tsv", header=0, delimiter="\t", quoting=3)
2. Inspect the data format
print(train.shape)
print(train.columns.values)
# a column of train can be selected by name, e.g.:
# print(train['review'][0])
print(unlabeled.shape)
print(unlabeled.columns.values)
print(test.shape)
print(test.columns.values)
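With the standard competition files, the printed shapes should come out roughly as follows (my recollection of the Kaggle data description; verify against your own download):

# (25000, 3)  ['id' 'sentiment' 'review']
# (50000, 2)  ['id' 'review']
# (25000, 2)  ['id' 'review']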
3. Remove stop words and keep only letters and digits
import re
from bs4 import BeautifulSoup
from nltk.corpus import stopwords
import nltk

stopwords_ = set(stopwords.words("english"))

def review_to_words(raw_review, remove_stopwords=False):
    # strip the HTML tags left over in the raw reviews
    text = BeautifulSoup(raw_review, "html.parser").get_text()
    # keep only letters and digits, lower-case, and split into tokens
    words = re.sub("[^a-zA-Z0-9]", " ", text).lower().split()
    if remove_stopwords:
        words = [w for w in words if w not in stopwords_]
    return words
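A quick sanity check on an illustrative string (not taken from the dataset):

print(review_to_words("<br />This movie is NOT bad!", remove_stopwords=True))
# -> ['movie', 'bad']  (HTML tag, punctuation, and stop words are gone)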
# punkt sentence splitter: Word2Vec expects a list of sentences,
# where each sentence is a list of words
tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')

def tokenize_to_sentence(raw_para, tokenizer):
    raw_sentences = tokenizer.tokenize(raw_para.strip())
    sentences = []
    for sent in raw_sentences:
        if len(sent) > 0:
            sentences.append(review_to_words(sent))
    return sentences
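For example, a two-sentence snippet becomes two word lists (illustrative input):

print(tokenize_to_sentence("Great film. I loved it!", tokenizer))
# -> [['great', 'film'], ['i', 'loved', 'it']]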
# build the sentence corpus for Word2Vec from all three files
# (unlabeled and test reviews can be included because Word2Vec
# training is unsupervised)
clean_review_sentence = []
for df in (train, unlabeled, test):
    for i in range(len(df["review"])):
        clean_review_sentence += tokenize_to_sentence(df["review"][i], tokenizer)
        if i % 1000 == 0:
            print("*" * 25, i)
4. Enable logging
print(len(clean_review_sentence))  # total number of sentences in the corpus

# gensim reports training progress through the standard logging module
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
5. Train Word2Vec
print(clean_review_sentence[0])
num_features = 300     # dimensionality of the word vectors
min_word_count = 40    # ignore words with fewer total occurrences than this
num_workers = 4        # number of training threads
context = 10           # context window size
downsampling = 1e-3    # downsample rate for frequent words
from gensim.models import word2vec

print("Training model...")
# note: this is the pre-4.0 gensim API; gensim 4.x renamed size to vector_size
model = word2vec.Word2Vec(clean_review_sentence, workers=num_workers,
                          size=num_features, min_count=min_word_count,
                          window=context, sample=downsampling)
# If you don't plan to train the model any further, calling
# init_sims will make the model much more memory-efficient.
model.init_sims(replace=True)
model_name = "300features_40minwords_10context"
model.save(model_name)
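The saved model can be reloaded in a later session without retraining:

from gensim.models import Word2Vec
model = Word2Vec.load("300features_40minwords_10context")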
6. Using the Word2Vec model
model.doesnt_match("man woman child kitchen".split())
model.doesnt_match("france england germany berlin".split())
model.most_similar("awful")
7. Vector representation of a review
import numpy as np

# vocabulary of the trained model (pre-4.0 gensim API;
# 4.x uses model.wv.index_to_key and model.wv[word])
index2word_set = set(model.wv.index2word)

def makeFeatureVec(review, nums_features):
    # represent a review as the average of its in-vocabulary word vectors
    features_vec = np.zeros((nums_features,), dtype="float32")
    nwords = 0
    for word in review:
        if word in index2word_set:
            features_vec = np.add(features_vec, model[word])
            nwords += 1
    if nwords != 0:
        features_vec = np.divide(features_vec, nwords)
    return features_vec

def getAvgFeatureVecs(paras, nums_features):
    counter = 0
    reviews_vecs = []
    for review in paras:
        if counter % 1000 == 0:
            print("*" * 25, counter)
        reviews_vecs.append(makeFeatureVec(review, nums_features))
        counter += 1
    return reviews_vecs
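The per-word loop can also be written more compactly with NumPy; a sketch equivalent to makeFeatureVec above:

def makeFeatureVec_fast(review, nums_features):
    # collect the vectors of in-vocabulary words and average them
    vecs = [model[w] for w in review if w in index2word_set]
    if not vecs:
        return np.zeros((nums_features,), dtype="float32")
    return np.mean(vecs, axis=0).astype("float32")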
# note: stop words are kept here (remove_stopwords defaults to False)
clean_train_reviews = []
for review in train["review"]:
    clean_train_reviews.append(review_to_words(review))
trainVec = getAvgFeatureVecs(clean_train_reviews, num_features)

print("Creating average feature vecs for test reviews")
clean_test_reviews = []
for review in test["review"]:
    clean_test_reviews.append(review_to_words(review))
testVec = getAvgFeatureVecs(clean_test_reviews, num_features)
8. Random forest
from sklearn.ensemble import RandomForestClassifier

forest = RandomForestClassifier(n_estimators=200)
print("Fitting a random forest to labeled training data...")
forest = forest.fit(trainVec, train["sentiment"])
res = forest.predict(testVec)

# Write the test results; to_csv emits comma-separated values,
# so name the file .csv to match its actual format
output = pd.DataFrame(data={"id": test["id"], "sentiment": res})
output.to_csv("Word2Vec_AverageVectors.csv", index=False, quoting=3)
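Before submitting, a hold-out check on the labeled data gives a rough accuracy estimate (a sketch; the 80/20 split and random_state are arbitrary choices, not part of the original pipeline):

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

X_tr, X_val, y_tr, y_val = train_test_split(
    trainVec, train["sentiment"], test_size=0.2, random_state=42)
clf = RandomForestClassifier(n_estimators=200).fit(X_tr, y_tr)
print("validation accuracy:", accuracy_score(y_val, clf.predict(X_val)))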