随机森林 Word2Vec 文本分类

数据集来自 Kaggle 的情感分类(sentiment classification)任务。

1、加载文件

import os

import pandas as pd

# Build each path with os.path.join instead of hard-coding the Windows "\"
# separator, so the same script also finds the files on Linux/macOS.
# header=0      -> the first row holds the column names
# delimiter     -> the files are tab-separated
# quoting=3     -> csv.QUOTE_NONE: double quotes are ordinary characters
train = pd.read_csv(os.path.join("labeledTrainData", "labeledTrainData.tsv"),
                    header=0, delimiter="\t", quoting=3)
unlabeled = pd.read_csv(os.path.join("unlabeledTrainData", "unlabeledTrainData.tsv"),
                        header=0, delimiter="\t", quoting=3)
test = pd.read_csv(os.path.join("testData", "testData.tsv"),
                   header=0, delimiter="\t", quoting=3)

2、输出数据格式

# Show the dimensions and the column names of each of the three data sets,
# in the order train, unlabeled, test.
# Columns are selected by name first, e.g. train['review'][0] is the text of
# the first training review.
for frame in (train, unlabeled, test):
    print(frame.shape)
    print(frame.columns.values)

3、文本清洗:去除HTML标签、只保留字母和数字(可选:删除停用词)

import re
from bs4 import BeautifulSoup
from nltk.corpus import stopwords
import nltk

# English stop words, held in a set for the membership test in review_to_words.
stopwords_ = set(stopwords.words("english"))


def review_to_words(raw_review, isStopwords=False):
    """Convert one raw review (or sentence) into a list of lower-case tokens.

    Args:
        raw_review: Text that may contain HTML markup.
        isStopwords: When True, tokens found in ``stopwords_`` are dropped.
            Defaults to False, i.e. stop words are kept.

    Returns:
        A list of strings made only of the characters a-z and 0-9.
    """
    # Name the parser explicitly: without it BeautifulSoup emits a warning and
    # uses whichever parser happens to be installed, so the extracted text
    # could differ between machines.
    text = BeautifulSoup(raw_review, "html.parser").get_text()
    # Every character that is not a letter or digit becomes a separator.
    tokens = re.sub(r"[^a-zA-Z0-9]", " ", text).lower().split()

    if isStopwords:
        tokens = [token for token in tokens if token not in stopwords_]
    return tokens

# Load NLTK's pickled Punkt sentence tokenizer for English; it is passed to
# tokenize_to_sentence to split each review into sentences.
tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
def tokenize_to_sentence(raw_para, tokenizer):
    """Split a paragraph into sentences and tokenize each one.

    Args:
        raw_para: The raw paragraph text; surrounding whitespace is stripped
            before splitting.
        tokenizer: An object whose ``tokenize(text)`` method returns the
            sentences of ``text``.

    Returns:
        A list with one ``review_to_words`` result per non-empty sentence.
    """
    pieces = tokenizer.tokenize(raw_para.strip())
    # Empty sentences are skipped.
    return [review_to_words(piece) for piece in pieces if len(piece) > 0]

nums_train = len(train["review"])
nums_unlabeled = len(unlabeled["review"])
nums_test = len(test["review"])

# One token list per sentence, pooled over all three data sets.
clean_review_sentence = []

# The same processing applies to train, unlabeled and test, so run one loop
# over the three frames instead of three copies of it.
for frame in (train, unlabeled, test):
    # Iterate the review values directly rather than looking each one up by
    # integer label, which only works while the index is the default 0..n-1.
    for i, review in enumerate(frame["review"]):
        clean_review_sentence += tokenize_to_sentence(review, tokenizer)
        # Progress marker every 1000 reviews.
        if i % 1000 == 0:
            print("*" * 25, i)

4、显示log

# Total number of tokenized sentences collected so far.
print(len(clean_review_sentence))

import logging

# Emit INFO-level (and higher) log records with a timestamp and level prefix.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s : %(levelname)s : %(message)s',
)

 

5、Word2Vec

# Peek at the first tokenized sentence as a sanity check.
print(clean_review_sentence[0])

# Word2Vec hyper-parameters.
num_features = 300      # passed as `size`
min_word_count = 40     # passed as `min_count`
num_workers = 4         # passed as `workers`
context = 10            # passed as `window`
downsampling = 1e-3     # passed as `sample`

from gensim.models import word2vec

print("Training model...")
model = word2vec.Word2Vec(
    clean_review_sentence,
    workers=num_workers,
    size=num_features,
    min_count=min_word_count,
    window=context,
    sample=downsampling,
)

# If you don't plan to train the model any further, calling init_sims
# will make the model much more memory-efficient.
model.init_sims(replace=True)

# Save the trained model under a name that records its settings.
model_name = "300features_40minwords_10context"
model.save(model_name)

6、Word2Vec 应用

# Query the trained vectors through model.wv (the model-level shortcuts are
# deprecated) and print the results, which would otherwise be discarded when
# this runs as a script.
print(model.wv.doesnt_match("man woman child kitchen".split()))
print(model.wv.doesnt_match("france england germany berlin".split()))
print(model.wv.most_similar("awful"))

7、句子的向量表示

import numpy as np

# The model's vocabulary words as a set; makeFeatureVec tests each word of a
# review for membership in it.
index2word_set = set(model.wv.index2word)

def getAvgFeatureVecs(paras, nums_features):
    """Build one feature vector per review.

    Args:
        paras: Iterable of reviews, each passed unchanged to makeFeatureVec.
        nums_features: Vector length forwarded to makeFeatureVec.

    Returns:
        A list holding the makeFeatureVec result for every review, in order.
    """
    reviews_vecs = []
    for position, review in enumerate(paras):
        # Progress marker every 1000 reviews.
        if position % 1000 == 0:
            print("*" * 25, position)
        reviews_vecs.append(makeFeatureVec(review, nums_features))
    return reviews_vecs
        
def makeFeatureVec(review, nums_features):
    """Average the word vectors of the in-vocabulary words of one review.

    Args:
        review: Iterable of word strings.
        nums_features: Length of the returned vector.

    Returns:
        A float32 NumPy array of length ``nums_features``: the mean of the
        vectors of all words found in ``index2word_set``, or all zeros when
        no word of the review is in the vocabulary.
    """
    features_vec = np.zeros((nums_features), dtype="float32")
    # Named n_words rather than `sum` so the builtin is not shadowed.
    n_words = 0
    for word in review:
        if word in index2word_set:
            # Look the vector up through model.wv; indexing the model object
            # directly is deprecated.
            features_vec = np.add(features_vec, model.wv[word])
            n_words += 1
    # Guard against division by zero for reviews with no known words.
    if n_words != 0:
        features_vec = np.divide(features_vec, n_words)
    return features_vec

# Tokenize every training review (stop words kept) and average its word
# vectors.
clean_train_reviews = [review_to_words(text) for text in train["review"]]
trainVec = getAvgFeatureVecs(clean_train_reviews, num_features)

# Same two steps for the test reviews.
print("Creating average feature vecs for test reviews")
clean_test_reviews = [review_to_words(text) for text in test["review"]]
testVec = getAvgFeatureVecs(clean_test_reviews, num_features)
    

 

8、随机森林

from sklearn.ensemble import RandomForestClassifier

# Train a 200-tree random forest on the averaged training vectors.
forest = RandomForestClassifier(n_estimators=200)
print("Fitting a random forest to labeled training data...")
forest = forest.fit(trainVec, train["sentiment"])

# Predict a sentiment label for every test review.
res = forest.predict(testVec)

# Write the test results: one row per test id with its predicted sentiment.
# NOTE(review): to_csv uses a comma separator by default, so this file is
# comma-separated despite its .tsv name - confirm which was intended.
submission = {"id": test["id"], "sentiment": res}
output = pd.DataFrame(data=submission)
output.to_csv("Word2Vec_AverageVectors.tsv", index=False, quoting=3)

 

 

 

 

 

你可能感兴趣的:(自然语言处理,Python)