Kaggle Getting-Started Competition "Bag of Words Meets Bags of Popcorn" (Part 1): Data Preprocessing

1. re.sub(pattern, repl, string, count=0)

letters_only = re.sub("[^a-zA-Z]", " ", review_text) replaces every character in the text that is not a letter with a space.

pattern is the regular-expression pattern string. repl (short for "replacement") is what each match is replaced with; it can be a string, or a function that takes the match object and returns the replacement text.

string is the input string in which the replacement is performed.

count caps how many matches are replaced; the default 0 means every match is replaced.
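
A minimal sketch of how these arguments behave (the string a1b2c3 is made up for illustration):

import re

s = "a1b2c3"
# count=0 (the default) replaces every match
print(re.sub("[^a-zA-Z]", " ", s))            # 'a b c '
# count=2 stops after the first two matches
print(re.sub("[^a-zA-Z]", " ", s, count=2))   # 'a b c3'
# repl can also be a function that receives the match object
print(re.sub("[^a-zA-Z]", lambda m: "#", s))  # 'a#b#c#'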

2. stopwords.words("english") loads the English stop word list.
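
A quick sketch of loading the list; note that the corpus must be downloaded once with nltk.download before first use:

import nltk
nltk.download("stopwords")        # one-time download of the stopword corpus
from nltk.corpus import stopwords

stops = stopwords.words("english")
print(stops[:5])                  # e.g. ['i', 'me', 'my', 'myself', 'we']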

3. CountVectorizer converts a collection of text documents into a matrix of token counts; see my other note for details.
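
A minimal sketch on a made-up two-document corpus (get_feature_names_out requires scikit-learn >= 1.0; older versions use get_feature_names):

from sklearn.feature_extraction.text import CountVectorizer

docs = ["the cat sat", "the cat sat on the mat"]   # toy corpus, invented here
vec = CountVectorizer()
X = vec.fit_transform(docs)            # sparse document-term count matrix
print(vec.get_feature_names_out())     # ['cat' 'mat' 'on' 'sat' 'the']
print(X.toarray())                     # [[1 0 0 1 1]
                                       #  [1 1 1 1 2]]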

4. A random forest is used here, rather crudely, to train the model and make predictions; unlabeledTrainData is not used at all, so the results are not great.


import pandas as pd
import numpy as np
from bs4 import BeautifulSoup
import re
import nltk
from nltk.corpus import stopwords
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import CountVectorizer

# Clean the data
def review_to_words(raw_review):
    # Function to convert a raw review to a string of words
    # The input is a single string (a raw movie review), and 
    # the output is a single string (a preprocessed movie review)
    #
    # 1. Remove HTML
    review_text = BeautifulSoup(raw_review, "html.parser").get_text()
    #
    # 2. Remove non-letters        
    letters_only = re.sub("[^a-zA-Z]", " ", review_text) 
    #
    # 3. Convert to lower case, split into individual words
    words = letters_only.lower().split()                             
    #
    # 4. In Python, searching a set is much faster than searching
    #   a list, so convert the stop words to a set
    stops = set(stopwords.words("english"))                  
    # 
    # 5. Remove stop words
    meaningful_words = [w for w in words if w not in stops]
    #
    # 6. Join the words back into one string separated by space, 
    # and return the result.
    return " ".join(meaningful_words)
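
# A quick sanity check of review_to_words on a made-up snippet
# (the review text below is invented for illustration):
#   review_to_words("<br />This movie was <b>not</b> great... 3/10!")
#   returns "movie great" -- HTML stripped, non-letters removed,
#   lower-cased, and stop words ("this", "was", "not") dropped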

# Load the labeled training set
def load_train_data():  
    train = pd.read_csv("dataSet/labeledTrainData.tsv", header=0, \
                        delimiter="\t", quoting=3)
    # Get the number of reviews based on the dataframe column size
    num_reviews = train["review"].size
    # Initialize an empty list to hold the clean reviews
    clean_train_reviews = []
    # Loop over each review; create an index i that goes from 0 to the length
    # of the movie review list 
    for i in range(num_reviews):
        # Call our function for each one, and add the result to the list of
        # clean reviews
        clean_train_reviews.append( review_to_words( train["review"][i] ) )
    return train, np.array(clean_train_reviews), np.array(train["sentiment"])

# Load the unlabeled training set
def load_unlabeled_train_data():
    unlabeled_train = pd.read_csv("dataSet/unlabeledTrainData.tsv", header=0, \
                                  delimiter="\t", quoting=3)
    # Get the number of reviews based on the dataframe column size
    num_reviews = unlabeled_train["review"].size
    # Initialize an empty list to hold the clean reviews
    clean_unlabeled_train_reviews = []
    # Loop over each review; create an index i that goes from 0 to the length
    # of the movie review list 
    for i in range(num_reviews):
        # Call our function for each one, and add the result to the list of
        # clean reviews
        clean_unlabeled_train_reviews.append( review_to_words(unlabeled_train["review"][i]))
    return unlabeled_train, np.array(clean_unlabeled_train_reviews)

# Load the test set
def load_test_data():
    test = pd.read_csv("dataSet/testData.tsv", header=0, \
                    delimiter="\t", quoting=3)
    # Get the number of reviews based on the dataframe column size
    num_reviews = test["review"].size
    # Initialize an empty list to hold the clean reviews
    clean_test_reviews = []
    # Loop over each review; create an index i that goes from 0 to the length
    # of the movie review list 
    for i in range(num_reviews):
        # Call our function for each one, and add the result to the list of
        # clean reviews
        clean_test_reviews.append(review_to_words(test["review"][i]))
    return test, np.array(clean_test_reviews)

def text2vec(trainArr):
    # Initialize the "CountVectorizer" object, which is scikit-learn's
    # bag-of-words tool. Keep the 5000 most frequent words as features.
    vectorizer = CountVectorizer(analyzer = "word",   \
                             tokenizer = None,    \
                             preprocessor = None, \
                             stop_words = None,   \
                             max_features = 5000) 
    # fit_transform() does two things: first, it fits the model
    # and learns the vocabulary; second, it transforms our training data
    # into feature vectors. The input to fit_transform should be a list of strings.
    train_data_features = vectorizer.fit_transform(trainArr)
    # Numpy arrays are easy to work with, so convert the result to an array 
    train_data_features = train_data_features.toarray()
    # Take a look at the words in the vocabulary
    # (scikit-learn < 1.0 uses get_feature_names() instead)
    vocab = vectorizer.get_feature_names_out()
    # Sum up the counts of each vocabulary word
    dist = np.sum(train_data_features, axis=0)
    # Also return the fitted vectorizer so the same vocabulary can be
    # reused to transform the test set
    return vectorizer, train_data_features, vocab, dist
    
train, x_train, y_train = load_train_data()
unlabeled_train, x_unlabeled_train = load_unlabeled_train_data()
test, x_test = load_test_data()

vectorizer, train_data_features, vocab, dist = text2vec(x_train)
# Transform the test reviews with the vectorizer fitted on the training data;
# fitting a fresh vectorizer on the test set would learn a different vocabulary
# and misalign the feature columns
test_data_features = vectorizer.transform(x_test).toarray()

# Initialize a Random Forest classifier with 100 trees
forest = RandomForestClassifier(n_estimators = 100) 
forest.fit(train_data_features, y_train)
result = forest.predict(test_data_features)

output = pd.DataFrame( data={"id":test["id"], "sentiment":result} )
output.to_csv( "Bag_of_Words_model1.csv", index=False, quoting=3 )
