kaggle竞赛——入门二(Natural Language Processing with Disaster Tweets)

比赛地址:https://www.kaggle.com/c/nlp-getting-started

"""
__author__:shuangrui Guo
__description__:
"""
import pandas as pd
import numpy as np
import re
import nltk
from nltk.corpus import stopwords
import matplotlib.pyplot as plt
from nltk.stem import SnowballStemmer
import seaborn as sns
from sklearn.svm import SVC
from sklearn.metrics import f1_score
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import SelectKBest ,chi2

# Load the competition data sets from the local ./data directory.
train = pd.read_csv('./data/train.csv')
test = pd.read_csv('./data/test.csv')
# Author's note: the 'keyword' column has 61 missing values and the
# 'location' column has 2533 missing values (check with the line below).
#print(train.isnull().sum())

# Visualisation (disabled): bar chart of the target distribution with the
# count annotated on top of each bar.
# plt.figure()
# ax = train['target'].value_counts().plot.bar()
# for p in ax.patches:
#     ax.annotate(np.round(p.get_height(),decimals=2),
#                 ((p.get_x()+p.get_width()/2.0),p.get_height()),
#                 ha='center',
#                 va='center',
#                 xytext=(0,5),
#                 textcoords='offset points')
# plt.title('True vs False Disaster Tweets')
# plt.xlabel('True vs False')
# plt.xticks(rotation=360)
# plt.show()

# Text cleaning.
# A SnowballStemmer reduces each word of a sentence to its stem; the English
# stop-word list is used by clean_content to drop uninformative words.
stemmer = SnowballStemmer('english')
stopwords_list = stopwords.words('english')

def clean_content(string:str):
    """Return ``string`` as space-joined, lower-cased, stemmed tokens.

    Every character that is not an ASCII letter is treated as a separator.
    Tokens found in the module-level ``stopwords_list`` are dropped and the
    remaining tokens are stemmed with the module-level ``stemmer``.

    Args:
        string: Raw text (for example a tweet).

    Returns:
        The cleaned tokens joined by single spaces; an empty string when no
        token survives.
    """
    # Lower-case *before* the stop-word test: the stop-word list is
    # lower-case, so capitalised stop words ("The", "And", "I") would
    # otherwise never be filtered out.
    tokens = re.sub("[^a-zA-Z]", " ", string).lower().split()
    return " ".join(
        stemmer.stem(token) for token in tokens if token not in stopwords_list
    )
# Step 1: build the 'cleaned' column by running clean_content over the 'text'
# column of the training data.
train['cleaned']=train['text'].apply(clean_content)

# Step 2: remove uninformative tokens (URL remnants) and symbols.
def review_cleaning(text):
    """Strip URLs, URL remnants, digits and punctuation from ``text``.

    The function works both on raw text (``"http://t.co/abc"``) and on text
    whose punctuation has already been replaced by spaces
    (``"http t co abc"``).

    Args:
        text: The string to clean.

    Returns:
        ``text`` with only ASCII letters kept; every run of other characters
        is collapsed into a single space. Leading/trailing spaces are not
        stripped.
    """
    # Remove whole URLs (or a bare "http"/"https" token) in one pass, before
    # punctuation is touched, so that no URL tail is left behind.
    text = re.sub(r'https?\S*', ' ', text)
    # Drop only the stand-alone token "co" (left over from "t.co" short
    # links). Word boundaries keep words such as "come" or "score" intact.
    text = re.sub(r'\bco\b', ' ', text)
    # Anything that is not an ASCII letter (digits, punctuation, whitespace
    # runs) collapses into one space.
    text = re.sub(r'[^a-zA-Z]+', ' ', text)
    return text

# Second cleaning pass over the stemmed text (URL remnants, symbols).
train['cleaned'] = train['cleaned'].apply(review_cleaning)

# Drop rows whose cleaned text is empty or a single character. The whole
# DataFrame is filtered (not just the column) so that 'cleaned' and 'target'
# stay aligned row by row.
train = train[train['cleaned'].str.strip().str.len() > 1].reset_index(drop=True)

# Hold out a stratified validation split of the *text* before anything is
# fitted, so the TF-IDF vocabulary/IDF weights and the feature selector only
# ever see the training part (no leakage into the validation score).
X_train, X_test, y_train, y_test = train_test_split(
    train['cleaned'],
    train['target'],
    test_size=0.2,
    stratify=train['target'],
    random_state=0,  # reproducible split
)

# TF-IDF over word uni-, bi- and tri-grams, capped at 10000 features.
tfidf = TfidfVectorizer(analyzer='word',
                        max_features=10000,
                        ngram_range=(1, 3),
                        stop_words='english')

# The pipeline chains vectorisation -> chi2 feature selection -> SVM, so a
# single fit()/predict() call applies every step consistently to any text.
pipeline = Pipeline(
    [('tfidf', tfidf),
     # Keep the 6500 features with the highest chi2 score w.r.t. the target.
     ('select_k_best', SelectKBest(chi2, k=6500)),
     # `degree` only applies to the 'poly' kernel, so it is not set here.
     ('classifier', SVC(kernel='rbf', random_state=0, verbose=True, gamma=1, C=1,
                        shrinking=True, probability=False, cache_size=5))]
)

model = pipeline.fit(X_train, y_train)
y_pred = model.predict(X_test)
print(f1_score(y_test, y_pred))

# Predict on the real Kaggle test set and save the submission file.
test['cleaned'] = test['text'].apply(clean_content)
test['cleaned'] = test['cleaned'].apply(review_cleaning)

# The fitted pipeline vectorises the raw cleaned text itself.
test_pred = model.predict(test['cleaned'])
test['target'] = test_pred
columns = ['id', 'target']
submission = test[columns]
submission.to_csv('./submission.csv', index=False)

目前的不足:

文本清洗部分感觉有些奇怪

不理解为什么直接用 TF-IDF 的结果去划分训练集和测试集

SelectKBest的作用不清楚

Pipeline的使用不了解

kaggle竞赛——入门二(Natural Language Processing with Disaster Tweets)_第1张图片

你可能感兴趣的:(kaggle,可视化,python,正则表达式,kaggle,自然语言处理)