预处理主要是处理数据集B中的原始数据和数据集A中的文本,目的是将包含网页链接、标点符号及无意义单词的杂乱数据转化为干净的由有意义单词组成的字符串,并使用pandas dataframe存储。
初始爬取到的推文为json格式,因此需要读取json文件,并将其中的每条推文存储到列表中,目标格式为 ['今天天气真好', 'This is my bag']
使用pandas dataframe存储处理后的文本数据并保存为csv格式文件
# Build the cleaned English tweet corpus (dataset B) and save it as a CSV file.
if __name__ == '__main__':
    storename = 'D:/data/test.csv'
    # NOTE(review): `filepath` is not bound anywhere in this snippet; it has to
    # point at the crawled JSON file before this line runs -- confirm.
    tweets = get_tweets(filepath)
    df = pd.DataFrame()
    df['tweets'] = tweets
    # Tokenize each tweet
    df['tweets'] = df['tweets'].apply(get_word)
    # Lemmatize the tokens
    df['tweets'] = df['tweets'].apply(get_pos_word)
    # Drop rows whose token list ended up empty. The explicit copy keeps the
    # column assignment below from writing into a slice of the original frame
    # (which would raise SettingWithCopyWarning).
    df = df[df['tweets'].str.len() != 0].copy()
    # Join every token list back into a single space-separated string
    df['tweets'] = df['tweets'].apply(' '.join)
    # Save the cleaned text
    df.to_csv(storename, encoding='utf-8')
不同于处理数据集B,数据集A为行列数据整齐的英文文本,但是没有列名,且有几列多余数据,因此主要加入了对csv格式文件的读取和处理。注意本文使用的数据集A编码格式为 ISO-8859-1;如果读取csv文件时使用 encoding='ISO-8859-1' 报错,请改成 encoding='utf-8'。
# Clean dataset A (a header-less CSV of labelled tweets) and save the result.
if __name__ == '__main__':
    # The raw file has no header row, so the column names are supplied here.
    # The frame is bound to `df` because every later step operates on `df`;
    # reading it into a different name left `df` undefined (or stale).
    df = pd.read_csv('D:/data/Train.csv', encoding='ISO-8859-1',
                     header=None,
                     names=['label', 'id', 'day', 'query', 'user', 'tweets'])
    # Drop the surplus columns at positions 1-4 (id, day, query, user)
    df = df.drop(df.columns[1:5], axis=1)
    # Strip URLs from the tweet text
    df['tweets'] = df['tweets'].apply(remove_urls)
    # Tokenize each tweet
    df['tweets'] = df['tweets'].apply(get_word)
    # Post-process the tokens with get_pos_word
    df['tweets'] = df['tweets'].apply(get_pos_word)
    # Drop rows whose token list ended up empty; copy so the assignment below
    # does not write into a slice of the original frame.
    df = df[df['tweets'].str.len() != 0].copy()
    # Join every token list back into a single space-separated string
    df['tweets'] = df['tweets'].apply(' '.join)
    # Shuffle the rows and renumber the index
    df = df.sample(frac=1.0).reset_index(drop=True)
    # Raw string: "\d" and "\T" are not valid escape sequences in a normal
    # string literal. The resulting path is unchanged.
    df.to_csv(r"D:\data\Test5000.csv", encoding='utf-8', index=False)
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

# Both vectorizers are configured with the same document-frequency bounds
# (max_df / min_df) and the same cap on vocabulary size (max_features).
_vectorizer_options = {'max_df': 0.80, 'min_df': 2, 'max_features': 5000}

# Bag-of-words feature
bow_vectorizer = CountVectorizer(**_vectorizer_options)
# TF-IDF feature
tfidf_vectorizer = TfidfVectorizer(**_vectorizer_options)
from sklearn.model_selection import train_test_split

# Load the preprocessed tweets. The path is a raw string because "\d" and "\T"
# are not valid escape sequences in a normal string literal; the value of the
# path is unchanged.
df = pd.read_csv(r"D:\data\Train.csv", encoding='utf-8')
x = df['tweets']  # independent variable (tweet text)
y = df['label']   # dependent variable (label)
# Hold out 20% of the rows as the test set; random_state fixes the split.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, random_state=1)
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.neighbors import KNeighborsClassifier

# One candidate classifier per algorithm, each created with default settings.
Knn = KNeighborsClassifier()   # k-nearest neighbours
Lr = LogisticRegression()      # logistic regression
Rf = RandomForestClassifier()  # random forest
Svm = SGDClassifier()          # SVM (linear model fitted with SGD)
Nb = MultinomialNB()           # multinomial naive Bayes
from sklearn import metrics
from sklearn.pipeline import make_pipeline

# Chain the bag-of-words vectorizer with the naive Bayes classifier
pipe = make_pipeline(bow_vectorizer, Nb)
pipe.fit(x_train, y_train)
y_pred = pipe.predict(x_test)  # predict on the held-out test set
# y_pred holds one value per test row only, so assigning it straight to a
# column of the full frame fails with a length mismatch. Aligning on the test
# rows' index stores each prediction on its own row and leaves the remaining
# rows as NaN.
df['pred'] = pd.Series(y_pred, index=x_test.index)
# Print the evaluation report for the test set
print(metrics.classification_report(y_test, y_pred))
# `sklearn.externals.joblib` no longer exists in current scikit-learn releases;
# prefer the standalone package and fall back only for old installations.
try:
    import joblib
except ImportError:
    from sklearn.externals import joblib

# Persist the fitted pipeline, then load it back.
# NOTE: joblib files are pickle-based -- only load files you wrote yourself.
joblib.dump(pipe, 'D:/data/Bayes.pkl')
Bayes = joblib.load('D:/data/Bayes.pkl')  # load the saved model
y_pred = Bayes.predict(x_test)  # predict with the reloaded model
# Predictions exist for the test rows only; align them on the test index so
# the assignment does not fail with a length mismatch (other rows become NaN).
df['pred02'] = pd.Series(y_pred, index=x_test.index)
# Standalone script: train, evaluate and save a TF-IDF + SGDClassifier pipeline.
import pandas as pd
from sklearn import metrics  # used for the classification report below
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline

# `sklearn.externals.joblib` no longer exists in current scikit-learn releases;
# prefer the standalone package and fall back only for old installations.
try:
    import joblib
except ImportError:
    from sklearn.externals import joblib

# Raw string: "\d" and "\T" are not valid escape sequences in a normal string
# literal. The path value itself is unchanged.
df = pd.read_csv(r"D:\data\Train.csv", encoding='utf-8')
# Shuffle the rows and renumber the index
df = df.sample(frac=1.0).reset_index(drop=True)
x = df['tweets']  # independent variable (tweet text)
y = df['label']   # dependent variable (label)
# Hold out 20% of the rows as the test set; random_state fixes the split.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, random_state=1)

tfidf_vectorizer = TfidfVectorizer(max_df=0.80, min_df=2)
# SVM classifier
Svm = SGDClassifier()
pipe = make_pipeline(tfidf_vectorizer, Svm)
pipe.fit(x_train, y_train)
y_pred = pipe.predict(x_test)  # predict on the held-out test set
# Predictions exist for the test rows only; align them on the test index so
# the assignment does not fail with a length mismatch (other rows become NaN).
df['pred'] = pd.Series(y_pred, index=x_test.index)
print(metrics.classification_report(y_test, y_pred))  # evaluate
joblib.dump(pipe, 'D:/data/SVM.pkl')  # save the model