import os
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.model_selection import train_test_split
from sklearn import svm
import matplotlib.pyplot as plt
import re #regex
import nltk
import string
# nltk.download('stopwords')
# from nltk.corpus import stopwords
import seaborn as sns
from scipy.sparse import coo_matrix
# Load the competition data.
# NOTE(review): the paths are absolute and machine-specific.
train_data = pd.read_csv('F:/跨媒体计算实验组/NLP/数据集/SentimentAnalysis/train.csv').copy()
test_data = pd.read_csv('F:/跨媒体计算实验组/NLP/数据集/SentimentAnalysis/test.csv').copy()
sample = pd.read_csv('F:/跨媒体计算实验组/NLP/数据集/SentimentAnalysis/sample_submission.csv').copy()

# Pre-processing step - from https://www.kaggle.com/rajaram1988/ignored-stop-words-using-only-word-counts
# The patterns are compiled once and shared by the train and test sets so
# that both are always cleaned in exactly the same way.
_NEWLINE_RE = re.compile(r'\n')
# The text is lower-cased before this pattern is applied, so it has to be
# written in lower case (the original "[[User" could never match).
_USER_TAG_RE = re.compile(r'\[\[user.*')
_IP_RE = re.compile(r'\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}')
# NOTE(review): only plain http:// links are matched; https:// links are kept.
_HTTP_LINK_RE = re.compile(r'(http://.*?\s)|(http://.*)')


def _clean_text(text):
    """Return *text* lower-cased with newlines, user tags, IPs and links removed.

    The steps run in a fixed order: lower-case, replace newlines with a
    space (so neighbouring words are not glued together), drop everything
    from a "[[user" tag onwards, drop dotted-quad IP addresses, and finally
    drop http:// links.
    """
    text = str(text).lower()
    text = _NEWLINE_RE.sub(' ', text)
    text = _USER_TAG_RE.sub('', text)
    text = _IP_RE.sub('', text)
    text = _HTTP_LINK_RE.sub('', text)
    return text


# drop entries with missing values (the result must be re-assigned to the
# same name, otherwise the rows are never actually dropped)
train_data = train_data.dropna()
test_data = test_data.dropna()

train_data['text'] = train_data['text'].map(_clean_text)
test_data['text'] = test_data['text'].map(_clean_text)
# Hold out 20% of the cleaned training rows for validation; the fixed
# random_state keeps the split reproducible.
x_train, x_val = train_test_split(train_data, train_size=0.8, random_state=23)

# Set aside the positive / negative / neutral tweets of the training split.
positive_tweets, negative_tweets, neutral_tweets = (
    x_train[x_train['sentiment'] == label]
    for label in ('positive', 'negative', 'neutral')
)
# Word counts of 'selected_text' for the non-neutral training tweets.
pos_selected_lengths = positive_tweets['selected_text'].map(lambda x: len(x.split()))
neg_selected_lengths = negative_tweets['selected_text'].map(lambda x: len(x.split()))

# Figure 1: density of 'selected_text' lengths, one curve per sentiment.
plt.figure(figsize=(12, 6))
p1 = sns.kdeplot(pos_selected_lengths, shade=True, color="b").set_title('Selected Text lengths across Positive and Negative Sentiments')
p2 = sns.kdeplot(neg_selected_lengths, shade=True, color="r")
plt.legend(labels=['positive', 'negative'])
# Author's observation: selected_text is more frequently shorter in positive tweets.

# Word counts of the full 'text' for the non-neutral training tweets.
pos_lengths = positive_tweets['text'].map(lambda x: len(x.split()))
neg_lengths = negative_tweets['text'].map(lambda x: len(x.split()))

# Figure 2: density of full tweet lengths. It gets its own figure; without
# this call the curves would be drawn over figure 1 and replace its title.
plt.figure(figsize=(12, 6))
p1 = sns.kdeplot(pos_lengths, shade=True, color="b").set_title('Text Lengths across Positive and Negative Sentiments')
p2 = sns.kdeplot(neg_lengths, shade=True, color="r")
plt.legend(labels=['positive', 'negative'])
# Author's observation: this doesn't seem all that useful; tweet lengths are
# distributed evenly in positive and negative tweets.
# create feature vectors that include ngrams of size max_ngram
# so we can select a feature that is a word or phrase to be our 'selected_text'
# max_ngram: longest 'selected_text' (in words) over both sentiments
# (the author observed 30).
max_ngram = max(max(pos_selected_lengths), max(neg_selected_lengths))
# min_ngram: shortest 'selected_text' over both sentiments (the author
# observed 1). The original comparison was copied from the max line with
# '>' left in place, which picked the larger of the two minima.
min_ngram = min(min(pos_selected_lengths), min(neg_selected_lengths))
# this ended up being fruitless
# Approach 1: vectorize with CountVectorizer, then apply TfidfTransformer for the TF-IDF weighting.
# Approach 2: use TfidfVectorizer to do the vectorization and the TF-IDF weighting in a single step.
# Bag-of-words vectorizer; max_features keeps only the most frequent terms
# of the corpus (at most 10000).
# NOTE(review): the original call was never closed, so any further
# arguments it may have had are not visible here - confirm against the
# notebook this script was derived from.
vectorizer = CountVectorizer(
    max_features=10000,
)

# let's remove all neutral tweets from x_train so that we can train the SVM properly
non_neutral = x_train[x_train['sentiment'] != 'neutral']

# fit the vectorizer to the non_neutral training data, then encode the
# positive and negative subsets with the same vocabulary
train_vectors = vectorizer.fit_transform(non_neutral['text'])
x_pos = vectorizer.transform(positive_tweets['text'])
x_neg = vectorizer.transform(negative_tweets['text'])

# Re-weight the raw counts with TF-IDF.
tfidf_transformer = TfidfTransformer(smooth_idf=True, use_idf=True)
tfidf_vectors = tfidf_transformer.fit_transform(train_vectors)

# Linear-kernel SVM. C is the penalty coefficient that balances the margin
# against misclassified samples (1.0 is the library default); the kernel is
# set explicitly because the library default is "rbf".
supportVector = svm.SVC(C=1.0, kernel='linear')
# Train on the TF-IDF vectors with the tweet sentiment as the target.
supportVector.fit(tfidf_vectors, non_neutral['sentiment'])
# Map every vocabulary term to its weight in the linear SVM.
# get_feature_names() was removed from newer scikit-learn releases in favour
# of get_feature_names_out(); use whichever this installation provides.
if hasattr(vectorizer, 'get_feature_names_out'):
    features = list(vectorizer.get_feature_names_out())
else:
    features = vectorizer.get_feature_names()

# Only the first row of coef_ is used. When the model was trained on sparse
# input coef_ may itself be sparse, so densify it once instead of indexing
# it element by element.
_coef = supportVector.coef_
if hasattr(_coef, 'toarray'):
    _coef = _coef.toarray()
weights_dict = dict(zip(features, np.asarray(_coef)[0]))

# weights_list is weights_dict as a list of (word, weight) pairs;
# weights_sorted orders that list from the highest weight to the lowest.
weights_list = list(weights_dict.items())
weights_sorted = sorted(weights_list, key=lambda x: x[1], reverse=True)
# get the top 50 words and plot them
top_50_words = weights_sorted[:50]
# Build the table with named columns (the defaults would be 0 and 1).
weight_top_df = pd.DataFrame(top_50_words, columns=["Word", "Weight"])
# NOTE(review): only the tail of this call survived in the original file
# ("'figure.figsize': (13, 8)})"); sns.set(rc={...}) is the reconstruction.
sns.set(rc={'figure.figsize': (13, 8)})
# Start a fresh figure so the bars are not drawn on a previous plot's axes.
plt.figure()
g = sns.barplot(x="Word", y="Weight", data=weight_top_df)
g.set_xticklabels(g.get_xticklabels(), rotation=60)
# get the bottom 50 words and plot them
# A negative slice start also behaves correctly when fewer than 50 terms
# exist (len(...) - 50 would go negative and silently drop items).
bot_50_words = weights_sorted[-50:]
weight_bot_df = pd.DataFrame(bot_50_words, columns=["Word", "Weight"])
# NOTE(review): only the tail of this call survived in the original file
# ("'figure.figsize': (13, 8)})"); sns.set(rc={...}) is the reconstruction.
sns.set(rc={'figure.figsize': (13, 8)})
# Start a fresh figure so the bars are not drawn on a previous plot's axes.
plt.figure()
g = sns.barplot(x="Word", y="Weight", data=weight_bot_df)
g.set_xticklabels(g.get_xticklabels(), rotation=60)
# Author's note (unverified): it looks like positive words have a negative
# weight and negative words have a positive weight?
# so we should calculate selected_text based on sentiment
# inv_weights_dict holds every weight of weights_dict with its sign flipped.
inv_weights_dict = {word: -weight for word, weight in weights_dict.items()}
# x: one row of the DataFrame (a Series, so values can be looked up by column name); tol: 0.0015; a: 5
def calc_selected_text(x, tol, a):
    """Pick the span of a tweet that best supports its sentiment.

    Neutral tweets are returned unchanged. For the other sentiments every
    contiguous run of words is scored with the module-level word weights
    (``weights_dict`` for positive tweets, ``inv_weights_dict`` for negative
    ones) and the best-scoring run is returned.

    Args:
        x: Mapping or row with a 'text' and a 'sentiment' entry.
        tol: Margin by which a span must beat the current best score
            before it replaces the current selection.
        a: Strength of the length penalty subtracted from each known word.

    Returns:
        The selected words joined by single spaces. If no span scores
        above ``tol``, all words of the tweet are returned.

    Raises:
        ValueError: If the sentiment is not 'neutral', 'positive' or
            'negative'.
    """
    tweet = x['text']
    sentiment = x['sentiment']

    if sentiment == 'neutral':
        return tweet
    if sentiment == 'positive':
        weights = weights_dict
    elif sentiment == 'negative':
        # Sign-flipped weights, so that a higher score is always better.
        weights = inv_weights_dict
    else:
        raise ValueError("unexpected sentiment: {!r}".format(sentiment))

    words = tweet.split()
    n_words = len(words)

    # Score every word once. Punctuation is stripped before the lookup, and
    # words without a weight contribute nothing (None).
    strip_punct = str.maketrans('', '', string.punctuation)
    gains = []
    for word in words:
        token = word.translate(strip_punct)
        if token in weights:
            # The selected strings used to be far longer than they should
            # be, so each known word pays a cost: its character count
            # divided by the number of words in the tweet, scaled by a.
            gains.append(weights[token] - a * (len(token) / n_words))
        else:
            gains.append(None)

    # Visit spans from shortest to longest and, within one length, from
    # left to right. span_sums[start] is the running score of the span that
    # begins at `start`; extending it by one word per pass gives the same
    # sums as re-adding every span from scratch.
    best_score = 0
    best_span = None
    span_sums = [0] * n_words
    for length in range(1, n_words + 1):
        for start in range(n_words - length + 1):
            gain = gains[start + length - 1]
            if gain is not None:
                span_sums[start] += gain
            if span_sums[start] > best_score + tol:
                best_score = span_sums[start]
                best_span = (start, start + length)

    if best_span is None:
        # Nothing beat the threshold: fall back to the whole tweet.
        return ' '.join(words)
    return ' '.join(words[best_span[0]:best_span[1]])
# from https://www.kaggle.com/rajaram1988/ignored-stop-words-using-only-word-counts
def jaccard(str1, str2):
    """Return the Jaccard similarity of the word sets of two strings.

    Both strings are lower-cased and split on whitespace; the score is the
    number of shared words divided by the number of distinct words overall.

    Args:
        str1: First string.
        str2: Second string.

    Returns:
        A float between 0.0 and 1.0. Two strings without any words count
        as identical and score 1.0 (this case used to divide by zero).
    """
    a = set(str1.lower().split())
    b = set(str2.lower().split())
    union_size = len(a | b)
    if union_size == 0:
        return 1.0
    return float(len(a & b)) / union_size
# Turn off pandas' chained-assignment warning so it does not clutter the console.
pd.set_option('mode.chained_assignment', None)

# Parameters for the text selector: the tolerance and the length-penalty factor.
tol, a = 0.0015, 5
print("tol = {}".format(tol))
print("a = {}".format(a))
# make predictions on training set
# One selection per row, stored in a new 'prediction' column. Building the
# column in a single pass avoids scanning the whole frame for a matching
# textID on every row.
x_train['prediction'] = [
    calc_selected_text(row, tol, a) for _, row in x_train.iterrows()
]
# Score every prediction against the labelled 'selected_text'.
x_train['jaccard'] = [
    jaccard(truth, guess)
    for truth, guess in zip(x_train['selected_text'], x_train['prediction'])
]
print('Jaccard for training set = ', np.mean(x_train['jaccard']))
# make predictions on validation set
# One selection per row, stored in a new 'prediction' column. Building the
# column in a single pass avoids scanning the whole frame for a matching
# textID on every row.
x_val['prediction'] = [
    calc_selected_text(row, tol, a) for _, row in x_val.iterrows()
]
# Score every prediction against the labelled 'selected_text'.
x_val['jaccard'] = [
    jaccard(truth, guess)
    for truth, guess in zip(x_val['selected_text'], x_val['prediction'])
]
print('Jaccard for validation set = ', np.mean(x_val['jaccard']))
# make final submission
# Predict a selection for every test tweet in a single pass.
test_data['prediction'] = [
    calc_selected_text(row, tol, a) for _, row in test_data.iterrows()
]
# Look the predictions up by textID; if an id occurs more than once the
# last prediction wins. Submission rows without a matching test tweet keep
# the value they already had.
_predicted_by_id = dict(zip(test_data['textID'], test_data['prediction']))
_mapped = sample['textID'].map(_predicted_by_id)
sample['selected_text'] = _mapped.where(_mapped.notna(), sample['selected_text'])
sample.to_csv('submission.csv', index=False)