参考:pandas找出重复行后取均值并合并
import pandas as pd
import numpy as np
import matplotlib as mpl
%matplotlib inline
from ggplot import *
theme_bw()
ggplot(yhat 版)比较麻烦:它内部调用了新版 pandas 已经移除的旧接口,例如 DataFrame.sort(现在应为 sort_values),另外日期处理部分(似乎是 Timestamp 的旧导入路径)也会报错。要解决只能手动修改 ggplot 安装目录下的 .py 文件。
import numpy as np
import pymongo,pandas as pd
from bson import ObjectId
import matplotlib as mb
import matplotlib.pyplot as plt
from matplotlib.backends.backend_pdf import PdfPages
import plotnine as p9
from textblob import TextBlob
from textblob.sentiments import NaiveBayesAnalyzer
from dateutil import parser
from ggplot import *
from pandas.plotting import register_matplotlib_converters
register_matplotlib_converters()
# Load the raw review table from the Excel file; `df` is just a second name for it.
pcfr = pd.read_excel('hair_dryer.xlsx')
df = pcfr
# To analyse a single product, replace the title after '=='.
m = df[df['product_title']=='remington ac2015 t|studio salon collection pearl ceramic hair dryer, deep purple']
# NOTE(review): this rebinding discards the per-product filter on the previous line,
# so the whole table is what gets analysed. Remove this line to analyse one product.
m = df
中间执行 df['review_date'] = pd.to_datetime(df['review_date']) 的时候会有难以捉摸的报错:对整表和对按品牌筛选出的子表,需要用不同的写法。(子表是原表的切片,直接对它的列赋值很可能触发 SettingWithCopyWarning;先 .copy() 再赋值通常可以避免。)
def s_c_f(df):
    """Index the reviews by date and add TextBlob sentiment columns.

    The input frame must have a ``review_date`` column that
    ``pd.to_datetime`` can parse and a ``review_body`` column holding the
    review text.

    The caller's frame is left untouched: all work happens on a copy. This
    also means the same code path works whether ``df`` is the full table or
    a filtered slice of it (assigning into a slice is what pandas warns
    about with SettingWithCopyWarning).

    No rows are dropped here. Duplicated rows and missing review bodies are
    kept; a body that TextBlob cannot process (for example a NaN) gets
    ``None`` for both scores.

    Returns:
        A new DataFrame indexed by ``review_date`` (datetime) with two extra
        columns, ``polarity`` and ``subjectivity``.
    """
    # Copy first so the column assignment below never writes into the
    # caller's frame (or into a view of a larger table).
    df = df.copy()

    # Convert the date column and make it the index.
    df['review_date'] = pd.to_datetime(df['review_date'])
    df = df.set_index('review_date')

    def sentiment_calc(text):
        """Return (polarity, subjectivity) for *text*, or (None, None)."""
        try:
            sentiment = TextBlob(text).sentiment
        except Exception:
            # Best effort: non-text cells must not abort the whole run, but
            # KeyboardInterrupt / SystemExit are no longer swallowed.
            return None, None
        return sentiment.polarity, sentiment.subjectivity

    # Analyse each review once and reuse the result for both columns.
    scores = [sentiment_calc(text) for text in df['review_body']]
    df['polarity'] = [polarity for polarity, _ in scores]
    df['subjectivity'] = [subjectivity for _, subjectivity in scores]
    return df
只画polarity
def drawfig_polarity(result):
    """Build a ggplot chart of polarity over review date.

    ``result`` is flattened with ``reset_index()`` so that ``review_date``
    and ``polarity`` are both available as ordinary columns for the
    aesthetic mapping. ``reset_index()`` returns a new frame rather than
    modifying ``result``, which is why its return value is captured.

    The chart layers, in order, are: points, a blue line, and a smoother
    with ``span=0.1``. The assembled plot object is returned, not drawn.
    """
    # With the date as the index (as s_c_f sets it up), a yearly mean would be:
    # m = result.groupby(result.index.year).mean()
    flat = result.reset_index()

    chart = ggplot(aes(x='review_date', y='polarity'), data=flat)
    chart = chart + geom_point()
    chart = chart + geom_line(color='blue')
    chart = chart + stat_smooth(span=0.1)
    return chart
三个都画上去
def drawfig_star(df):
    """Plot z-scored subjectivity, polarity and star rating on one chart.

    ``df`` must contain the columns ``subjectivity``, ``polarity`` and
    ``star_rating``; its index is used as the x axis. Each of the three
    columns is standardised to ``(value - mean) / std`` so they share a
    scale. The frame is then melted to long form, with every column other
    than ``x`` becoming a ``variable``/``value`` pair, and drawn as points,
    a smoother (``span=0.1``) and a line, coloured by variable.

    The caller's frame is not modified: normalisation and the helper ``x``
    column are applied to a copy.

    Returns:
        The ggplot object, after ``make()`` has been called on it.
    """
    def normalize(data):
        """Standardise a Series to zero mean and unit standard deviation."""
        return (data - data.mean()) / data.std()

    # Copy so the caller's values are not overwritten with z-scores and the
    # caller's frame does not gain an 'x' column.
    df = df.copy()

    # Put the three series on a common scale.
    for column in ('subjectivity', 'polarity', 'star_rating'):
        df[column] = normalize(df[column])

    # Expose the index as a column so it can serve as melt's id variable.
    df['x'] = df.index
    long_df = pd.melt(df, id_vars='x')

    plot = ggplot(aes(x='x', y='value', color='variable'), long_df) + \
        geom_point() + stat_smooth(span=0.1) + geom_line()
    plot.make()
    # Optional sizing, kept from the original for reference:
    # plot.fig.set_size_inches(30, 5, forward=True)
    # plot.fig.set_dpi(100)
    return plot
# Run the cleaning/sentiment step: the result is indexed by review date and
# carries the added 'polarity' and 'subjectivity' columns.
result = s_c_f(m)
把没必要的去掉。队友洗完的数据里有precentage,persentage,percentage,最开始报错搞好久。
# s_c_f makes the date the index (via set_index), so averages can be taken by grouping on the index.
# d = result.groupby(result.index.day).mean()  # author's note: the mean left out the non-numeric columns
#d
# Drop the columns that are not needed for the averages.
# Note the dataset spells the column 'help_precentage'.
resultdf = result.drop(['review_body','product_title','helpful_votes','total_votes','help_precentage','vine','verified_purchase','review_headline'],axis=1)# drop the listed columns by name; axis defaults to 0 (rows), axis=1 means columns
# Bare expression: displays the frame when run as a notebook cell.
resultdf
我们有好多数据,每天的条数不一样,我想每天只有一条,怎么取平均去重再放回去我想了好久,其实方法好简单。
# Earlier attempts, kept for reference:
# raw_df=resultdf.groupby(resultdf.index.day).mean()
# raw_df = resultdf.groupby([resultdf.index.year,resultdf.index.month]).mean()
# Average all rows that share the same (year, month) of the date index.
# The index parts are renamed so the group levels are called 'year' and 'month'.
raw_df = resultdf.groupby([resultdf.index.year.rename('year'), resultdf.index.month.rename('month')]).mean()
这时候我们已经完成了重复日期取平均放回去的去重。但是怎么处理前面的year,month是另一个问题。现在长这个样子。
先把它们放回去。注意 set_index() 和 reset_index() 默认都不会原地修改 DataFrame,而是返回一个新的 DataFrame,所以要把返回值赋给变量(或者传 inplace=True)。
同时新建一列day。
# Turn the 'year'/'month' group levels back into ordinary columns.
# reset_index returns a new frame, hence the reassignment.
raw_df = raw_df.reset_index()
# Add a constant day-of-month column so each row has year, month and day.
raw_df['day']=1
一个非常好用的函数:pd.to_datetime 可以把含有 year、month、day 列的 DataFrame 直接组装成 datetime 格式的日期。但它至少需要这三列,所以我们前面才会新建一个 day 列。
from datetime import datetime
# Row-wise alternative, kept for reference:
# df['review_date'] = df.apply(lambda row: datetime(row['year'], row['month']), axis=1)
# Assemble a datetime column from the 'year', 'month' and 'day' columns.
raw_df['review_date'] = pd.to_datetime(raw_df[['year','month','day']])
# Sort in place by the current index, in descending order.
# NOTE(review): this runs before review_date becomes the index, so it sorts on
# the index left by reset_index, not on the date; confirm this is intended.
raw_df.sort_index(ascending=False,inplace=True)
把之前没用的扔掉。
# The helper columns are no longer needed once review_date exists.
raw_df = raw_df.drop(['year','month','day'], axis=1)
# Make the assembled date the index again (set_index returns a new frame).
raw_df = raw_df.set_index('review_date')
# Bare expression: displays the frame when run as a notebook cell.
raw_df
# Polarity-only chart of the monthly averages, saved under the name 'new_plot'.
plot1 = drawfig_polarity(raw_df)
plot1.save(filename = 'new_plot')
# The date was set as the index above; drawfig_star uses the index as its x axis.
plot2 = drawfig_star(raw_df)
# Saving the second chart is currently disabled:
#ggsave(filename="new_plot2.jpg", width=20, height=4, units='in', plot=plot2)
#plot2.save(filename='new_plot2')