# Word cloud (词云):
import jieba
from imageio import imread
from numpy import unicode
from wordcloud import WordCloud,ImageColorGenerator
import matplotlib.pyplot as plt
# Word-cloud setup: configure jieba, build the WordCloud renderer and load
# the source text that will be segmented and drawn.

# NOTE(review): stoplist.txt is registered here as a jieba *user dictionary*
# and is read again as the stop-word list inside stop_words() -- confirm that
# using the same file for both purposes is intended.
jieba.load_userdict("stoplist.txt")

# Image used both as the cloud's mask and, later, as its colour source.
back_color = imread('girl.jpg')

wc = WordCloud(
    background_color='white',   # canvas colour
    max_words=100,              # upper bound on the number of words drawn
    mask=back_color,            # shape to fill; width/height are ignored when a mask is set
    max_font_size=100,          # largest font size used
    # Font with CJK glyphs, so Chinese text does not render as empty boxes.
    font_path="D:\\pythonProject2\\simhei.ttf",
    random_state=42,            # fixed seed for the random layout/colour choices
)

# Read the corpus; the context manager closes the handle that the previous
# bare open(...).read() left open.
# NOTE(review): no encoding is given, so the platform default is used --
# confirm it matches the file (the other text files here are read as UTF-8).
with open('data_m_content.txt') as source_file:
    text = source_file.read()
def stop_words(texts):
    """Segment *texts* with jieba and remove stop words.

    The text is tokenised with ``jieba.cut_for_search``. Tokens whose
    stripped form is listed in ``stoplist.txt`` (read as UTF-8), and tokens
    that are only whitespace, are discarded.

    Args:
        texts: The raw text to segment.

    Returns:
        The remaining tokens joined by single spaces.

    Raises:
        OSError: If ``stoplist.txt`` cannot be opened.
    """
    with open('stoplist.txt', encoding='utf-8') as f:
        # Whitespace-separated entries become individual stop words. A set
        # gives exact-token matching; the earlier substring test against the
        # whole file text also discarded any token that merely appeared
        # inside a longer stop word.
        stoplist = set(f.read().split())
    # Whitespace-only tokens strip to '' and are dropped as well.
    stoplist.add('')

    kept = [
        word
        for word in jieba.cut_for_search(texts)
        if word.strip() not in stoplist
    ]
    return ' '.join(kept)
# Colour function that samples its colours from the mask image.
image_colors = ImageColorGenerator(back_color)

# Strip stop words from the corpus, then lay the cloud out.
text = stop_words(text)
wc.generate(text)

# First figure: the cloud as generated.
plt.imshow(wc)
plt.axis('off')

# Second figure: the same cloud recoloured with the image-based colours.
plt.figure()
plt.imshow(wc.recolor(color_func=image_colors))
plt.axis('off')

# Write the cloud to disk.
wc.to_file('data_m.png')
import matplotlib
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
# Hourly-count script setup: load the CSV, normalise its 'times' column,
# then read the per-hour count lines that count() aggregates.

path = "D:\\pythonProject2\\all.csv"
df = pd.read_csv(path)
# dropna() returns a new frame; the result was previously discarded, so no
# rows were actually removed.
df = df.dropna()

# Use SimHei so Chinese labels render, and keep the minus sign from being
# drawn as an empty box.
matplotlib.rcParams['font.sans-serif'] = ['SimHei']
matplotlib.rcParams['font.family'] = 'sans-serif'
matplotlib.rcParams['axes.unicode_minus'] = False

# Reduce each timestamp to characters 9-10 with any ':' stripped.
# NOTE(review): presumably this is the hour field -- confirm against the
# actual format of the 'times' column.
data1 = []
for stamp in df['times']:
    hour_part = stamp[9:11]
    if hour_part == ' ':
        data1.append('blank')
    else:
        data1.append(hour_part.strip(":"))
df['times'] = data1
# NOTE(review): the return value is unused, so this has no visible effect
# outside an interactive session.
df.value_counts()

# Backslashes are escaped explicitly; the earlier literal relied on '\p' and
# '\c' being passed through as invalid escape sequences.
path1 = "D:\\pythonProject2\\content_hour.txt"
with open(path1, encoding='utf-8') as f:
    lines = f.readlines()

for i, line in enumerate(lines):
    lines[i] = line.strip(' ')
    if len(lines[i]) < 3:
        # Mark the short line itself so count() skips it (it ignores lines
        # starting with 'no'). Previously a short line at any position
        # overwrote the fixed index 32 instead.
        lines[i] = 'no data'

# Hour labels '0'..'23'; the keys are later used as the plot's index.
counts = {str(hour): 0 for hour in range(24)}
data3 = []
flag = 0
def count(flag, source_lines=None):
    """Sum the values of all lines whose leading field equals *flag*.

    For each line, the first two characters are evaluated as the key and
    the remainder as the value. Lines beginning with ``'no'`` are skipped.

    Args:
        flag: The key to match (the callers in this file pass an hour, 0-23).
        source_lines: Lines to scan. Defaults to the module-level ``lines``
            list, which keeps existing ``count(hour)`` calls working.

    Returns:
        The sum of the values on matching lines, or 0 if none match.
    """
    if source_lines is None:
        source_lines = lines

    total = 0
    for line in source_lines:
        if line[0:2] == 'no':
            continue
        # NOTE(review): eval() executes whatever the file contains. If the
        # fields are always plain numbers, switch to int()/float() parsing.
        if eval(line[0:2]) == flag:
            total += eval(line[2:])
    return total
#eval(lines[i][2:])
#print(lines)
#print(count(3))
# Collect the total for every hour 0..23, in order.
data3.extend(count(hour) for hour in range(24))
print(data3)

# One row per hour label, with the totals as the only column.
df1 = pd.DataFrame(index=counts, columns=['counts'])
df1['counts'] = data3

plt.plot(df1)
# Write each total just above its point on the line.
for label, total in zip(df1.index, df1['counts']):
    plt.text(label, total, total, ha='center', va='bottom', fontsize=8)

plt.xlabel("时间(小时)")
plt.ylabel("评论(数量)")
plt.show()
import matplotlib
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from numpy import unicode
import jieba
# City bar chart: plot the ten most frequent values of the 'citys' column.

pd.set_option('display.max_rows', None)
path = "D:\\pythonProject2\\all.csv"
df = pd.read_csv(path)
# dropna() returns a new frame and its result was previously discarded.
# Only 'citys' is used below, so only rows missing that column are dropped.
df = df.dropna(subset=['citys'])

# Use SimHei so Chinese labels render, and keep the minus sign from being
# drawn as an empty box.
matplotlib.rcParams['font.sans-serif'] = ['SimHei']
matplotlib.rcParams['font.family'] = 'sans-serif'
matplotlib.rcParams['axes.unicode_minus'] = False

# 'blank' is the placeholder written for empty cities during cleaning; the
# rows were kept because other columns may still be useful.
# errors='ignore' avoids a KeyError when no such placeholder row exists.
df1 = df['citys'].value_counts().drop(['blank'], errors='ignore')
df2 = pd.DataFrame(df1.head(10))
df2.plot(kind='bar')

plt.xlabel("城市")
plt.ylabel("评论(数量)")
plt.xticks(rotation=45)
plt.show()
import matplotlib
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import seaborn as sns
import pandas as pd
from bokeh.plotting import figure, show, output_file
# Per-city score collection for the box plot.
# Figures noted next to the city names (presumably row counts from an
# earlier value_counts() run -- confirm): Beijing 828, Shanghai 495,
# Guangzhou (Guangdong) 144, Hangzhou (Zhejiang) not recorded.

# Use SimHei so Chinese labels render, and keep the minus sign from being
# drawn as an empty box.
matplotlib.rcParams['font.sans-serif'] = ['SimHei']
matplotlib.rcParams['font.family'] = 'sans-serif'
matplotlib.rcParams['axes.unicode_minus'] = False

df = pd.read_csv('all.csv')

# Gather each selected city's scores, divided by 50. Rows for any other
# city are ignored.
city_scores = {'北京': [], '上海': [], '广东广州': [], '浙江杭州': []}
for city, score in zip(df['citys'], df['scores']):
    if city in city_scores:
        city_scores[city].append(float(score) / 50)

data1 = city_scores['北京']
data2 = city_scores['上海']
data3 = city_scores['广东广州']
data4 = city_scores['浙江杭州']

# An unterminated triple-quoted fragment used to start here and swallowed
# the rest of the file as a string literal; it has been removed.
print(data3)

# One single-column frame per city, because the lists differ in length.
df1 = pd.DataFrame({'北京': data1})
df2 = pd.DataFrame({'上海': data2})
df3 = pd.DataFrame({'广东广州': data3})
df4 = pd.DataFrame({'浙江杭州': data4})
print(df3)

# Create the figure that the box plot is drawn on.
plt.figure(figsize=(10, 4))
def draw(df1):
    """Draw a styled box plot of *df1* and show it.

    Args:
        df1: Frame whose columns are plotted via ``df1.boxplot``.
    """
    artists = df1.boxplot(return_type='dict')
    plt.title('城市')

    # Artist group -> properties applied to every artist in that group.
    styles = (
        ('boxes', {'color': 'b', 'linewidth': 1, 'alpha': 0.5}),
        ('whiskers', {'color': 'k', 'linewidth': 0.5, 'linestyle': '-'}),
        ('caps', {'color': 'gray', 'linewidth': 2}),
        ('medians', {'color': 'DarkBlue', 'linewidth': 2}),
        ('fliers', {'marker': 'o', 'color': 'y', 'alpha': 0.5}),
    )
    for group, props in styles:
        for artist in artists[group]:
            artist.set(**props)

    plt.show()
# Render the box plot for df3.
draw(df3)
# Keys of the dict returned by boxplot(return_type='dict'):
# boxes: the box outlines
# medians: the horizontal line at the median
# whiskers: the vertical lines running from the box to the error bar
# fliers: the outliers
# caps: the horizontal lines at the error-bar ends
# means: the horizontal line at the mean