刚开始学 Python,选了这个题目,把代码放上来留念。没有用到流行的爬虫框架,所以代码量比较大。
GUI 部分用 wxPython 编写。
# _*_ coding: UTF-8 _*_
import os
import re
import requests
import sys
import wx
import traceback
from datetime import datetime
from datetime import timedelta
from lxml import etree
import data_analysis
# Path of the text dump produced by the crawl. Wb.get_cookie assigns it;
# Wb.write_txt and the chart handlers read it.
file_path = ''
class Wb(wx.App):
    """wxPython front end plus weibo.cn crawler for a single account.

    Flow: ``Operate`` builds the input window; its start button calls
    ``get_cookie``, which crawls the account, writes the text dump and then
    walks the user through the profile, latest-post, per-post and chart
    windows.
    """

    # Seconds to wait for a weibo.cn response before giving up.
    REQUEST_TIMEOUT = 30

    def Operate(self):
        """Initialise the crawl state and show the start window."""
        self.cookie = {}
        self.username = ''      # display name of the crawled account
        self.Number = 0         # total number of posts reported by the profile
        self.number1 = 0        # number of posts actually crawled
        self.Guanzhu = 0        # following count
        self.fans = 0           # follower count
        self.Content = []       # post texts
        self.Time = []          # publish time of each post
        self.star = []          # like count of each post
        self.Zhuanfa = []       # repost count of each post
        self.Pinglun = []       # comment count of each post
        self.publish_tool = []  # client each post was published from
        self.Id = 0
        self.a = 0              # index of the post shown by weibo_info

        # ---------------------------- GUI ----------------------------
        # Start window and its panel.
        self.frame_operate = wx.Frame(
            None, title="Weibo_Spider_GUI", size=(500, 500))
        self.panel_operate = wx.Panel(self.frame_operate, -1)
        # Large title font, reused by the profile window.
        self.font1 = wx.Font(18, wx.ROMAN, wx.ITALIC, wx.NORMAL)
        self.label1 = wx.StaticText(
            self.panel_operate, -1, "WeiBo Spider", pos=(180, 60),
            style=wx.ALIGN_CENTER)
        self.label1.SetFont(self.font1)
        # Cookie label and input.
        self.label2 = wx.StaticText(
            self.panel_operate, -1, "请输入您微博登陆的有效cookie",
            pos=(160, 130), style=wx.ALIGN_CENTER)
        self.textCookie = wx.TextCtrl(
            self.panel_operate, -1, pos=(200, 150), size=(80, 20),
            style=wx.TE_CENTER)
        # Id of the account to crawl.
        self.label3 = wx.StaticText(
            self.panel_operate, -1, "请输入您所要爬取微博账号的self.Id",
            pos=(160, 180), style=wx.ALIGN_CENTER)
        self.textId = wx.TextCtrl(
            self.panel_operate, -1, pos=(200, 200), size=(80, 20),
            style=wx.TE_CENTER)
        # Directory the dump file is written to.
        self.label4 = wx.StaticText(
            self.panel_operate, -1, "数据文件保存路径", pos=(160, 230),
            style=wx.ALIGN_CENTER)
        self.textFile_path = wx.TextCtrl(
            self.panel_operate, -1, pos=(200, 250), size=(80, 20),
            style=wx.TE_CENTER)
        # Without a bound button and Show() the window never appears and
        # get_cookie can never run.
        self.button_start = wx.Button(
            self.panel_operate, -1, "开始爬取", pos=(200, 300))
        self.button_start.Bind(wx.EVT_BUTTON, self.get_cookie)
        self.frame_operate.Show()

    def get_cookie(self, event):
        """Read the three inputs, derive the dump path and start the crawl."""
        global file_path
        self.cookie = {"Cookie": self.textCookie.GetValue()}
        try:
            self.Id = int(self.textId.GetValue())
        except ValueError:
            wx.MessageBox("用户Id必须是数字")
            return
        file_path = os.path.join(
            self.textFile_path.GetValue(), "%d.txt" % self.Id)
        self.Onbutton_Start()

    def Onbutton_Start(self):
        """Crawl the account, write the dump and open the result windows."""
        self._reset_results()
        try:
            self.GetName()           # display name
            self.GetSimple_Info()    # post / following / follower counts
            self.weibo_para()        # every post with its statistics
        except (requests.RequestException, etree.LxmlError,
                IndexError, ValueError):
            # An expired cookie or wrong id yields pages without the expected
            # nodes, which surfaces as IndexError from the xpath lookups.
            traceback.print_exc()
            wx.MessageBox("爬取失败,请检查cookie和用户Id是否有效")
            return
        self.write_txt()
        self.weibo_UI1()

    def _reset_results(self):
        """Drop data from a previous (possibly failed) crawl attempt."""
        self.Content = []
        self.Time = []
        self.star = []
        self.Zhuanfa = []
        self.Pinglun = []
        self.publish_tool = []
        self.number1 = 0
        self.a = 0

    def weibo_UI1(self):
        """Tell the user the crawl finished, then show the profile window."""
        message = "文件爬取完毕"
        wx.MessageBox(message)
        self.weibo_UI2()

    def weibo_UI2(self):
        """Replace the start window with the account summary window."""
        self.frame_operate.Destroy()
        self.frame_Info = wx.Frame(
            None, title="User_Information", size=(500, 500))
        self.panel_Info = wx.Panel(self.frame_Info, -1)
        t1 = "用户昵称:" + str(self.username)
        t2 = "微博数:" + str(self.Number)
        t3 = "粉丝数:" + str(self.fans)
        t4 = "关注数:" + str(self.Guanzhu)
        self.label16 = wx.StaticText(
            self.panel_Info, -1, self.username, pos=(200, 100),
            style=wx.ALIGN_LEFT)
        self.label5 = wx.StaticText(
            self.panel_Info, -1, t1, pos=(180, 130), style=wx.ALIGN_LEFT)
        self.label13 = wx.StaticText(
            self.panel_Info, -1, t2, pos=(180, 150), style=wx.ALIGN_LEFT)
        self.label14 = wx.StaticText(
            self.panel_Info, -1, t3, pos=(180, 170), style=wx.ALIGN_LEFT)
        self.label15 = wx.StaticText(
            self.panel_Info, -1, t4, pos=(180, 190), style=wx.ALIGN_LEFT)
        # font2 is the small font, font1 the large one.
        self.font2 = wx.Font(13, wx.SCRIPT, wx.ITALIC, wx.NORMAL)
        self.label16.SetFont(self.font1)
        for label in (self.label5, self.label13, self.label14, self.label15):
            label.SetFont(self.font2)
        self.button_news = wx.Button(
            self.panel_Info, -1, "查看最近微博", pos=(220, 280))
        self.button_news.Bind(wx.EVT_BUTTON, self.weibo_UI3)
        self.frame_Info.Show()

    # Latest post window.
    def weibo_UI3(self, event):
        """Show the newest (or pinned) post and the navigation buttons."""
        self.frame_Info.Destroy()
        self.frame_news = wx.Frame(None, title="---", size=(500, 500))
        self.panel_news = wx.Panel(self.frame_news, -1)
        label18 = wx.StaticText(
            self.panel_news, -1, "最新微博动态", pos=(200, 40))
        if self.Content:
            text1 = "最新/置顶 微博为: " + self.Content[0]
            text2 = "最新/置顶 微博发布工具: " + self.publish_tool[0]
            text3 = "最新/置顶 微博发布时间: " + self.Time[0]
            text4 = "最新/置顶 微博获得赞数: " + str(self.star[0])
            text5 = "最新/置顶 微博获得转发数: " + str(self.Zhuanfa[0])
            text6 = "最新/置顶 微博获得评论数: " + str(self.Pinglun[0])
            self.label6 = wx.TextCtrl(
                self.panel_news, -1, text1, pos=(90, 60), size=(250, 140),
                style=wx.TE_MULTILINE | wx.TE_RICH)
            self.label7 = wx.StaticText(
                self.panel_news, -1, text2, pos=(90, 200), style=wx.ALIGN_LEFT)
            self.label8 = wx.StaticText(
                self.panel_news, -1, text3, pos=(90, 220), style=wx.ALIGN_LEFT)
            self.label9 = wx.StaticText(
                self.panel_news, -1, text4, pos=(90, 240), style=wx.ALIGN_LEFT)
            self.label10 = wx.StaticText(
                self.panel_news, -1, text5, pos=(90, 260), style=wx.ALIGN_LEFT)
            self.label11 = wx.StaticText(
                self.panel_news, -1, text6, pos=(90, 280), style=wx.ALIGN_LEFT)
        else:
            # Nothing was crawled; indexing [0] would raise.
            self.label6 = wx.StaticText(
                self.panel_news, -1, "暂无微博内容", pos=(90, 60))
        # Browse the posts one by one.
        self.Button_info = wx.Button(
            self.panel_news, -1, "点击查看之前的微博内容", pos=(220, 340))
        self.Button_info.Bind(wx.EVT_BUTTON, self.weibo_pre_info)
        # Open the chart menu.
        self.Button_file = wx.Button(
            self.panel_news, -1, "点击查看微博数据分析图表", pos=(220, 380))
        self.Button_file.Bind(wx.EVT_BUTTON, self.analysis_UI)
        self.frame_news.Show()

    def analysis_UI(self, event):
        """Show the window with one button per chart."""
        self.frame_data = wx.Frame(
            None, title="data_analysis--20177830115", size=(500, 500))
        self.panel_data = wx.Panel(self.frame_data, -1)
        text1 = "2017-2018微博转发/点赞量折线统计图"
        text2 = '原创微博与转发微博统计图'
        text3 = '微博发布工具统计图'
        text4 = '微博使用心情统计图'
        self.button_1 = wx.Button(self.panel_data, -1, text1, pos=(180, 120))
        self.button_2 = wx.Button(self.panel_data, -1, text2, pos=(180, 160))
        self.button_3 = wx.Button(self.panel_data, -1, text3, pos=(180, 200))
        self.button_4 = wx.Button(self.panel_data, -1, text4, pos=(180, 240))
        self.button_1.Bind(wx.EVT_BUTTON, self.figure_1)
        self.button_2.Bind(wx.EVT_BUTTON, self.figure_2)
        self.button_3.Bind(wx.EVT_BUTTON, self.figure_3)
        self.button_4.Bind(wx.EVT_BUTTON, self.figure_4)
        self.frame_data.Show()

    def _make_analysis(self):
        """Build a data_analysis.analysis over the current dump file."""
        return data_analysis.analysis(file_path, self.Number)

    def figure_1(self, event):
        """Plot the repost/like line chart."""
        self._make_analysis().analyse_Zhexian()

    def figure_2(self, event):
        """Plot the original-vs-repost pie chart."""
        self._make_analysis().analyse_YC()

    def figure_3(self, event):
        """Plot the publishing-tool pie chart."""
        self._make_analysis().analyse_GJ()

    def figure_4(self, event):
        """Plot the emoticon pie chart."""
        self._make_analysis().analyse_XQ()

    def weibo_pre_info(self, event):
        """Button adapter for weibo_info.

        weibo_info is re-entered from ``cont`` without an event object, so the
        button event is absorbed here.
        """
        self.weibo_info()

    def weibo_info(self):
        """Open a window showing post number ``self.a``."""
        if not 0 <= self.a < len(self.Content):
            wx.MessageBox("没有更多微博了")
            return
        idx = self.a
        self.s = wx.Frame(None, title="---", size=(500, 500))
        self.f = wx.Panel(self.s, -1)
        text1 = str(idx + 1) + ":" + self.Content[idx]
        text2 = "发布工具: " + self.publish_tool[idx]
        text3 = "发布时间: " + self.Time[idx]
        text4 = "点赞数: " + str(self.star[idx])
        text5 = "转发数: " + str(self.Zhuanfa[idx])
        text6 = "评论数: " + str(self.Pinglun[idx])
        self.labela = wx.TextCtrl(
            self.f, -1, text1, pos=(80, 60), size=(250, 140),
            style=wx.TE_MULTILINE | wx.TE_RICH)
        self.labelb = wx.StaticText(
            self.f, -1, text2, pos=(80, 200), style=wx.ALIGN_LEFT)
        self.labelc = wx.StaticText(
            self.f, -1, text3, pos=(80, 220), style=wx.ALIGN_LEFT)
        self.labeld = wx.StaticText(
            self.f, -1, text4, pos=(80, 240), style=wx.ALIGN_LEFT)
        self.labele = wx.StaticText(
            self.f, -1, text5, pos=(80, 260), style=wx.ALIGN_LEFT)
        self.labelf = wx.StaticText(
            self.f, -1, text6, pos=(80, 280), style=wx.ALIGN_LEFT)
        self.button_next = wx.Button(self.f, -1, "查看下一条", pos=(300, 380))
        self.button_exit = wx.Button(self.f, -1, "关闭", pos=(100, 380))
        self.button_exit.Bind(wx.EVT_BUTTON, self.exit)
        self.button_next.Bind(wx.EVT_BUTTON, self.cont)
        self.s.Show()

    def exit(self, event):
        """Close the per-post window."""
        self.s.Destroy()

    def cont(self, event):
        """Advance to the next post, if there is one."""
        if self.a + 1 >= len(self.Content):
            # Stay on the last post instead of indexing past the end.
            wx.MessageBox("没有更多微博了")
            return
        self.a += 1
        self.s.Destroy()
        self.weibo_info()

    # ---- Crawler section (adapted from an open-source GitHub project) ----

    @staticmethod
    def _console_safe(text):
        """Drop characters the console encoding cannot represent.

        Falls back to UTF-8 when there is no usable stdout, where
        ``sys.stdout.encoding`` cannot be read.
        """
        encoding = getattr(sys.stdout, "encoding", None) or "utf-8"
        return text.encode(encoding, "ignore").decode(encoding)

    def _fetch(self, url):
        """Download ``url`` with the user's cookie and return the parsed tree."""
        html = requests.get(
            url, cookies=self.cookie, timeout=self.REQUEST_TIMEOUT).content
        selector = etree.HTML(html)
        if selector is None:
            raise ValueError("empty response from %s" % url)
        return selector

    # Display name of the account.
    def GetName(self):
        """Store the account's display name in ``self.username``."""
        selector = self._fetch("https://weibo.cn/%d/info" % self.Id)
        title = selector.xpath("//title/text()")[0]
        # The page title carries a three-character suffix after the name.
        self.username = title[:-3]

    def GetSimple_Info(self):
        """Store the post, following and follower counts."""
        selector = self._fetch("https://weibo.cn/u/%d?&page=1" % self.Id)
        pattern = r"\d+\.?\d*"
        # Post count, rendered like "微博[1543]".
        wb_num = selector.xpath(
            "//div[@class='tip2']/span[@class='tc']/text()")[0]
        self.Number = int(re.findall(pattern, wb_num, re.S | re.M)[0])
        # The first two links of the tip bar are following and followers.
        links = selector.xpath("//div[@class='tip2']/a/text()")
        self.Guanzhu = int(re.findall(pattern, links[0], re.M)[0])
        self.fans = int(re.findall(pattern, links[1], re.M)[0])

    # Full text of a truncated ("long") post.
    def GetLong(self, weibo_link):
        """Return the full text found at ``weibo_link``.

        Returns an empty string when the page lacks the expected nodes so the
        caller can keep the truncated text.
        """
        blocks = self._fetch(weibo_link).xpath("//div[@class='c']")
        if len(blocks) < 2:
            return ""
        spans = blocks[1].xpath("div/span[@class='ctt']")
        if not spans:
            return ""
        raw = spans[0].xpath("string(.)").replace(u"\u200b", "")
        return self._console_safe(raw)

    # Text of a reposted entry.
    def GetZhuanfa(self, is_retweet, info, wb_content):
        """Combine repost reason, original author and original text.

        is_retweet: the ``span.cmt`` nodes of the post.
        info: the post's container node.
        wb_content: text of the reposted original.
        """
        original_user = is_retweet[0].xpath("a/text()")
        if not original_user:
            return u"转发微博已被删除"
        reason = self._console_safe(
            info.xpath("div")[-1].xpath("string(.)").replace(u"\u200b", ""))
        # Everything from the last "赞" on is the statistics footer.
        cut = reason.rfind(u"赞")
        if cut != -1:
            reason = reason[:cut]
        return (reason + "\n" + u"原始用户: " +
                original_user[0] + "\n" + u"转发内容: " + wb_content)

    @staticmethod
    def _normalise_time(raw):
        """Convert weibo.cn's relative time text to ``YYYY-mm-dd HH:MM``."""
        now = datetime.now()
        if u"刚刚" in raw:
            return now.strftime('%Y-%m-%d %H:%M')
        if u"分钟" in raw:
            elapsed = timedelta(minutes=int(raw[:raw.find(u"分钟")]))
            return (now - elapsed).strftime("%Y-%m-%d %H:%M")
        if u"今天" in raw:
            return now.strftime("%Y-%m-%d") + " " + raw[3:]
        if u"月" in raw:
            # "MM月DD日 HH:MM" carries no year; the current one is used.
            return (now.strftime("%Y") + "-" + raw[0:2] + "-" + raw[3:5] +
                    " " + raw[7:12])
        return raw[:16]

    def _collect_post(self, node):
        """Append one post's text, time, tool and counters to the result lists."""
        body = node.xpath("div/span[@class='ctt']")
        if not body:
            return
        content = self._console_safe(
            body[0].xpath("string(.)").replace(u"\u200b", ""))[:-1]
        is_retweet = node.xpath("div/span[@class='cmt']")
        links = node.xpath("div/span[@class='ctt']/a")
        link_text = links[-1].xpath("text()") if links else []
        if link_text and link_text[0] == u"全文":
            # Truncated post: the element id is the post id behind a
            # two-character prefix; the comment page holds the full text.
            node_id = node.xpath("@id")
            if node_id:
                full = self.GetLong(
                    "https://weibo.cn/comment/" + node_id[0][2:])
                if full:
                    content = full if is_retweet else full[1:]
        if is_retweet:
            content = self.GetZhuanfa(is_retweet, node, content)
        self.Content.append(content)

        # Publish time and tool share one span, separated by "来自".
        stamp = self._console_safe(
            node.xpath("div/span[@class='ct']")[0].xpath("string(.)"))
        parts = stamp.split(u'来自')
        self.Time.append(self._normalise_time(parts[0]))
        self.publish_tool.append(parts[1] if len(parts) > 1 else u"无")

        # Footer: likes, reposts, comments, in that order after the last "赞".
        footer = self._console_safe(
            node.xpath("div")[-1].xpath("string(.)"))
        footer = footer[footer.rfind(u'赞'):]
        counts = re.findall(r"\d+\.?\d*", footer, re.M)
        # Pad so a malformed footer cannot misalign the parallel lists.
        star, zhuanfa, pinglun = (
            int(value) for value in (counts + ["0", "0", "0"])[:3])
        self.star.append(star)
        self.Zhuanfa.append(zhuanfa)
        self.Pinglun.append(pinglun)
        self.number1 += 1

    # Collect every post with its publish time, likes, reposts and comments.
    def weibo_para(self):
        """Walk all timeline pages and collect every post."""
        first_page = self._fetch("https://weibo.cn/u/%d?&page=1" % self.Id)
        pager = first_page.xpath("//input[@name='mp']")
        page_num = int(pager[0].attrib["value"]) if pager else 1
        for page in range(1, page_num + 1):
            if page == 1:
                selector = first_page
            else:
                selector = self._fetch(
                    "https://weibo.cn/u/%d?&page=%d" % (self.Id, page))
            info = selector.xpath("//div[@class='c']")
            if not info or not info[0].xpath("div/span[@class='ctt']"):
                continue
            # The last two div.c nodes of a page are not posts.
            for node in info[:-2]:
                self._collect_post(node)

    def write_txt(self):
        """Write the profile summary and all posts to ``file_path`` as UTF-8.

        UTF-8 is what data_analysis uses when it reads the file back.
        Failures are printed, not raised, so the GUI keeps running.
        """
        try:
            contents_header = u"\n\n微博内容: \n"
            pieces = [u"用户信息\n用户昵称:" + self.username +
                      u"\n用户Id: " + str(self.Id) +
                      u"\n微博数: " + str(self.Number) +
                      u"\n关注数: " + str(self.Guanzhu) +
                      u"\n粉丝数: " + str(self.fans) + contents_header + '\n']
            rows = zip(self.Content, self.publish_tool, self.Time,
                       self.star, self.Zhuanfa, self.Pinglun)
            for i, (content, tool, when, star, zhuanfa, pinglun) in \
                    enumerate(rows, 1):
                pieces.append(
                    str(i) + ":" + content + "\n" +
                    u"发布工具: " + tool + "\n" +
                    u"发布时间: " + when + "\n" +
                    u"点赞数: " + str(star) +
                    u"转发数: " + str(zhuanfa) +
                    u"评论数: " + str(pinglun) + "\n\n")
            with open(file_path, "wb") as f:
                f.write("".join(pieces).encode("utf-8"))
        except Exception as e:
            print("Error: ", e)
            traceback.print_exc()
测试函数部分
def main():
    """Create the application, build its start window and run the event loop."""
    weibo = Wb()
    weibo.Operate()
    weibo.MainLoop()


if __name__ == "__main__":
    main()
数据分析模块(保存为 data_analysis.py,供上面的 GUI 程序导入)
import re
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.dates as mdate
from matplotlib import font_manager as fm
import time
from datetime import datetime
import webbrowser
class analysis(object):
    """Charts built from the text dump of a crawled account.

    file_name: path of the UTF-8 dump file.
    number: total post count of the account, used by the original-vs-repost
        chart.
    """

    def __init__(self, file_name, number):
        self.file_name = file_name
        self.number = number
        self.X_data = []    # sampled publish times
        self.Y1_data = []   # sampled like counts (thousands)
        self.Y_data = []    # sampled repost counts (thousands)
        self.str = ""

    def _read(self):
        """Return the whole dump file as text."""
        with open(self.file_name, encoding="utf-8") as f:
            return f.read()

    @staticmethod
    def _sample_indices(total, fraction=0.18):
        """Return evenly spaced indices covering about ``fraction`` of ``total``.

        At least one index is returned for any non-empty series, so small
        data sets no longer divide by zero.
        """
        if total <= 0:
            return []
        groups = max(1, int(fraction * total))
        step = total // groups
        return [i * step for i in range(groups)]

    @staticmethod
    def _frequent(items, more_than):
        """Count ``items`` and keep those seen more than ``more_than`` times."""
        from collections import Counter
        return {key: seen for key, seen in Counter(items).items()
                if seen > more_than}

    # Line chart window.
    def analyse_Zhexian(self):
        """Plot sampled repost and like counts against publish time."""
        text = self._read()
        reposts = [int(n) for n in re.findall(r'转发数: (\d+)', text)]
        likes = [int(n) for n in re.findall(r'点赞数: (\d+)', text)]
        stamps = re.findall(
            r'发布时间: (\d{4}-\d{1,2}-\d{1,2}\s\d{1,2}:\d{1,2})', text)
        # Stop at the first post from 2016; use everything when there is none.
        stop = next(
            (i for i, stamp in enumerate(stamps) if '2016' in stamp),
            len(stamps))
        usable = min(stop, len(reposts), len(likes), len(stamps))
        # Scale to thousands so the axis stays readable.
        reposts = [n / 1000 for n in reposts[:usable]]
        likes = [n / 1000 for n in likes[:usable]]
        times = [datetime.strptime(stamp, "%Y-%m-%d %H:%M")
                 for stamp in stamps[:usable]]
        # Thin the series out; plotting every post is unreadable.
        picks = self._sample_indices(usable)
        self.X_data = [times[i] for i in picks]
        self.Y_data = [reposts[i] for i in picks]
        self.Y1_data = [likes[i] for i in picks]

        fig1 = plt.figure(figsize=(8, 5))
        plt.rcParams['font.sans-serif'] = ['SimHei']
        ax1 = fig1.add_subplot(1, 1, 1)
        ax1.xaxis.set_major_formatter(mdate.DateFormatter('%Y-%m-%d %H-%M'))
        plt.xticks(self.X_data, rotation=90)
        plt.yticks(np.linspace(0, 5000, 5, endpoint=True))
        plt.title(u"2017-2018微博转发/点赞量折线图", color="black")
        plt.plot(self.X_data, self.Y_data, "o-", color='skyblue',
                 label="转发量", markersize=1.5)
        plt.plot(self.X_data, self.Y1_data, "o-", color='pink',
                 label="点赞量", markersize=1.5)
        plt.xlabel("发布时间")
        plt.ylabel("数量(千/条)")
        plt.legend()
        plt.show()

    def analyse_YC(self):
        """Pie chart of reposts versus original posts."""
        retweets = len(re.findall(r'转发理由', self._read()))
        # Clamp: a negative slice size makes pie() raise.
        originals = max(self.number - retweets, 0)
        plt.rcParams['font.sans-serif'] = ['SimHei']
        labels = ['转发微博', '原创微博']
        sizes = [retweets, originals]
        explode = (0.1, 0)
        plt.pie(sizes, explode=explode, labels=labels, autopct='%1.1f%%',
                shadow=False, startangle=150)
        plt.title(u"原创与转发微博量", color="black")
        plt.show()

    def analyse_GJ(self):
        """Pie chart of publishing tools used more than ten times."""
        tools = re.findall(r'发布工具: (.*)\n发布时间', self._read())
        gongju = self._frequent(tools, 10)
        labels = list(gongju.keys())
        sizes = list(gongju.values())
        plt.rcParams['font.sans-serif'] = ['SimHei']
        plt.pie(sizes, labels=labels, autopct='%1.1f%%', shadow=True,
                startangle=150)
        plt.title(u"微博发布工具统计", color="black")
        plt.show()

    def analyse_XQ(self):
        """Pie chart of bracketed emoticons used more than twice."""
        # Match every "[xx]" token; excluding brackets inside the token keeps
        # a match from spanning two adjacent emoticons.
        emoticons = re.findall(r'\[([^\[\]]{1,4})\]', self._read())
        biaoqing = self._frequent(emoticons, 2)
        labels = list(biaoqing.keys())
        sizes = list(biaoqing.values())
        plt.rcParams['font.sans-serif'] = ['SimHei']
        fig1, ax1 = plt.subplots()
        patches, texts, autotexts = ax1.pie(
            sizes, labels=labels, autopct='%1.0f%%',
            shadow=False, startangle=170)
        ax1.axis('equal')
        # Shrink the label fonts.
        proptease = fm.FontProperties()
        proptease.set_size('small')
        plt.title(u"微博表情使用次数", color="black")
        plt.setp(autotexts, fontproperties=proptease)
        plt.setp(texts, fontproperties=proptease)
        plt.show()
python程序打包
#在cmd下安装pyinstaller
pip install pyinstaller
#打包成一个可执行文件 -F (注意将cmd窗口切换至文件保存的路径下)
pyinstaller -F filename.py
本篇只适合新手简单学习。笔者也是刚学,又赶上复习周,后期会逐渐完善,毕竟 UI 写得太丑了!
另:获取本地用户 cookie 和微博账号 id 的操作比较简单,在此不再详细解释。如果程序跑不出来,相信我,一定是 cookie 的问题。
仅供作业参考,抄袭需谨慎