微博爬虫及简单数据分析

刚开始学python,选了这个题目,把代码放上来留念,没有用到很流行的框架,所以代码量挺大
GUI用wxpython写的

# _*_ coding: UTF-8 _*_
import os
import re
import requests
import sys
import wx
import traceback
from datetime import datetime
from datetime import timedelta
from lxml import etree
import data_analysis

# Path of the text file holding the crawl results. It is assigned in
# Wb.get_cookie and read by Wb.write_txt and the Wb.figure_* handlers.
# NOTE: a `global` statement at module level has no effect; it is kept as-is.
global file_path
file_path = ''
class Wb(wx.App):
    def Operate(self):
        """Initialise the crawl state and build and show the start window.

        The start window collects the login cookie, the numeric account Id and
        the directory for the output file. Pressing the start button calls
        ``get_cookie``, which kicks off the crawl.
        """
        # ---- crawl state -------------------------------------------------
        self.cookie = {}
        self.username = ''  # user nickname
        self.Number = 0     # total number of posts reported by the profile
        self.number1 = 0    # number of posts actually crawled
        self.Guanzhu = 0    # following count
        self.fans = 0       # fan count
        self.Content = []   # post texts
        self.Time = []      # publish time of each post (appended by weibo_para)
        self.star = []      # like count of each post
        self.Zhuanfa = []   # repost count of each post (appended by weibo_para)
        self.Pinglun = []   # comment count of each post
        self.publish_tool = []
        self.Id = 0
        self.a = 0          # index of the post shown by weibo_info / cont

        # ---- GUI ---------------------------------------------------------
        # Top-level window and the panel that hosts every control.
        self.frame_operate = wx.Frame(
            None, title="Weibo_Spider_GUI", size=(500, 500))
        self.panel_operate = wx.Panel(self.frame_operate, -1)

        # Large title font; also reused by weibo_UI2.
        self.font1 = wx.Font(18, wx.ROMAN, wx.ITALIC, wx.NORMAL)
        self.label1 = wx.StaticText(
            self.panel_operate, -1, "WeiBo Spider", pos=(180, 60), style=wx.ALIGN_CENTER)
        self.label1.SetFont(self.font1)

        # Cookie label and input box.
        self.label2 = wx.StaticText(
            self.panel_operate, -1, "请输入您微博登陆的有效cookie", pos=(160, 130), style=wx.ALIGN_CENTER)
        self.textCookie = wx.TextCtrl(
            self.panel_operate, -1, pos=(200, 150), size=(80, 20), style=wx.TE_CENTER)

        # Account Id label and input box. The label used to read "...self.Id",
        # a code identifier that had leaked into the user-facing text.
        self.label3 = wx.StaticText(
            self.panel_operate, -1, "请输入您所要爬取微博账号的Id", pos=(160, 180), style=wx.ALIGN_CENTER)
        self.textId = wx.TextCtrl(
            self.panel_operate, -1, pos=(200, 200), size=(80, 20), style=wx.TE_CENTER)

        # Output directory label and input box.
        self.label4 = wx.StaticText(
            self.panel_operate, -1, "数据文件保存路径", pos=(160, 230), style=wx.ALIGN_CENTER)
        self.textFile_path = wx.TextCtrl(
            self.panel_operate, -1, pos=(200, 250), size=(80, 20), style=wx.TE_CENTER)

        # Without a bound control get_cookie could never run, and without
        # Show() the window would never appear.
        self.button_start = wx.Button(
            self.panel_operate, -1, "开始爬取", pos=(200, 300))
        self.Bind(wx.EVT_BUTTON, self.get_cookie, self.button_start)
        self.frame_operate.Show()
       
    # 微博的正式UI界面
    def get_cookie(self, event):
        """Read the form, derive the output file path and start the crawl.

        Args:
            event: The wx event that triggered the handler (unused).

        A non-numeric account Id is reported in a message box instead of
        raising ``ValueError`` inside the event handler.
        """
        global file_path
        try:
            self.Id = int(self.textId.GetValue().strip())
        except ValueError:
            wx.MessageBox("请输入纯数字的微博账号Id")
            return
        self.cookie = {"Cookie": self.textCookie.GetValue()}
        # os.path.join keeps an empty directory relative to the working
        # directory; plain concatenation with os.sep pointed at the filesystem root.
        file_path = os.path.join(self.textFile_path.GetValue(), "%d.txt" % self.Id)
        self.Onbutton_Start()

    def Onbutton_Start(self):
        """Run the crawl pipeline from profile lookup to the result windows."""
        pipeline = (
            self.GetName,         # nickname
            self.GetSimple_Info,  # post / following / fan counts
            self.weibo_para,      # per-post data
            self.write_txt,       # dump everything to the text file
            self.weibo_UI1,       # completion message, then the info window
        )
        for step in pipeline:
            step()

    def weibo_UI1(self):
        """Show the completion message, then open the profile summary window."""
        wx.MessageBox("文件爬取完毕")
        self.weibo_UI2()

    def weibo_UI2(self):
        """Replace the start window with a summary of the crawled profile."""
        self.frame_operate.Destroy()
        self.frame_Info = wx.Frame(None, title="User_Information", size=(500, 500))
        self.panel_Info = wx.Panel(self.frame_Info, -1)

        # Build every caption before any widget is created.
        nickname_line = "用户昵称:" + str(self.username)
        posts_line = "微博数:" + str(self.Number)
        fans_line = "粉丝数:" + str(self.fans)
        following_line = "关注数:" + str(self.Guanzhu)

        def left_label(caption, position):
            return wx.StaticText(self.panel_Info, -1, caption, pos=position, style=wx.ALIGN_LEFT)

        self.label16 = left_label(self.username, (200, 100))
        self.label5 = left_label(nickname_line, (180, 130))
        self.label13 = left_label(posts_line, (180, 150))
        self.label14 = left_label(fans_line, (180, 170))
        self.label15 = left_label(following_line, (180, 190))

        # font2 is the small font; font1 (large) was created in Operate.
        self.font2 = wx.Font(13, wx.SCRIPT, wx.ITALIC, wx.NORMAL)
        self.label16.SetFont(self.font1)
        for small_label in (self.label5, self.label13, self.label14, self.label15):
            small_label.SetFont(self.font2)

        self.button_news = wx.Button(self.panel_Info, -1, "查看最近微博", pos=(220, 280))
        self.Bind(wx.EVT_BUTTON, self.weibo_UI3, self.button_news)
        self.frame_Info.Show()

        # 最近微博
    def weibo_UI3(self, event):
        """Show the newest (or pinned) post and the navigation buttons.

        Args:
            event: The wx button event that opened this window (unused).
        """
        self.frame_Info.Destroy()
        self.frame_news = wx.Frame(None, title="---", size=(500, 500))
        self.panel_news = wx.Panel(self.frame_news, -1)
        panel = self.panel_news
        label18 = wx.StaticText(panel, -1, "最新微博动态", pos=(200, 40))

        if self.Content:
            # All captions are built first, then the widgets.
            captions = [
                "最新/置顶 微博为: " + self.Content[0],
                "最新/置顶 微博发布工具: " + self.publish_tool[0],
                "最新/置顶 微博发布时间: " + self.Time[0],
                "最新/置顶 微博获得赞数: " + str(self.star[0]),
                "最新/置顶 微博获得转发数: " + str(self.Zhuanfa[0]),
                "最新/置顶 微博获得评论数: " + str(self.Pinglun[0]),
            ]
            self.label6 = wx.TextCtrl(
                panel, -1, captions[0], pos=(90, 60), size=(250, 140),
                style=wx.TE_MULTILINE | wx.TE_RICH)
            # label7..label11 are stacked 20 px apart starting at y=200.
            for row, attr in enumerate(("label7", "label8", "label9", "label10", "label11")):
                widget = wx.StaticText(
                    panel, -1, captions[row + 1], pos=(90, 200 + 20 * row),
                    style=wx.ALIGN_LEFT)
                setattr(self, attr, widget)

        # Browse the crawled posts one by one.
        self.Button_info = wx.Button(panel, -1, "点击查看之前的微博内容", pos=(220, 340))
        self.Bind(wx.EVT_BUTTON, self.weibo_pre_info, self.Button_info)
        # Open the chart selection window.
        self.Button_file = wx.Button(panel, -1, "点击查看微博数据分析图表", pos=(220, 380))
        self.Bind(wx.EVT_BUTTON, self.analysis_UI, self.Button_file)
        self.frame_news.Show()


    def analysis_UI(self, event):
        """Open the window that offers one button per chart.

        Args:
            event: The wx button event that opened this window (unused).
        """
        self.frame_data = wx.Frame(None, title="data_analysis--20177830115", size=(500, 500))
        self.panel_data = wx.Panel(self.frame_data, -1)

        # (attribute name, caption, y position, click handler)
        chart_buttons = (
            ("button_1", "2017-2018微博转发/点赞量折线统计图", 120, self.figure_1),
            ("button_2", '原创微博与转发微博统计图', 160, self.figure_2),
            ("button_3", '微博发布工具统计图', 200, self.figure_3),
            ("button_4", '微博使用心情统计图', 240, self.figure_4),
        )
        # Create all four buttons first, then bind them.
        for attr, caption, top, _handler in chart_buttons:
            setattr(self, attr, wx.Button(self.panel_data, -1, caption, pos=(180, top)))
        for attr, _caption, _top, handler in chart_buttons:
            self.Bind(wx.EVT_BUTTON, handler, getattr(self, attr))
        self.frame_data.Show()

    def figure_1(self, event):
        """Run ``analyse_Zhexian`` on the crawl file (button_1 handler)."""
        # file_path is only read here, so no ``global`` declaration is needed.
        data_analysis.analysis(file_path, self.Number).analyse_Zhexian()

    def figure_2(self, event):
        """Run ``analyse_YC`` on the crawl file (button_2 handler)."""
        # file_path is only read here, so no ``global`` declaration is needed.
        data_analysis.analysis(file_path, self.Number).analyse_YC()

    def figure_3(self, event):
        """Run ``analyse_GJ`` on the crawl file (button_3 handler)."""
        # file_path is only read here, so no ``global`` declaration is needed.
        data_analysis.analysis(file_path, self.Number).analyse_GJ()

    def figure_4(self, event):
        """Run ``analyse_XQ`` on the crawl file (button_4 handler)."""
        # file_path is only read here, so no ``global`` declaration is needed.
        data_analysis.analysis(file_path, self.Number).analyse_XQ()
        

    def weibo_pre_info(self,event): ## Bridge handler: weibo_info takes no event argument because cont() re-enters it directly (one post per window), so the button is bound to this wrapper instead.
        self.weibo_info()

    def weibo_info(self):
        """Show the crawled post at index ``self.a`` in its own window.

        The window offers a "next" button (``cont``) and a "close" button
        (``exit``). When ``self.a`` has not been set yet, browsing starts at
        the first post. When there is no post at that index (nothing was
        crawled, or the index ran past the end), a message box is shown
        instead of raising ``IndexError``.
        """
        index = getattr(self, "a", 0)
        if not 0 <= index < len(self.Content):
            wx.MessageBox("没有更多微博可以显示")
            return
        self.a = index

        # A loop cannot wait for the button press, so each post gets a fresh
        # window and cont() re-enters this method.
        self.s = wx.Frame(None, title="---", size=(500, 500))
        self.f = wx.Panel(self.s, -1)

        text1 = str(index + 1) + ":" + self.Content[index]
        text2 = "发布工具: " + self.publish_tool[index]
        text3 = "发布时间: " + self.Time[index]
        text4 = "点赞数: " + str(self.star[index])
        text5 = "转发数: " + str(self.Zhuanfa[index])
        text6 = "评论数: " + str(self.Pinglun[index])

        self.labela = wx.TextCtrl(
            self.f, -1, text1, pos=(80, 60), size=(250, 140),
            style=wx.TE_MULTILINE | wx.TE_RICH)
        self.labelb = wx.StaticText(self.f, -1, text2, pos=(80, 200), style=wx.ALIGN_LEFT)
        self.labelc = wx.StaticText(self.f, -1, text3, pos=(80, 220), style=wx.ALIGN_LEFT)
        self.labeld = wx.StaticText(self.f, -1, text4, pos=(80, 240), style=wx.ALIGN_LEFT)
        self.labele = wx.StaticText(self.f, -1, text5, pos=(80, 260), style=wx.ALIGN_LEFT)
        self.labelf = wx.StaticText(self.f, -1, text6, pos=(80, 280), style=wx.ALIGN_LEFT)

        self.button_next = wx.Button(self.f, -1, "查看下一条", pos=(300, 380))
        self.button_exit = wx.Button(self.f, -1, "关闭", pos=(100, 380))

        self.Bind(wx.EVT_BUTTON, self.exit, self.button_exit)
        self.Bind(wx.EVT_BUTTON, self.cont, self.button_next)
        self.s.Show()
            
    def exit(self,event):
        """Close the single-post window created by weibo_info (self.s)."""
        self.s.Destroy()

    def cont(self, event):
        """Advance to the next crawled post and show it in a new window.

        Args:
            event: The wx button event (unused).

        When the current post is already the last one, a message box is shown
        and the current window stays open. Previously the index was
        incremented and the window destroyed before the next one failed with
        ``IndexError``.
        """
        if self.a + 1 >= len(self.Content):
            wx.MessageBox("已经是最后一条微博")
            return
        self.a += 1
        self.s.Destroy()
        self.weibo_info()


具体爬虫部分,参考github某大佬的


    # 获取用户昵称
    def GetName(self):
        """Fetch the account's info page and store the nickname in ``self.username``."""
        info_url = "https://weibo.cn/%d/info" % (self.Id)
        response = requests.get(info_url, cookies=self.cookie)
        page_title = etree.HTML(response.content).xpath("//title/text()")[0]
        # The nickname is the page title minus its last three characters.
        self.username = page_title[:-3]

    def GetSimple_Info(self):
        """Read the post, following and fan counts from the first profile page.

        Stores the results in ``self.Number``, ``self.Guanzhu`` and ``self.fans``.
        """
        url = "https://weibo.cn/u/%d?&page=1" % (self.Id)
        html = requests.get(url, cookies=self.cookie).content
        selector = etree.HTML(html)  # parse into an element tree
        pattern = r"\d+\.?\d*"

        # Post count, rendered on the page like "微博[1543]".
        wb_num = selector.xpath("//div[@class='tip2']/span[@class='tc']/text()")[0]
        self.Number = int(re.findall(pattern, wb_num, re.S | re.M)[0])

        # Following count: first link in the same bar.
        links = selector.xpath("//div[@class='tip2']/a/text()")
        self.Guanzhu = int(re.findall(pattern, links[0], re.M)[0])

        # Fan count. The original left this step empty, so self.fans stayed 0.
        # NOTE(review): assumes the second link in the bar is the fan counter -
        # confirm against the live page.
        if len(links) > 1:
            fan_digits = re.findall(pattern, links[1], re.M)
            if fan_digits:
                self.fans = int(fan_digits[0])

    @staticmethod
    def _console_safe(text):
        """Drop the characters that the console encoding cannot represent.

        Falls back to UTF-8 when ``sys.stdout`` is missing or has no encoding.
        """
        encoding = getattr(sys.stdout, "encoding", None) or "utf-8"
        return text.encode(encoding, "ignore").decode(encoding)

    @staticmethod
    def _parse_publish_time(raw):
        """Turn the relative or partial time shown on the page into 'YYYY-MM-DD HH:MM'.

        Args:
            raw: The time text that precedes "来自" in the post's time span.
        """
        if u"刚刚" in raw:
            return datetime.now().strftime('%Y-%m-%d %H:%M')
        if u"分钟" in raw:
            minutes = timedelta(minutes=int(raw[:raw.find(u"分钟")]))
            return (datetime.now() - minutes).strftime("%Y-%m-%d %H:%M")
        if u"今天" in raw:
            return datetime.now().strftime("%Y-%m-%d") + " " + raw[3:]
        if u"月" in raw:
            # Fixed slices: month at [0:2], day at [3:5], clock at [7:12].
            year = datetime.now().strftime("%Y")
            return year + "-" + raw[0:2] + "-" + raw[3:5] + " " + raw[7:12]
        return raw[:16]

    # Fetch the full text of a "long" post.
    def GetLong(self, weibo_link):
        """Download ``weibo_link`` and return the full post text found there."""
        html = requests.get(weibo_link, cookies=self.cookie).content
        selector = etree.HTML(html)
        info = selector.xpath("//div[@class='c']")[1]
        wb_content = info.xpath("div/span[@class='ctt']")[0].xpath(
            "string(.)").replace(u"\u200b", "")
        return self._console_safe(wb_content)

    # Build the text stored for a reposted post.
    def GetZhuanfa(self, is_retweet, info, wb_content):
        """Combine the repost reason, the original author and the reposted text.

        Args:
            is_retweet: The ``span[@class='cmt']`` nodes of the post.
            info: The post's container element.
            wb_content: The reposted text extracted so far.

        Returns:
            The combined text, or a fixed notice when the original author
            cannot be found.
        """
        original_user = is_retweet[0].xpath("a/text()")
        if not original_user:
            return u"转发微博已被删除"
        original_user = original_user[0]
        retweet_reason = self._console_safe(
            info.xpath("div")[-1].xpath("string(.)").replace(u"\u200b", ""))
        # Keep only what precedes the last "赞" (the footer counters follow it).
        retweet_reason = retweet_reason[:retweet_reason.rindex(u"赞")]
        return (retweet_reason + "\n" + u"原始用户: " + original_user +
                "\n" + u"转发内容: " + wb_content)

    # Collect text, time, tool, likes, reposts and comments for every post.
    def weibo_para(self):
        """Crawl every page of the account and fill the per-post lists.

        Appends to ``self.Content``, ``self.Time``, ``self.publish_tool``,
        ``self.star``, ``self.Zhuanfa`` and ``self.Pinglun`` and counts the
        crawled posts in ``self.number1``. The lists are reset first so the
        method does not depend on them having been created elsewhere.
        """
        self.Content = []
        self.Time = []
        self.publish_tool = []
        self.star = []
        self.Zhuanfa = []
        self.Pinglun = []
        self.number1 = 0

        url = "https://weibo.cn/u/%d?&page=1" % (self.Id)
        html = requests.get(url, cookies=self.cookie).content
        selector = etree.HTML(html)
        # The hidden "mp" input carries the page count; it is absent when
        # there is only one page.
        page_input = selector.xpath("//input[@name='mp']")
        page_num = int(page_input[0].attrib["value"]) if page_input else 1
        pattern = r"\d+\.?\d*"

        for page in range(1, page_num + 1):
            url2 = "https://weibo.cn/u/%d?&page=%d" % (self.Id, page)
            html2 = requests.get(url2, cookies=self.cookie).content
            selector2 = etree.HTML(html2)
            info = selector2.xpath("//div[@class='c']")
            if not info or not info[0].xpath("div/span[@class='ctt']"):
                continue  # page without posts

            # The last two "c" blocks are skipped, as in the original loop.
            for i in range(0, len(info) - 2):
                post = info[i]

                # ---- post text ----
                str_t = post.xpath("div/span[@class='ctt']")
                Content = self._console_safe(
                    str_t[0].xpath("string(.)").replace(u"\u200b", ""))
                Content = Content[:-1]
                weibo_Id = post.xpath("@id")[0][2:]
                a_link = post.xpath("div/span[@class='ctt']/a")
                is_retweet = post.xpath("div/span[@class='cmt']")
                last_link_text = a_link[-1].xpath("text()") if a_link else []
                if last_link_text and last_link_text[0] == u"全文":
                    # The original read wb_content here without ever fetching it.
                    # NOTE(review): the comment-page URL is reconstructed, not
                    # taken from this file - confirm it still serves the full text.
                    wb_content = self.GetLong("https://weibo.cn/comment/" + weibo_Id)
                    if wb_content:
                        if not is_retweet:
                            wb_content = wb_content[1:]
                        Content = wb_content
                if is_retweet:
                    Content = self.GetZhuanfa(is_retweet, post, Content)
                self.Content.append(Content)

                # ---- publish time and tool ("<time>来自<tool>") ----
                str_time = post.xpath("div/span[@class='ct']")
                str_time = self._console_safe(str_time[0].xpath("string(.)"))
                time_and_tool = str_time.split(u'来自')
                self.Time.append(self._parse_publish_time(time_and_tool[0]))
                publish_tool = time_and_tool[1] if len(time_and_tool) > 1 else u"无"
                self.publish_tool.append(publish_tool)

                # ---- footer counters: likes, reposts, comments ----
                str_footer = self._console_safe(post.xpath("div")[-1].xpath("string(.)"))
                str_footer = str_footer[str_footer.rfind(u'赞'):]
                # The original stored this in `guid` but then read `regx`.
                guid = re.findall(pattern, str_footer, re.M)
                self.star.append(int(guid[0]))
                self.Zhuanfa.append(int(guid[1]))
                self.Pinglun.append(int(guid[2]))

                self.number1 += 1

    def write_txt(self):
        """Write the profile summary and every crawled post to ``file_path``.

        The file is written as UTF-8, which is the encoding the analysis code
        uses to read it back. Errors are printed with a traceback and are not
        re-raised.
        """
        try:
            contents_header = u"\n\n微博内容: \n"
            parts = [
                u"用户信息\n用户昵称:" + self.username +
                u"\n用户Id: " + str(self.Id) +
                u"\n微博数: " + str(self.Number) +
                u"\n关注数: " + str(self.Guanzhu) +
                u"\n粉丝数: " + str(self.fans) +
                contents_header + '\n'
            ]
            for i in range(self.number1):
                parts.append(
                    str(i + 1) + ":" + self.Content[i] + "\n" +
                    u"发布工具: " + self.publish_tool[i] + "\n" +
                    u"发布时间: " + self.Time[i] + "\n" +
                    u"点赞数: " + str(self.star[i]) +
                    u"转发数: " + str(self.Zhuanfa[i]) +
                    u"评论数: " + str(self.Pinglun[i]) + "\n\n")
            # Binary mode keeps the "\n" line endings unchanged on every platform.
            with open(file_path, "wb") as f:
                f.write("".join(parts).encode("utf-8"))
        except Exception as e:
            print("Error: ", e)
            traceback.print_exc()

测试函数部分

def main():
    """Create the application object, build the start window and run the event loop."""
    app = Wb()
    app.Operate()
    app.MainLoop()


if __name__ == "__main__":
    main()

数据分析

import re
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.dates as mdate
from matplotlib import font_manager as fm
import time
from datetime import datetime
import webbrowser

class analysis(object):
    """Charts built from the text file of crawled posts.

    Every ``analyse_*`` method re-reads ``file_name`` as UTF-8, extracts one
    kind of data with regular expressions and shows a matplotlib figure.
    """

    def __init__(self, file_name, number):
        """Remember the data file and the account's total post count.

        Args:
            file_name: Path of the text file to analyse.
            number: Total number of posts; ``analyse_YC`` subtracts the
                repost count from it to get the number of original posts.
        """
        self.file_name = file_name
        self.number = number
        self.X_data = []   # sampled publish times (x axis of the line chart)
        self.Y1_data = []  # sampled like counts, in thousands
        self.Y_data = []   # sampled repost counts, in thousands
        self.str = ""      # not used by this class; kept for compatibility

    # ------------------------------------------------------------------
    # helpers (no plotting, so they can be exercised without a display)
    # ------------------------------------------------------------------
    def _read_text(self):
        """Return the whole data file decoded as UTF-8."""
        with open(self.file_name, encoding="utf-8") as f:
            return f.read()

    @staticmethod
    def _frequent(items, threshold):
        """Count ``items`` and keep those seen more than ``threshold`` times.

        Returns a dict ``{item: count}`` in first-seen order.
        """
        # Local import: Counter is not part of this module's import block.
        from collections import Counter
        return {name: count for name, count in Counter(items).items()
                if count > threshold}

    @staticmethod
    def _parse_series(text, stop_year="2016"):
        """Extract aligned (times, reposts, likes) lists from the file text.

        The lists are cut at the first post whose publish time starts with
        ``stop_year``. When no such post exists, all posts are kept; the
        original code raised ``NameError`` in that case. The three lists are
        also trimmed to a common length so they can be indexed together.
        """
        reposts = [int(n) for n in re.findall(r'转发数: (\d+)', text)]
        likes = [int(n) for n in re.findall(r'点赞数: (\d+)', text)]
        times = re.findall(
            r'发布时间: (\d{4}-\d{1,2}-\d{1,2}\s\d{1,2}:\d{1,2})', text)
        stop = next(
            (i for i, stamp in enumerate(times) if stamp.startswith(stop_year + "-")),
            len(times))
        count = min(stop, len(reposts), len(likes))
        return times[:count], reposts[:count], likes[:count]

    @staticmethod
    def _sample_indices(count, ratio=0.18):
        """Return evenly spaced indices covering about ``ratio`` of ``count`` points.

        With too few points for even one group, every index is returned; the
        original divided by zero in that case.
        """
        groups = int(ratio * count)
        if groups == 0:
            return list(range(count))
        step = count // groups
        return [i * step for i in range(groups)]

    @staticmethod
    def _extract_emoji(text):
        """Return every bracketed emoji name of 1-4 characters, e.g. ``[哈哈]``.

        Brackets and newlines are excluded from the name, so neighbouring
        emoji are not merged and each one on a line is found. The original
        pattern captured only the first and last emoji of a line.
        """
        return re.findall(r'\[([^\[\]\n]{1,4})\]', text)

    # ------------------------------------------------------------------
    # charts
    # ------------------------------------------------------------------
    def analyse_Zhexian(self):
        """Show a line chart of repost and like counts (in thousands) over time."""
        times, reposts, likes = self._parse_series(self._read_text())

        # Thin the data so the x axis stays readable.
        picks = self._sample_indices(len(times))
        self.X_data = [datetime.strptime(times[i], "%Y-%m-%d %H:%M") for i in picks]
        # Values are divided by 1000 for a more readable y axis.
        self.Y_data = [reposts[i] / 1000 for i in picks]
        self.Y1_data = [likes[i] / 1000 for i in picks]

        fig1 = plt.figure(figsize=(8, 5))
        plt.rcParams['font.sans-serif'] = ['SimHei']  # font able to render the Chinese labels
        ax1 = fig1.add_subplot(1, 1, 1)
        ax1.xaxis.set_major_formatter(mdate.DateFormatter('%Y-%m-%d %H-%M'))
        plt.xticks(self.X_data, rotation=90)
        plt.yticks(np.linspace(0, 5000, 5, endpoint=True))
        plt.title(u"2017-2018微博转发/点赞量折线图", color="black")
        plt.plot(self.X_data, self.Y_data, "o-", color='skyblue', label="转发量", markersize=1.5)
        plt.plot(self.X_data, self.Y1_data, "o-", color='pink', label="点赞量", markersize=1.5)
        plt.xlabel("发布时间")
        plt.ylabel("数量(千/条)")
        plt.legend()
        plt.show()

    def analyse_YC(self):
        """Show a pie chart of reposted versus original posts."""
        text = self._read_text()
        # Each repost record contains the marker "转发理由".
        Number_Zhuanfa = len(re.findall(r'转发理由', text))
        # Clamp at zero: pie() rejects negative wedge sizes.
        Yuanchuang = max(self.number - Number_Zhuanfa, 0)

        plt.rcParams['font.sans-serif'] = ['SimHei']
        labels = ['转发微博', '原创微博']
        sizes = [Number_Zhuanfa, Yuanchuang]
        explode = (0.1, 0)
        plt.pie(sizes, explode=explode, labels=labels, autopct='%1.1f%%',
                shadow=False, startangle=150)
        plt.title(u"原创与转发微博量", color="black")
        plt.show()

    def analyse_GJ(self):
        """Show a pie chart of the publishing tools used more than 10 times."""
        text = self._read_text()
        tools = re.findall(r'发布工具: (.*)\n发布时间', text)
        gongju = self._frequent(tools, 10)

        labels = list(gongju.keys())
        sizes = list(gongju.values())
        plt.rcParams['font.sans-serif'] = ['SimHei']
        plt.pie(sizes, labels=labels, autopct='%1.1f%%', shadow=True, startangle=150)
        plt.title(u"微博发布工具统计", color="black")
        plt.show()

    def analyse_XQ(self):
        """Show a pie chart of the emoji used more than twice."""
        text = self._read_text()
        biaoqing = self._frequent(self._extract_emoji(text), 2)

        labels = list(biaoqing.keys())
        sizes = list(biaoqing.values())
        # Select the font before any text object is created.
        plt.rcParams['font.sans-serif'] = ['SimHei']
        fig1, ax1 = plt.subplots()
        patches, texts, autotexts = ax1.pie(
            sizes, labels=labels, autopct='%1.0f%%', shadow=False, startangle=170)
        ax1.axis('equal')
        # Shrink the label and percentage text.
        proptease = fm.FontProperties()
        proptease.set_size('small')
        plt.title(u"微博表情使用次数", color="black")
        plt.setp(autotexts, fontproperties=proptease)
        plt.setp(texts, fontproperties=proptease)
        plt.show()

python程序打包

#在cmd下安装pyinstaller
pip install pyinstaller
#打包成一个可执行文件 -F (注意将cmd窗口切换至文件保存的路径下)
pyinstaller -F filename.py

本篇只适合新手简单学习,笔者也刚学,加上复习周,后期会逐渐完善,毕竟UI写的太丑了 !
另:关于获取本地用户cookie和微博账号的id,操作比较简单,在此不再做详细解释。如果程序跑不出来,相信我,一定是cookie问题。

仅供作业参考,抄袭需谨慎

你可能感兴趣的:(python)