Python——爬取直播网站房间名及热度

斗鱼直播

# coding=utf-8
'''
    爬取斗鱼直播房间名和人气值
'''
# 导入selenium工具
import time
from selenium import webdriver
from lxml import etree

class Douyu(object):
    """Crawl Douyu's room directory and tally room count and total popularity.

    ``room_count`` and ``hot_count`` are running totals that accumulate over
    every page visited by :meth:`test`.

    NOTE(review): the driver calls used here (``webdriver.PhantomJS``,
    ``find_element_by_class_name``) look like the Selenium 3 API -- confirm
    the installed Selenium version before upgrading.
    """

    # Unit suffixes that may end a displayed popularity value.
    _HOT_UNITS = {'万': 10000, '亿': 100000000}

    def __init__(self):
        """Create the browser driver and zero the counters."""
        # Pages are loaded through a browser driver.
        self.driver = webdriver.PhantomJS()
        # Running totals.
        self.room_count = 0   # number of rooms seen
        self.hot_count = 0   # sum of popularity values

    @staticmethod
    def _parse_hot(text):
        """Convert a displayed popularity string such as ``'1.2万'`` to an int.

        Surrounding whitespace and thousands separators are ignored. An empty
        string yields 0. Raises ``ValueError`` if the text is not numeric.
        """
        text = text.strip().replace(',', '')
        if not text:
            return 0
        multiplier = Douyu._HOT_UNITS.get(text[-1])
        if multiplier is None:
            return int(text)
        # round() rather than truncation: binary float error can leave the
        # product a hair below the intended integer.
        return int(round(float(text[:-1]) * multiplier))

    def run(self):
        """Scrape the page currently loaded in the driver.

        Prints one line per room, adds each room and its popularity to the
        running totals, then prints the totals so far.
        """
        content = etree.HTML(self.driver.page_source)
        rooms = content.xpath('//li[@class="layout-Cover-item"]/div[@class="DyListCover HeaderCell is-href"]/a/div[@class="DyListCover-content"]')
        for room in rooms:
            names = room.xpath('./div[@class="DyListCover-info"]/h3[@class="DyListCover-intro"]/text()')
            hots = room.xpath('./div[@class="DyListCover-info"]/span[@class="DyListCover-hot"]/text()')
            # A card missing either text node is skipped instead of raising
            # IndexError and aborting the whole crawl.
            if not names or not hots:
                continue
            roomname = names[0]
            hot = hots[0]
            print('人气:'+hot+';房间:'+roomname)
            self.room_count += 1
            self.hot_count += self._parse_hot(hot)
        # Report the totals accumulated so far.
        print('当前直播房间总量:',self.room_count)
        print('当前人气总数:',self.hot_count)

    def test(self):
        """Open the directory and scrape every page, including the last one."""
        self.driver.get('https://www.douyu.com/directory/all')
        page = 0
        while True:
            # Fixed pause before reading the page (presumably to let it
            # finish rendering).
            time.sleep(5)
            page += 1
            # The "next" button reports aria-disabled="false" while more
            # pages remain; a missing attribute is treated as the last page.
            state = self.driver.find_element_by_class_name('dy-Pagination-next').get_attribute("aria-disabled")
            has_next = (state or '').lower() == 'false'
            if has_next:
                print('-'*30+'第' + str(page) + '页'+'-'*30)
            else:
                print('-'*30+'最后一页'+'-'*30)
            # Scrape before deciding to stop so the final page is counted.
            self.run()
            if not has_next:
                break
            self.driver.find_element_by_class_name('dy-Pagination-next').click()

if __name__ == '__main__':
    dy = Douyu()
    try:
        dy.test()
    finally:
        # Always shut the browser down, even if the crawl raises, so the
        # driver process is not left running.
        dy.driver.quit()


虎牙直播

# coding=utf-8
'''
    爬取虎牙直播房间名和人气值
'''
# 导入selenium工具
import time
from selenium import webdriver
from lxml import etree

class Huya(object):
    """Crawl Huya's live-room listing and tally room count and total popularity.

    ``room_count`` and ``hot_count`` are running totals that accumulate over
    every page visited by :meth:`test`.

    NOTE(review): the driver calls used here (``webdriver.PhantomJS``,
    ``find_element_by_class_name``) look like the Selenium 3 API -- confirm
    the installed Selenium version before upgrading.
    """

    # Unit suffixes that may end a displayed popularity value.
    _HOT_UNITS = {'万': 10000, '亿': 100000000}

    def __init__(self):
        """Create the browser driver and zero the counters."""
        # Pages are loaded through a browser driver.
        self.driver = webdriver.PhantomJS()
        # Running totals.
        self.room_count = 0   # number of rooms seen
        self.hot_count = 0   # sum of popularity values

    @staticmethod
    def _parse_hot(text):
        """Convert a displayed popularity string such as ``'1.2万'`` to an int.

        Surrounding whitespace and thousands separators are ignored. An empty
        string yields 0. Raises ``ValueError`` if the text is not numeric.
        """
        text = text.strip().replace(',', '')
        if not text:
            return 0
        multiplier = Huya._HOT_UNITS.get(text[-1])
        if multiplier is None:
            return int(text)
        # round() rather than truncation: binary float error can leave the
        # product a hair below the intended integer.
        return int(round(float(text[:-1]) * multiplier))

    def run(self):
        """Scrape the page currently loaded in the driver.

        Prints one line per room, adds each room and its popularity to the
        running totals, then prints the totals so far.
        """
        content = etree.HTML(self.driver.page_source)
        rooms = content.xpath('//li[@class="game-live-item"]')
        for room in rooms:
            names = room.xpath('./a[@class="title new-clickstat"]/text()')
            hots = room.xpath('./span[@class="txt"]/span[@class="num"]/i[@class="js-num"]/text()')
            # An item missing either text node is skipped instead of raising
            # IndexError and aborting the whole crawl.
            if not names or not hots:
                continue
            roomname = names[0]
            hot = hots[0]
            print('房间:'+roomname+'; 人气:'+str(hot))
            self.room_count += 1
            self.hot_count += self._parse_hot(hot)
        # Report the totals accumulated so far.
        print('当前直播房间总量:',self.room_count)
        print('当前人气总数:',self.hot_count)

    def test(self):
        """Open the listing and scrape every page, including the last one."""
        self.driver.get('https://www.huya.com/l')
        page = 0
        while True:
            # Fixed pause before reading the page (presumably to let it
            # finish rendering).
            time.sleep(5)
            page += 1
            # The page source contains 'laypage_next' only while a "next"
            # control exists; its absence marks the last page.
            has_next = 'laypage_next' in self.driver.page_source
            if has_next:
                print('-'*30+'第' + str(page) + '页'+'-'*30)
            else:
                # The original multiplied '-' by a string here, which raised
                # TypeError instead of printing the banner.
                print('-'*30+'最后一页'+'-'*30)
            # Scrape before deciding to stop so the final page is counted.
            self.run()
            if not has_next:
                break
            self.driver.find_element_by_class_name('laypage_next').click()

if __name__ == '__main__':
    huya = Huya()
    try:
        huya.test()
    finally:
        # Always shut the browser down, even if the crawl raises, so the
        # driver process is not left running.
        huya.driver.quit()

总结

  1. xpath 要填写正确
  2. 每个网站的翻页方式不同
  3. 灵活使用 find_element_by_class_name 方法以及 get_attribute 方法

你可能感兴趣的:(Python)