9_Extracting every Douyu room ID, streamer name, and viewer count with Selenium

from selenium import webdriver
import time

class Douyu(object):

    def __init__(self):
        self.url = 'https://www.douyu.com/directory/all'
        self.driver = webdriver.Chrome()
        # self.data_list = []

    def parse_data(self):
        # Wait for the page to finish loading before locating data; locating too early fails because the data has not been rendered yet
        time.sleep(5)
        # Store all of the room card elements in a list
        page = self.driver.find_elements_by_xpath('//*[@id="listAll"]/section[2]/div[2]/ul/li/div')
        print(len(page))
        data_list = []
        time.sleep(5)
        for tmp in page:
            temp = {}
            # Using each located card element as the root, look up the required fields: category, owner, room title, and viewer count
            temp['tp'] = tmp.find_element_by_xpath('./a[1]/div[2]/div[1]/span').text
            temp['owner'] = tmp.find_element_by_xpath('./a[1]/div[2]/div[1]/h3').text
            temp['room'] = tmp.find_element_by_xpath('./a[1]/div[2]/div[2]/h2').text
            temp['num'] = tmp.find_element_by_xpath('./a[1]/div[2]/div[2]/span').text
            # Append the dict to the result list
            data_list.append(temp)
        return data_list

    # Save the data
    def save_data(self, data_list):
        # Iterate over the list and print each record
        for data in data_list:
            print(data)
        

    def run(self):
        # 1. url
        # 2. driver
        # 3. get
        self.driver.get(self.url)
        while True:
            # 4. parse_data
            # Scroll to the bottom so the whole current page is loaded; this keeps the later
            # element lookups from failing and also brings the "next page" button into view
            js = 'scrollTo(0,100000)'
            self.driver.execute_script(js)
            data_list = self.parse_data()
            # 5. save_data
            self.save_data(data_list)
            try:
                # If the "下一页" (next page) button can still be clicked, click it; otherwise end the loop
                self.driver.find_element_by_xpath('//*[contains(text(),"下一页")]').click()
            except Exception:
                break
if __name__ == "__main__":
    douyu = Douyu()
    douyu.run()
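
A note on the locator API: Selenium 4 removed the find_element_by_xpath / find_elements_by_xpath helpers used above in favour of find_element(By.XPATH, ...), and the fixed time.sleep(5) can be replaced with an explicit wait. Below is a minimal sketch of the same lookup assuming Selenium 4+; the XPath expressions are copied from parse_data above, and whether they still match the current Douyu page layout is not guaranteed.

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

driver = webdriver.Chrome()
driver.get('https://www.douyu.com/directory/all')
# Wait up to 10 seconds for the room cards to render instead of sleeping for a fixed time
cards = WebDriverWait(driver, 10).until(
    EC.presence_of_all_elements_located(
        (By.XPATH, '//*[@id="listAll"]/section[2]/div[2]/ul/li/div')))
for card in cards:
    # Selenium 4 style: find_element(By.XPATH, ...) replaces find_element_by_xpath(...)
    owner = card.find_element(By.XPATH, './a[1]/div[2]/div[1]/h3').text
    num = card.find_element(By.XPATH, './a[1]/div[2]/div[2]/span').text
    print(owner, num)
driver.quit()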

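save_data above only prints each record. If the results should be kept, one possible sketch writes each page's records to a JSON Lines file instead (the douyu.json filename is just an example); it can be dropped in as a replacement for the save_data method:

import json

def save_data(self, data_list):
    # Append each record as one JSON object per line so records from repeated pages accumulate in the file
    with open('douyu.json', 'a', encoding='utf-8') as f:
        for data in data_list:
            f.write(json.dumps(data, ensure_ascii=False) + '\n')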