Python 爬虫获取 JS 动态加载的资源

# coding:utf-8
# 电视猫网址:https://www.tvmao.com/program
import requests
from lxml import etree
from selenium import webdriver

# HTTP headers sent with the static listing-page requests: a desktop Chrome
# User-Agent plus a Referer pointing at the site root.
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) '
                         'Chrome/69.0.3497.100 Safari/537.36',
           'Referer': 'https://www.tvmao.com/'}
# Weekly listing pages for CCTV channels and satellite channels; the `{}`
# placeholder is filled with the weekday number (1-7).
url_list = ['https://www.tvmao.com/program/duration/cctv/w{}.html',
            'https://www.tvmao.com/program/duration/satellite/w{}.html']
# Seconds to wait for a listing page before giving up; without a timeout
# requests.get() can block indefinitely on a stalled connection.
request_timeout = 10
# Raw string so the Windows backslashes are never read as escape sequences.
chromedriver_path = r'C:\Program Files (x86)\Google\Chrome\Application\chromedriver.exe'

# One browser instance is shared by every channel page: starting Chrome per
# page is slow and, without quit(), leaves a browser process behind each time.
driver = webdriver.Chrome(executable_path=chromedriver_path)
try:
    # CCTV channels, then satellite channels
    for url_program in url_list:
        # Monday through Sunday
        for i in range(1, 8):
            url = url_program.format(i)
            html = requests.get(url, headers=headers, timeout=request_timeout).text
            html = etree.HTML(html)
            td = html.xpath('//td[@class="tdchn"]')
            # One table cell per TV channel
            for j in td:
                links = j.xpath('./a/@href')
                if not links:
                    # Cell without a channel link: nothing to follow.
                    continue
                href = 'https://www.tvmao.com' + links[0]
                # The schedule is filled in by JavaScript, so the page is
                # rendered in the browser before its source is parsed.
                driver.get(href)
                program = etree.HTML(driver.page_source)
                # The original union `li | li[not(@class)]` selected exactly
                # the same nodes as this plain `li` step.
                li = program.xpath('//ul[@id="pgrow"]/li')
                # Programme entries
                for k in li:
                    name_time = k.xpath('.//span/text()')
                    name = k.xpath('.//a/text()')
                    # Skip entries that lack either a time or a title.
                    if not name_time or not name:
                        continue
                    print(name_time[0])
                    print(name[0])
finally:
    # Always shut down the browser and chromedriver, even after an error.
    driver.quit()

你可能感兴趣的:(爬虫)