Python 爬取豆瓣电影(动态加载页面)

有的页面通过点击页码翻页,有的页面通过点击“加载更多”动态加载后续内容。
下面的代码处理后一种情况(动态加载翻页)。

import requests


def getHTMLtext(url):
    """Fetch *url* with browser-like headers and return the decoded JSON body.

    Args:
        url: Address of the JSON endpoint to request.

    Returns:
        The parsed JSON payload, or None when the request fails, the server
        answers with an error status, or the body is not valid JSON.
    """
    # Browser-like headers sent with every request to movie.douban.com.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) '
                      'AppleWebKit/537.36 (KHTML, like Gecko) '
                      'Chrome/69.0.3497.100 Safari/537.36',
        'Referer': 'https://movie.douban.com/tag/',
        'Host': 'movie.douban.com',
    }
    try:
        # A timeout keeps a stalled connection from hanging the crawl forever.
        r = requests.get(url, headers=headers, timeout=10)
        r.raise_for_status()
        # The endpoint returns JSON; it can be found among the XHR entries of
        # the Network tab in the browser's developer tools (the JSONView
        # browser extension was used to inspect it).
        return r.json()
    except (requests.RequestException, ValueError):
        # Network/HTTP failures raise RequestException; an undecodable body
        # raises a ValueError subclass. Anything else (e.g. KeyboardInterrupt)
        # is allowed to propagate instead of being silently swallowed.
        print('异常')
        return None


def parsehtml(html, path=r'C:\Users\zj\Desktop\doubanlove.csv'):
    """Append one CSV row per movie found in a decoded JSON payload.

    Each row is ``title,rate,casts,url`` where the cast names are joined
    with single spaces.

    Args:
        html: Decoded JSON mapping whose ``'data'`` entry is a list of movie
            dicts with ``'title'``, ``'rate'``, ``'casts'`` and ``'url'``
            keys. ``None`` or a payload without any data is ignored.
        path: CSV file to append to; defaults to the original hard-coded
            desktop location.

    Raises:
        KeyError: If a movie entry lacks one of the expected keys.
    """
    import csv

    # A failed fetch yields None, and an error payload may have no 'data'.
    if not html:
        return
    movies = html.get('data') or []
    if not movies:
        return

    # Open the file once per page instead of once per row.
    with open(path, 'a', encoding='utf-8') as f:
        # csv.writer quotes fields containing commas or quotes, which plain
        # string concatenation would silently corrupt. lineterminator='\n'
        # keeps the row ending identical to the previous hand-written output.
        writer = csv.writer(f, lineterminator='\n')
        # Iterate over what the server actually returned rather than assuming
        # exactly 20 entries; a short page no longer raises IndexError.
        for movie in movies:
            writer.writerow([
                movie['title'],
                movie['rate'],
                ' '.join(movie['casts']),
                movie['url'],
            ])

def main():
    """Crawl the first five result pages of the "爱情" genre and save them.

    Pages are requested with ``start`` offsets 0, 20, 40, 60 and 80. A page
    whose fetch fails is skipped so that one bad response does not abort the
    remaining pages.
    """
    for page in range(5):
        # Crawl the listing for the "爱情" (romance) genre; `start` advances
        # by 20 per page.
        url = "https://movie.douban.com/j/new_search_subjects?sort=U&range=0,10&tags=&start={}&genres=爱情".format(page * 20)
        html = getHTMLtext(url)
        if html is None:
            # getHTMLtext already reported the failure; nothing to parse.
            continue
        parsehtml(html)


# Start the crawl only when this file is executed as a script, not on import.
if __name__ == '__main__':
    main()

你可能感兴趣的:(学习笔记)