python爬虫笔记二:爬取豆瓣中的影评

从这个网址学习的:https://zhuanlan.zhihu.com/p/399300580

------------------------------------------------------------------------

https://movie.douban.com/subject/25728006/comments 点击下一页

https://movie.douban.com/subject/25728006/comments?start=20&limit=20&status=P&sort=new_score

打开页面F12进入开发者工具,查看

python爬虫笔记二:爬取豆瓣中的影评_第1张图片

requests.get(url, params= params, headers=headers) 

 其中获取网页需要 url(网址)、params(参数)、headers(请求头)

url=https://movie.douban.com/subject/25728006/comments

params={
    'percent_type':'',
    'start':'20',
    'limit':'20',
    'status':'P',
    'sort':'new_score',
    'comments_only':'1',
}

headers={
    'Accept':'application/json, text/plain, */*',
    'Accept-Encoding':'gzip, deflate, br',
    'Accept-Language':'zh-CN,zh;q=0.9,en;q=0.8',
    'Connection':'keep-alive',
    'Cookie':'ll="118254"; bid=bzf7LGz3pZA....',
    'Host':'movie.douban.com',
    'Referer':'https://movie.douban.com/subject/25728006/comments?start=20&limit=20&status=P&sort=new_score',
    'sec-ch-ua':'" Not;A Brand";v="99", "Google Chrome";v="91", "Chromium";v="91"',
    'sec-ch-ua-mobile':'?0',
    'Sec-Fetch-Dest':'empty',
    'Sec-Fetch-Mode':'cors',
    'Sec-Fetch-Site':'same-origin',
    'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.106 Safari/537.36',
}

代码查看

import requests
import re
import pandas as pd
import os

# Shared request headers used by get_html().
# Douban rejects requests without a browser-like User-Agent, and the
# comment endpoint behaves differently without a Cookie.
# NOTE(review): this Cookie is session-specific and will expire —
# replace it with a fresh one from the browser's developer tools.
headers={
    'Cookie':'ll="118254"; bid=bzf7LGz3pZA; _vwo_uuid_v2=DB12523A0B0C7127645E914A1FB363352|3d83981785084d997d7462a2ce24a947; __utmz=223695111.1626234491.2.2.utmcsr=baidu|utmccn=(organic)|utmcmd=organic; douban-fav-remind=1; __utmz=30149280.1629095213.3.3.utmcsr=baidu|utmccn=(organic)|utmcmd=organic; _pk_ref.100001.4cf6=%5B%22%22%2C%22%22%2C1629249503%2C%22https%3A%2F%2Fwww.baidu.com%2Flink%3Furl%3D71zldwjBiMBa-xfexgVZ43eTQq2n8KKtTWTsWh37m72e_lfEOE1x3NuDj6egeYBLyqGE4gjSJnbxueQLcYZWsq%26wd%3D%26eqid%3Ddb6736ec000219350000000660ee5e6f%22%5D; _pk_ses.100001.4cf6=*; ap_v=0,6.0; __utma=30149280.1335619985.1616046306.1629187696.1629249503.8; __utmc=30149280; __utmt=1; __utmb=30149280.1.10.1629249503; __utma=223695111.444014824.1616046306.1629187696.1629249506.7; __utmb=223695111.0.10.1629249506; __utmc=223695111; _pk_id.100001.4cf6=fa72408676bee41c.1616046306.7.1629250023.1629187696.',
    'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.106 Safari/537.36',
}
# Fetch one page of short comments and return the whitespace-stripped HTML.
def get_html(tid, page, _type):
    """Fetch a page of Douban short comments as a stripped HTML fragment.

    Parameters
    ----------
    tid : int or str
        Subject id, e.g. Fast & Furious 9 is 25728006.
    page : int
        Zero-based page index (Douban serves pages 0-24, 20 comments each).
    _type : str
        Rating filter: 'h' good, 'm' medium, 'l' bad, '' for all.

    Returns
    -------
    str
        The 'html' field of the JSON response with ALL whitespace removed,
        ready for the regex parsing in get_data().

    Relies on the module-level `headers` dict for Cookie/User-Agent.
    Raises requests.HTTPError on a non-2xx response.
    """
    url = f'https://movie.douban.com/subject/{tid}/comments?'

    params = {
        'percent_type': _type,
        'start': page * 20,   # 20 comments per page
        'limit': 20,
        'status': 'P',
        'sort': 'new_score',
        'comments_only': 1,   # ask for the JSON wrapper instead of a full page
        'ck': 't9O9',         # NOTE(review): token tied to the Cookie session — verify
    }

    # timeout so a stalled connection cannot hang the crawl forever
    r = requests.get(url, params=params, headers=headers, timeout=10)
    # fail loudly on 403/429 instead of an opaque JSON decode error
    r.raise_for_status()
    html = r.json()['html']

    # Strip all whitespace so the downstream regexes need not handle
    # newlines/indentation inside the HTML. Raw string avoids the
    # invalid-escape-sequence DeprecationWarning of plain '\s'.
    return re.sub(r'\s', '', html)


# Quick smoke test: fetch page 0 of all comments for subject 25728006
# (Fast & Furious 9) and print the stripped HTML fragment.
print(get_html('25728006',0,''))

 结果选取第一个用户:

python爬虫笔记二:爬取豆瓣中的影评_第2张图片

关于作者、日期、评价内容、有用票数以及星数的完整解析过程如下

# Parse the whitespace-stripped comment HTML into a DataFrame.
def get_data(html):
    """Extract author, date, comment text, vote count and star rating.

    `html` must already have all whitespace removed (see get_html), which
    is why patterns such as 'votesvote-count' contain no spaces.

    NOTE(review): the original source line was truncated and the regex
    patterns had their HTML tags garbled — the patterns below were
    reconstructed; verify them against a live page.

    Comments without a star rating produce fewer 'star' matches than the
    other fields, which used to raise
    ValueError: Length of values does not match length of index.
    Every column is therefore padded with None to a common length.
    """
    columns = {
        '作者': re.findall(r'class="">(.*?)</a>', html),
        '日期': re.findall(r'class="comment-time"title="(.*?)"', html),
        '评论': re.findall(r'"short">(.*?)</span>', html),
        '投票数': re.findall(r'"votesvote-count">(\d+)', html),
        'star': re.findall(r'allstar(\d+)rating', html),
    }
    # Pad shorter columns (e.g. missing star ratings) so all lengths agree.
    n = max((len(v) for v in columns.values()), default=0)
    data = {k: v + [None] * (n - len(v)) for k, v in columns.items()}
    return pd.DataFrame(data, columns=['作者', '日期', '评论', '投票数', 'star'])

上面的方法是使用正则表达式来获取的,其实可以用BeautifulSoup结合lxml来获取,使用方法见笔记一

 数据存储:这里将数据存储为csv文件,主要是追加写入比较方便,最好能写入到数据库后期好分析。

# Persist one batch of parsed comments to data.csv.
def save_df(df):
    """Write `df` to data.csv, creating it with a header on first use
    and appending header-less rows on every later call."""
    first_write = not os.path.exists('data.csv')
    if first_write:
        df.to_csv('data.csv',index=None,encoding='utf_8_sig')
    else:
        df.to_csv('data.csv',index=None,mode='a',header=None,encoding='utf_8_sig')

主进程调用

if __name__ == '__main__':
    # Subject id, e.g. Fast & Furious 9 is 25728006
    tid = 25728006
    # Rating filter: 'h' good, 'm' medium, 'l' bad ('' = all)
    _type = 'l'
    # Crawl the first 10 pages, persisting each batch before moving on
    for page in range(10):
        save_df(get_data(get_html(tid, page, _type)))
        print(f'{page+1}页评价已采集..')

结果

python爬虫笔记二:爬取豆瓣中的影评_第3张图片 python爬虫笔记二:爬取豆瓣中的影评_第4张图片

完整代码

import requests
import re
import pandas as pd
import os

# Shared request headers used by get_html().
# Douban rejects requests without a browser-like User-Agent, and the
# comment endpoint behaves differently without a Cookie.
# NOTE(review): this Cookie is session-specific and will expire —
# replace it with a fresh one from the browser's developer tools.
headers={
    'Cookie':'ll="118254"; bid=bzf7LGz3pZA; _vwo_uuid_v2=DB12523A0B0C7127645E914A1FB363352|3d83981785084d997d7462a2ce24a947; __utmz=223695111.1626234491.2.2.utmcsr=baidu|utmccn=(organic)|utmcmd=organic; douban-fav-remind=1; __utmz=30149280.1629095213.3.3.utmcsr=baidu|utmccn=(organic)|utmcmd=organic; _pk_ref.100001.4cf6=%5B%22%22%2C%22%22%2C1629249503%2C%22https%3A%2F%2Fwww.baidu.com%2Flink%3Furl%3D71zldwjBiMBa-xfexgVZ43eTQq2n8KKtTWTsWh37m72e_lfEOE1x3NuDj6egeYBLyqGE4gjSJnbxueQLcYZWsq%26wd%3D%26eqid%3Ddb6736ec000219350000000660ee5e6f%22%5D; _pk_ses.100001.4cf6=*; ap_v=0,6.0; __utma=30149280.1335619985.1616046306.1629187696.1629249503.8; __utmc=30149280; __utmt=1; __utmb=30149280.1.10.1629249503; __utma=223695111.444014824.1616046306.1629187696.1629249506.7; __utmb=223695111.0.10.1629249506; __utmc=223695111; _pk_id.100001.4cf6=fa72408676bee41c.1616046306.7.1629250023.1629187696.',
    'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.106 Safari/537.36',
}
# Fetch one page of short comments and return the whitespace-stripped HTML.
def get_html(tid, page, _type):
    """Fetch a page of Douban short comments as a stripped HTML fragment.

    Parameters
    ----------
    tid : int or str
        Subject id, e.g. Fast & Furious 9 is 25728006.
    page : int
        Zero-based page index (Douban serves pages 0-24, 20 comments each).
    _type : str
        Rating filter: 'h' good, 'm' medium, 'l' bad, '' for all.

    Returns
    -------
    str
        The 'html' field of the JSON response with ALL whitespace removed,
        ready for the regex parsing in get_data().

    Relies on the module-level `headers` dict for Cookie/User-Agent.
    Raises requests.HTTPError on a non-2xx response.
    """
    url = f'https://movie.douban.com/subject/{tid}/comments?'

    params = {
        'percent_type': _type,
        'start': page * 20,   # 20 comments per page
        'limit': 20,
        'status': 'P',
        'sort': 'new_score',
        'comments_only': 1,   # ask for the JSON wrapper instead of a full page
        'ck': 't9O9',         # NOTE(review): token tied to the Cookie session — verify
    }

    # timeout so a stalled connection cannot hang the crawl forever
    r = requests.get(url, params=params, headers=headers, timeout=10)
    # fail loudly on 403/429 instead of an opaque JSON decode error
    r.raise_for_status()
    html = r.json()['html']

    # Strip all whitespace so the downstream regexes need not handle
    # newlines/indentation inside the HTML. Raw string avoids the
    # invalid-escape-sequence DeprecationWarning of plain '\s'
    # (\s matches [ \t\n\r\f\v]).
    return re.sub(r'\s', '', html)


# Parse the whitespace-stripped comment HTML into a DataFrame.
def get_data(html):
    """Extract author, date, comment text, vote count and star rating.

    `html` must already have all whitespace removed (see get_html), which
    is why patterns such as 'votesvote-count' contain no spaces.

    NOTE(review): the original source line was truncated and the regex
    patterns had their HTML tags garbled — the patterns below were
    reconstructed; verify them against a live page.

    Comments without a star rating produce fewer 'star' matches than the
    other fields, which used to raise
    ValueError: Length of values does not match length of index.
    Every column is therefore padded with None to a common length.
    """
    columns = {
        '作者': re.findall(r'class="">(.*?)</a>', html),
        '日期': re.findall(r'class="comment-time"title="(.*?)"', html),
        '评论': re.findall(r'"short">(.*?)</span>', html),
        '投票数': re.findall(r'"votesvote-count">(\d+)', html),
        'star': re.findall(r'allstar(\d+)rating', html),
    }
    # Pad shorter columns (e.g. missing star ratings) so all lengths agree.
    n = max((len(v) for v in columns.values()), default=0)
    data = {k: v + [None] * (n - len(v)) for k, v in columns.items()}
    return pd.DataFrame(data, columns=['作者', '日期', '评论', '投票数', 'star'])

你可能感兴趣的:(python)