Scraping Douban comments for Hi, Mom (《你好, 李焕英》) with Python

# Import packages
import requests
from bs4 import BeautifulSoup
import time
import pandas as pd
import numpy as np

# Request headers: a desktop browser User-Agent so Douban serves the normal page
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.190 Safari/537.36'}
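# Note (an assumption about Douban's usual behaviour, not stated in the original
# script): anonymous visitors can only browse the first few pages of short comments.
# If later pages come back as a login prompt, adding a 'Cookie' value copied from a
# logged-in browser session to these headers is a common workaround.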
# =============================================================================
# Scrape a single page
# =============================================================================
# Target URL: first page of short comments (20 per page, ranked by new_score)
url = 'https://movie.douban.com/subject/34841067/comments?limit=20&status=P&sort=new_score'

# Fetch the page
html = requests.get(url, headers=headers)
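# Optional check, not in the original script: fail fast if Douban blocks the
# request or returns a non-200 response before we try to parse the body.
html.raise_for_status()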
# Parse the response text
data = html.text
soup = BeautifulSoup(data, 'lxml')

# Extract the fields
# Username
names = soup.select('#comments > div > div.comment > h3 > span.comment-info > a')
# Rating (the whole comment-info span; the star-rating span is picked out below)
pingjis = soup.select('#comments > div > div.comment > h3 > span.comment-info')
# Date
riqis = soup.select('#comments > div > div.comment > h3 > span.comment-info > span.comment-time')
# Comment text
neirongs = soup.select('#comments > div > div.comment > p > span')

# Collect one row per comment
lis = []
for name, pingji, riqi, neirong in zip(names, pingjis, riqis, neirongs):
    # Inside comment-info, the second <span> is the star rating (an "allstarXX"
    # class plus a title such as 力荐); this assumes every comment carries a rating.
    pingji_re = pingji.find_all('span')
    lis.append([name.get_text(),
                pingji_re[1]['class'],
                pingji_re[1]['title'],
                riqi.get_text().strip(),
                neirong.get_text()])

# Columns: user, rating class, rating label, date, comment text
result1 = pd.DataFrame(lis, columns=['用户', '评级', '等级', '日期', '内容'])
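# A quick sanity check (illustrative, not part of the original script): the first
# page should yield 20 rows, one per comment.
print(result1.shape)
print(result1.head())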


# =============================================================================
# Scrape multiple pages
# =============================================================================
# start=0,20,...,80 walks through the first five pages of 20 comments each
url = ['https://movie.douban.com/subject/34841067/comments?start={}&limit=20&status=P&sort=new_score'.format(i) for i in range(0, 100, 20)]

lis2 = []

for urli in url:
    # Fetch the page
    html = requests.get(urli, headers=headers)
    # Parse the response text
    data = html.text
    soup = BeautifulSoup(data, 'lxml')
    # Username
    names = soup.select('#comments > div > div.comment > h3 > span.comment-info > a')
    # Rating (the whole comment-info span)
    pingjis = soup.select('#comments > div > div.comment > h3 > span.comment-info')
    # Date
    riqis = soup.select('#comments > div > div.comment > h3 > span.comment-info > span.comment-time')
    # Comment text
    neirongs = soup.select('#comments > div > div.comment > p > span')

    for name, pingji, riqi, neirong in zip(names, pingjis, riqis, neirongs):
        pingji_re = pingji.find_all('span')
        lis2.append([name.get_text(),
                     pingji_re[1]['class'],
                     pingji_re[1]['title'],
                     riqi.get_text().strip(),
                     neirong.get_text()])
    print('Done:', urli)
    # Random pause between pages to stay polite and avoid getting blocked
    time.sleep(np.random.randint(5, 10))

result2 = pd.DataFrame(lis2, columns=['用户', '评级', '等级', '日期', '内容'])
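To keep the scraped comments for later analysis, a minimal follow-up (assumed here, not part of the original script) is to write result2 to disk with pandas; the file name douban_comments.csv is just an example.

# Persist the combined result; utf-8-sig keeps the Chinese text readable in Excel
result2.to_csv('douban_comments.csv', index=False, encoding='utf-8-sig')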

Video walkthrough: https://edu.csdn.net/course/detail/31518
