"""Crawler that scrapes Douban's Top250 movies and writes them to a local .txt file."""

import requests
from bs4 import BeautifulSoup

# Request the Douban URL with modified headers (spoofed browser User-Agent).
def open_url(url):
    """Fetch *url* with a desktop-browser User-Agent and return the Response.

    Douban rejects the default ``requests`` User-Agent, so a Chrome UA is
    spoofed.  A timeout is set so a stalled connection cannot hang the
    crawler forever (the original call had none).
    """
    headers = {
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/57.0.2987.98 Safari/537.36'}
    res = requests.get(url, headers=headers, timeout=10)
    return res

# Determine how many result pages the Top250 listing spans.
def find_pages(res):
    """Return the total page count read from the listing's paginator.

    The last page number sits two siblings before the "next" link: one
    whitespace text node back, then the final page-number tag.
    """
    paginator_next = BeautifulSoup(res.text, "html.parser").find("span", class_="next")
    last_page_label = paginator_next.previous_sibling.previous_sibling
    return int(last_page_label.text)

# Collect the info (title, rating, credits/metadata) for the movies on one page.
def find_movies(res):
    """Parse one listing page and return a list of per-movie summary strings.

    Each entry concatenates the movie title, its rating line and its
    credits/metadata line, in listing order.
    """
    soup = BeautifulSoup(res.text, "html.parser")

    # Movie titles: first <span> inside the link of each "hd" block.
    movie_name = [each.a.span.text for each in soup.find_all("div", class_="hd")]

    # Ratings, prefixed with a label.
    grade = ["电影评分为:" + each.text
             for each in soup.find_all("span", class_="rating_num")]

    # Credits / metadata: lines 1 and 2 of the first <p> in each "bd" block.
    message = []
    for each in soup.find_all("div", class_="bd"):
        each_split = each.p.text.split("\n")
        try:
            each_message = each_split[1].strip() + each_split[2].strip()
        except IndexError:
            # The page's last "bd" block is not a movie and its <p> has
            # fewer lines — skip it.  (Was a bare `except:`, which would
            # also have hidden real bugs.)
            continue
        message.append(each_message)

    # zip() pairs the three parallel lists element-wise.
    return [name + mark + info for name, mark, info in zip(movie_name, grade, message)]


def main():
    """Crawl every page of Douban's Top250 and dump the results to a txt file."""
    base_url = "https://movie.douban.com/top250"
    pages = find_pages(open_url(base_url))
    result = []
    for i in range(pages):
        page_url = "https://movie.douban.com/top250" + "?start=" + str(i * 25) + "&filter="
        # Bug fix: the original used a plain requests.get() here, without the
        # spoofed User-Agent — Douban blocks such requests, so every page
        # after the first came back empty.  Route through open_url() instead.
        res = open_url(page_url)
        result.extend(find_movies(res))
    with open("豆瓣电影评分top250.txt", "w", encoding="utf-8") as f:
        for each in result:
            f.write(each + "\n")


# Run the crawler only when executed as a script, not when imported.
if __name__ == '__main__':
    main()

 

# (Blog-export footer, kept as a comment so the file stays valid Python.)
# You may also be interested in: crawler scraping Douban Top250 movies into a local .txt file.