【转】[爬虫] 爬取豆瓣TOP250电影排行榜

 

原文链接:

https://fishc.com.cn/thread-94979-1-1.html

  1. import requests
  2. import bs4
  3. import re
  4.  
  5. def open_url(url):
  6.     # 使用代理
  7.     # proxies = {"http": "127.0.0.1:1080", "https": "127.0.0.1:1080"}
  8.     headers = {'user-agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/57.0.2987.98 Safari/537.36'}
  9.  
  10.     # res = requests.get(url, headers=headers, proxies=proxies)
  11.     res = requests.get(url, headers=headers)
  12.  
  13.     return res
  14.  
  15. def find_movies(res):
  16.     soup = bs4.BeautifulSoup(res.text, 'html.parser')
  17.  
  18.     # 电影名
  19.     movies = []
  20.     targets = soup.find_all("div", class_="hd")
  21.     for each in targets:
  22.         movies.append(each.a.span.text)
  23.  
  24.     # 评分
  25.     ranks = []
  26.     targets = soup.find_all("span", class_="rating_num")
  27.     for each in targets:
  28.         ranks.append(' 评分:%s ' % each.text)
  29.  
  30.     # 资料
  31.     messages = []
  32.     targets = soup.find_all("div", class_="bd")
  33.     for each in targets:
  34.         try:
  35.             messages.append(each.p.text.split('\n')[1].strip() + each.p.text.split('\n')[2].strip())
  36.         except:
  37.             continue
  38.  
  39.     result = []
  40.     length = len(movies)
  41.     for i in range(length):
  42.         result.append(movies[i] + ranks[i] + messages[i] + '\n')
  43.  
  44.     return result
  45.  
  46. # 找出一共有多少个页面
  47. def find_depth(res):
  48.     soup = bs4.BeautifulSoup(res.text, 'html.parser')
  49.     depth = soup.find('span', class_='next').previous_sibling.previous_sibling.text
  50.  
  51.     return int(depth)
  52.  
  53. def main():
  54.     host = "https://movie.douban.com/top250"
  55.     res = open_url(host)
  56.     depth = find_depth(res)
  57.  
  58.     result = []
  59.     for i in range(depth):
  60.         url = host + '/?start=' + str(25 * i)
  61.         res = open_url(url)
  62.         result.extend(find_movies(res))
  63.  
  64.     with open("豆瓣TOP250电影.txt", "w", encoding="utf-8") as f:
  65.         for each in result:
  66.             f.write(each)
  67.     
  68. if __name__ == "__main__":
  69.     main()

你可能感兴趣的:(【转】[爬虫] 爬取豆瓣TOP250电影排行榜)