Scraping Douban's Top 250 movies with BeautifulSoup

A small crawler example using requests and BeautifulSoup. Here is the code:

# coding:utf-8

import requests
from bs4 import BeautifulSoup
import codecs


URL = "https://movie.douban.com/top250"
HEADERS = {'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:47.0) Gecko/20100101 Firefox/47.0'}


def download_page(url):
    # Fetch the raw page bytes; the browser-like User-Agent header matters
    # because Douban tends to reject the default requests user agent
    data = requests.get(url, headers=HEADERS).content
    return data

def parse_html(html):
    soup = BeautifulSoup(html, 'html.parser')
    # While testing, print(soup.prettify()) to inspect the fetched page
    # Locate the movie list by its CSS class
    movie_list_ol = soup.find('ol', attrs={'class': 'grid_view'})
    movie_name_list = []
    # Walk the list items on this page
    for movie_li in movie_list_ol.find_all('li'):
        # Block holding the movie description
        detail = movie_li.find('div', attrs={'class': 'hd'})
        # Movie title
        movie_name = detail.find('span', attrs={'class': 'title'}).getText()
        movie_name_list.append(movie_name)
    # Look for a link to the next page
    next_page = soup.find('span', attrs={'class': 'next'}).find('a')
    if next_page:
        # Build the next page's URL and keep crawling
        return movie_name_list, URL + next_page['href']
    return movie_name_list, None

def main():
    url = URL
    with codecs.open('movies.txt', 'w', encoding='utf-8') as fp:
        movies_all = []
        while url:
            html = download_page(url)
            movies, url = parse_html(html)
            movies_all.extend(movies)
        # Write the collected titles to the file, numbered from 1
        for index, movie in enumerate(movies_all, start=1):
            fp.write('{index}.{movie}\n'.format(index=index, movie=movie))
            

if __name__ == '__main__':
    main()
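
Running the script writes all 250 titles to movies.txt, one per line in the {index}.{movie} format, so the file should start roughly like this (the exact ranking shifts over time):

1.肖申克的救赎
2.霸王别姬
3.阿甘正传
...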

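As a variation, the same titles can be collected with CSS selectors through soup.select instead of the chained find()/find_all() calls above. This is only a minimal sketch under my own assumptions: the function name crawl_titles, the ten-second timeout, and the two-second delay between pages are all mine, the selectors assume Douban's markup still matches the structure the original code targets, and the :first-of-type pseudo-class needs BeautifulSoup 4.7+:

# coding:utf-8

import time

import requests
from bs4 import BeautifulSoup

URL = "https://movie.douban.com/top250"
HEADERS = {'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:47.0) Gecko/20100101 Firefox/47.0'}


def crawl_titles():
    titles = []
    url = URL
    while url:
        html = requests.get(url, headers=HEADERS, timeout=10).content
        soup = BeautifulSoup(html, 'html.parser')
        # One selector replaces the nested find()/find_all() calls; the
        # :first-of-type filter skips the second span.title that holds
        # a foreign film's original-language title
        titles.extend(
            span.get_text()
            for span in soup.select('ol.grid_view li div.hd span.title:first-of-type')
        )
        # The "next" link is relative, so prepend the base URL
        next_link = soup.select_one('span.next a')
        url = URL + next_link['href'] if next_link else None
        time.sleep(2)  # pause between pages to stay polite
    return titles

The :first-of-type filter mirrors what the original code does implicitly: detail.find('span', ...) also returns only the first matching span per entry.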