Python爬取豆瓣电影TOP250

部分代码引用于此:链接
运行环境:macOS 10.13.6,Python 2.7
代码:

# -*- coding:utf-8 -*-
import urllib
import urllib2
# useragent 存放着各个浏览器的User-Agent,自己写的模块,非系统库
import useragent
import BeautifulSoup
import re
import csv
import sys

# Base URL of the Douban Top 250 movie list; main() hands it to start().
url = 'https://movie.douban.com/top250'


def _first_match(pattern, text, default):
    """Return the first ``re.findall`` hit of *pattern* in *text*, else *default*."""
    hits = re.findall(pattern, text)
    return hits[0] if hits else default


def get_movie_info(req):
    """Fetch one page of the Douban Top 250 list and parse its movies.

    Args:
        req: The request (anything ``urllib2.urlopen`` accepts) for one
            page of the list.

    Returns:
        A list with one entry per ``<li>`` of the ``grid_view`` list, each
        of the form
        ``[rank, name, director, starring, year, area, grade, quote]``.
        Director, starring, year, area and quote fall back to a
        placeholder string when they cannot be extracted, so a single
        incomplete entry does not abort the whole crawl.

    Raises:
        Whatever ``urllib2.urlopen`` raises when the page cannot be fetched.
        NOTE(review): a page without the expected ``ol.grid_view`` markup
        still fails with an AttributeError/IndexError from the tag lookups
        below -- confirm whether that should be reported more explicitly.
    """
    res = urllib2.urlopen(req)
    try:
        page = res.read()
    finally:
        # Release the connection even if reading the body fails.
        res.close()

    soup = BeautifulSoup.BeautifulSoup(page)
    data = soup.find('ol', {'class': 'grid_view'})
    record = []
    for item in data.findAll('li'):
        rank = item.find('em').getText()
        name = item.find('img')['alt']
        info = item.find('p').getText()
        # Encode once: the director/starring patterns are UTF-8 byte
        # strings, so they are matched against the encoded text.
        info_utf8 = info.encode('utf-8')
        director = _first_match('导演: (.*?)   ', info_utf8, '佚名')
        starring = _first_match('主演: (.*?) /...', info_utf8, '佚名')
        # Year and area previously raised when absent; they now get a
        # placeholder like the other optional fields.
        year = _first_match(r'\d{4}', info, '未知')
        area = _first_match('/ (.*?) ', info, '未知')
        grade = item.findAll('span', {'class': 'rating_num'})[0].getText()
        quote_tags = item.findAll('span', {'class': 'inq'})
        quote = quote_tags[0].getText() if quote_tags else '无'
        record.append(
            [rank, name, director, starring, year, area, grade, quote])
    return record


def start(url):
    """Crawl every page of the Top 250 list and save the result as CSV.

    Writes ``doubantop250.csv`` in the current working directory: one
    header row followed by the records that ``get_movie_info`` returns for
    each of the ten pages (offsets 0, 25, ..., 225).

    Args:
        url: Base URL of the list; the paging query string is appended to it.
    """
    head = ['排名', '名字', '导演', '主演', '年份', '地区', '评分', '简介']
    # The User-Agent does not change between pages, so build headers once.
    headers = {'User-Agent': useragent.osx_user_agent}
    # Python 2's csv module expects the file to be opened in binary mode;
    # text mode can produce extra carriage returns on some platforms.
    with open('doubantop250.csv', mode='wb') as f:
        fd = csv.writer(f)
        fd.writerow(head)
        for page in range(0, 250, 25):
            # A list of pairs keeps the parameter order stable. The filter
            # value is sent empty rather than as the literal text "None".
            query = urllib.urlencode([('start', page), ('filter', '')])
            # Put the parameters in the URL: passing them as ``data`` would
            # make urllib2 issue a POST instead of the intended GET.
            page_url = url + '?' + query
            request = urllib2.Request(url=page_url, headers=headers)
            print(page_url)
            fd.writerows(get_movie_info(request))

def main():
    """Script entry point: set up string encoding, then run the crawl."""
    # Python 2 only. setdefaultencoding is removed from the sys module at
    # interpreter start-up, so sys has to be reloaded before calling it.
    reload(sys)
    # Make implicit unicode <-> str conversions use UTF-8 instead of ASCII;
    # presumably needed so the scraped unicode fields can be written to the
    # CSV file -- confirm before removing.
    sys.setdefaultencoding('utf-8')
    start(url)


# Run the crawl only when executed as a script, not when imported.
if __name__ == '__main__':
    main()

你可能感兴趣的:(Python爬取豆瓣电影TOP250)