豆瓣电影 top250 爬虫

发现没什么好说的,主要是 pyquery 这个库比较好用,能像操作 DOM 一样解析网页。
主要功能:

  1. 将爬取的网页先保存到本地,然后解析,避免重复请求。
  2. 将解析的结果保存到 MongoDB。
import os

import pymongo
import requests
from pyquery import PyQuery as pq


class Model(object):
    """
    Base class that gives subclasses a readable, multi-line ``repr``.
    """

    def __repr__(self):
        """
        Return the class name followed by one ``attr=(value)`` entry per
        instance attribute, each on its own indented line.
        """
        entries = []
        for attr, value in vars(self).items():
            entries.append('{}=({})'.format(attr, value))
        body = '\n  '.join(entries)
        return '\n<{} \n  {}>'.format(type(self).__name__, body)


class Movie(Model):
    """
    Plain record holding the fields scraped for one film.
    """

    def __init__(self):
        # Text fields start as empty strings and numeric fields as zero.
        # Attributes are created in the order name, score, quote, cover_url,
        # ranking, which is the order Model.__repr__ will list them in.
        self.name, self.score, self.quote = '', 0, ''
        self.cover_url, self.ranking = '', 0


def cached_url(url, timeout=10):
    """
    Return the raw bytes of the page at ``url``, using an on-disk cache.

    Pages are stored in the ``cached`` folder (relative to the current
    working directory). The file name is whatever follows the first ``=`` in
    the URL (the whole URL when it contains no ``=``) plus ``.html``; any
    character other than a letter, digit, ``-``, ``_`` or ``.`` is replaced
    by ``_`` so the name can never contain a path separator.

    :param url: address of the page to fetch.
    :param timeout: seconds to wait for the server before giving up.
    :return: the page body as ``bytes`` (from the cache when available).
    :raises requests.RequestException: when the request fails or the server
        answers with an error status; nothing is cached in that case.
    """
    folder = 'cached'
    key = url.split('=', 1)[-1]
    safe_key = ''.join(
        ch if ch.isalnum() or ch in '-_.' else '_' for ch in key
    )
    path = os.path.join(folder, safe_key + '.html')

    # Cache hit: serve the stored copy without touching the network.
    if os.path.exists(path):
        with open(path, 'rb') as f:
            return f.read()

    # Each header gets its own key; a single value containing a newline is
    # not a valid HTTP header.
    headers = {
        'User-Agent': (
            'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 '
            '(KHTML, like Gecko) Chrome/57.0.2987.98 Safari/537.36'
        ),
        'Accept': (
            'text/html,application/xhtml+xml,application/xml;q=0.9,'
            'image/webp,*/*;q=0.8'
        ),
    }
    # ``headers`` must be passed by keyword: the second positional argument
    # of requests.get is ``params`` (the query string).
    r = requests.get(url, headers=headers, timeout=timeout)
    # Fail loudly instead of caching an error page for every later run.
    r.raise_for_status()

    os.makedirs(folder, exist_ok=True)
    with open(path, 'wb') as f:
        f.write(r.content)
    return r.content


def movie_from_div(div):
    """
    Build a Movie from the element that holds one film entry.
    """
    node = pq(div)

    # Read every field from the element first, then fill in the record.
    ranking_text = node('.pic').find('em').text()
    cover = node('img').attr('src')
    quote_text = node('.inq').text()
    score_text = node('.rating_num').text()
    title_text = node('.title').text()

    movie = Movie()
    movie.name = title_text
    movie.score = score_text
    movie.quote = quote_text
    movie.cover_url = cover
    movie.ranking = ranking_text
    return movie


def movies_from_url(url):
    """
    Fetch the page at ``url`` (through the cache) and return a list with one
    Movie per ``.item`` element found in it.
    """
    html = cached_url(url)
    document = pq(html)
    # Each ``.item`` node is handed to movie_from_div in document order.
    return list(map(movie_from_div, document('.item')))


def download_image(url, file):
    """
    Download the image at ``url`` into the ``img`` folder, once.

    The file name is the part of ``file`` before the first ``/``, with
    surrounding whitespace removed, plus ``.jpg``. If that file already
    exists, or ``url`` is empty/``None``, nothing is downloaded.

    :param url: address of the image; may be ``None`` when no image was found.
    :param file: label used to derive the file name.
    :return: ``None``.
    :raises requests.RequestException: when the request fails or the server
        answers with an error status; no file is written in that case.
    """
    # Nothing to fetch when the caller found no image source.
    if not url:
        return

    folder = "img"
    # Strip so that a label such as "Name / Other" does not produce a file
    # name ending in a space before the extension.
    name = file.split("/")[0].strip() + '.jpg'
    path = os.path.join(folder, name)

    os.makedirs(folder, exist_ok=True)

    # Already downloaded on an earlier run.
    if os.path.exists(path):
        return

    # Each header gets its own key; a single value containing a newline is
    # not a valid HTTP header.
    headers = {
        'User-Agent': (
            'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 '
            '(KHTML, like Gecko) Chrome/57.0.2987.98 Safari/537.36'
        ),
        'Accept': (
            'text/html,application/xhtml+xml,application/xml;q=0.9,'
            'image/webp,*/*;q=0.8'
        ),
    }
    # ``headers`` must be passed by keyword: the second positional argument
    # of requests.get is ``params`` (the query string).
    r = requests.get(url, headers=headers, timeout=10)
    # Do not save an error page under a .jpg name.
    r.raise_for_status()
    with open(path, 'wb') as f:
        f.write(r.content)


def savemovies(movies):
    """
    Store the given movies in MongoDB.

    One document per movie (keys ``name``, ``score``, ``quote``, ``ranking``,
    ``cover_url``) is written to the ``movies`` collection of the
    ``DoubanMovies_db`` database, using a client created with the default
    connection settings.

    :param movies: iterable of objects exposing the five attributes above.
    :return: ``None``.
    """
    documents = [
        {
            'name': m.name,
            'score': m.score,
            'quote': m.quote,
            'ranking': m.ranking,
            'cover_url': m.cover_url,
        }
        for m in movies
    ]
    # insert_many rejects an empty list, and there is no reason to open a
    # connection when there is nothing to store.
    if not documents:
        return

    connection = pymongo.MongoClient()
    try:
        # One batched write instead of one round-trip per movie.
        connection.DoubanMovies_db.movies.insert_many(documents)
    finally:
        # Release the client's sockets; this function is called once per page.
        connection.close()


def main():
    """
    Walk the ten pages of the top-250 list (``start`` = 0, 25, ..., 225);
    for each page parse the movies, save them to MongoDB, print them and
    download their cover images.
    """
    for start in range(0, 250, 25):
        url = 'https://movie.douban.com/top250?start={}'.format(start)
        movies = movies_from_url(url)
        savemovies(movies)
        print('top250 movies', movies)
        # Plain loop: the downloads are side effects, so there is no list
        # worth building.
        for m in movies:
            download_image(m.cover_url, str(m.name))


# Run the scraper only when this file is executed as a script, not on import.
if __name__ == '__main__':
    main()

你可能感兴趣的:(豆瓣电影 top250 爬虫)