Scraping Douban's Top 250 Movies with a Python Crawler

This is my first Python crawler project. It was written with Python 3.5 in PyCharm and relies on requests and BeautifulSoup (using the lxml parser). Feedback and corrections are very welcome!


import requests
from bs4 import BeautifulSoup
import time



headers = {
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/49.0.2623.87 Safari/537.36',
    # Add your own Douban 'Cookie' entry here if the pages you request need a logged-in session.
}

def get_links_from():
    # Collect the detail-page URL of every movie across the 10 list pages (start = 0, 25, ..., 225).
    urls = []
    list_urls = ['https://movie.douban.com/top250?start={}&filter='.format(i) for i in range(0, 250, 25)]
    for list_url in list_urls:
        wb_data = requests.get(list_url, headers=headers)
        soup = BeautifulSoup(wb_data.text, 'lxml')
        # Each movie's title block is a <div class="hd"> whose <a> tag links to the detail page.
        for link in soup.select('div.hd a'):
            urls.append(link.get('href'))
    return urls
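
For reference, here is a minimal, self-contained sketch of what the div.hd a selector above actually matches. The HTML string is an illustrative approximation of a single entry on the Top 250 list page, not the real markup:

from bs4 import BeautifulSoup

# Illustrative approximation of one movie entry on the list page (assumed markup, for demonstration only).
sample_html = '''
<div class="item">
    <div class="hd">
        <a href="https://movie.douban.com/subject/1292052/">
            <span class="title">The Shawshank Redemption</span>
        </a>
    </div>
</div>
'''

soup = BeautifulSoup(sample_html, 'lxml')
# 'div.hd a' selects every <a> tag nested inside a <div class="hd">,
# which is exactly how get_links_from() collects the detail-page URLs.
for link in soup.select('div.hd a'):
    print(link.get('href'))  # prints the href attribute of the matched <a> tag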


def get_item_info():
    urls = get_links_from()
    for url in urls:
        time.sleep(1)  # pause between requests to avoid hammering the server
        wb_data = requests.get(url, headers=headers)
        soup = BeautifulSoup(wb_data.text, 'lxml')

        # Pull each field off the detail page with a CSS selector.
        title = soup.select('span[property="v:itemreviewed"]')[0].text
        rank = soup.select('span.top250-no')[0].text
        director = soup.select('a[rel="v:directedBy"]')[0].text
        score = soup.select('strong[property="v:average"]')[0].text
        poster = soup.select('a.nbgnbg img')[0].get('src')
        print('Title: %s, Rank: %s, Score: %s, Director: %s, Poster: %s' % (title, rank, score, director, poster))

get_item_info()
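
If you want to keep the results rather than just print them, a natural next step is to write each record to a CSV file. The sketch below is my own addition, not part of the original script; the save_to_csv helper and the column names are hypothetical:

import csv

def save_to_csv(rows, path='douban_top250.csv'):
    # rows is expected to be a list of (title, rank, score, director, poster) tuples.
    with open(path, 'w', newline='', encoding='utf-8') as f:
        writer = csv.writer(f)
        writer.writerow(['title', 'rank', 'score', 'director', 'poster'])
        writer.writerows(rows)

To use it, have get_item_info() append each movie's fields to a list of tuples instead of printing them, then call save_to_csv(rows) after the loop finishes.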




