Douban Top 250 Scraper (Complete Version)

from bs4 import BeautifulSoup
import requests, time, pymongo
from multiprocessing import Pool

user_agent = 'Mozilla/5.0 (Windows NT 6.2; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.75 Safari/537.36'
headers = {'User-Agent': user_agent}

proxies = {'http':'60.178.173.125'}
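# Note: 'proxies' above is defined but never passed to requests.get() below; a sketch
# of actually routing through it (requests usually expects 'http://host:port', so the
# entry above likely also needs a port):
# wb_data = requests.get(url, headers=headers, proxies=proxies)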
# Set up the database
client = pymongo.MongoClient('localhost', 27017)    # connect the client
Douban = client['douban']   # the database
url_list = Douban['url_list']   # collection for each movie's link
item_list = Douban['item_list']     # collection for each movie's details
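# Optional sketch: a unique index on 'url' keeps duplicate links from piling up if the
# link-collection step is re-run (insert_one then raises DuplicateKeyError on repeats):
# url_list.create_index('url', unique=True)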

# The Top 250 list spans 10 pages, 25 movies per page
start_url = ['https://movie.douban.com/top250?start={}&filter='.format(str(i)) for i in range(0, 250, 25)]
url0 = 'https://movie.douban.com/top250?start=0&filter='    # first list page, handy for testing

# Detail pages of individual movies, for testing
url_one = 'https://movie.douban.com/subject/6146955/'
url_two = 'https://movie.douban.com/subject/1291546/'
url_404 = 'https://movie.douban.com/subject/5912992/'    # a detail page that no longer exists

link_error = []     # stores links whose detail page no longer exists
def get_movie_info(url):
    wb_data = requests.get(url, headers=headers)
    soup = BeautifulSoup(wb_data.text, 'lxml')
    time.sleep(4)    # throttle so Douban doesn't block us
    if "页面不存在" in soup.find('title').text:    # skip missing detail pages (really the ranking should come from the list pages so it stays complete, at the cost of some fields)
        link_error.append(url)
    else:
        genres = soup.find_all(property="v:genre")    # genres (a movie can have several, so this is a list)
        runtime = soup.find(property="v:runtime")     # runtime (missing on some pages)
        ytime = soup.find(property="v:initialReleaseDate")    # release date (we keep only the year, not the month)
        movie = {
            'number': soup.select('div.top250 > span.top250-no')[0].string,    # Top 250 rank
            'title': soup.find(property="v:itemreviewed").string,    # movie title
            'score': soup.find(property="v:average").string,    # average rating
            'comments_count': soup.find(property="v:votes").string,    # number of ratings
            'genre': [i.string for i in genres],
            'runtime': runtime.string if runtime else None,    # fallback below when the property is missing
            'year': ytime.string[0:4],
            'url': url
        }
        # Some pages lack the v:runtime property; recover the runtime from the raw info block
        if movie['runtime'] is None:
            s = soup.find_all(id='info')[0].get_text(strip=True)
            k = s.find("分")      # index of "分", the end of e.g. "142分钟"
            j = s.find("片长")    # index of the "片长" (runtime) label
            movie['runtime'] = s[j+3:k+1]    # slice between "片长:" (3 chars) and "分", inclusive
        print(movie)
        item_list.insert_one(movie)    # save to MongoDB


# get_movie_info(url_one)
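# Alternative sketch for the runtime fallback inside get_movie_info, using a regex
# instead of manual index arithmetic (assumes the #info text looks like "片长:142分钟";
# 'info_text' is a stand-in for the stripped text pulled from the info block):
# import re
# m = re.search(r'片长[::]?\s*(\d+\s*分)', info_text)
# runtime = m.group(1) if m else None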

# Collect the movie links from one list page
def get_movie_link(url):
    wb_data = requests.get(url, headers=headers)
    soup = BeautifulSoup(wb_data.text, 'lxml')
    links = soup.select('div > div.info > div.hd > a')

    for i in links:
        item_link = i.get('href')
        url_list.insert_one({'url': item_link})    # store each movie link in MongoDB
        print(item_link)

# Put the links from every list page into the database
# for i in start_url:
#     get_movie_link(i)


# Feed the movie links stored in the database to get_movie_info to scrape the detail pages
# for i in url_list.find(no_cursor_timeout=True):    # MongoDB kills idle cursors after ~10 minutes by default; this flag keeps the cursor alive, but it must then be closed manually
#     item_url = i['url']
#     get_movie_info(item_url)
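# A sketch of the manual close mentioned above: wrap the no_cursor_timeout cursor
# in try/finally so it is always released server-side:
# cursor = url_list.find(no_cursor_timeout=True)
# try:
#     for i in cursor:
#         get_movie_info(i['url'])
# finally:
#     cursor.close()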

# Resume support: only scrape URLs whose details haven't been fetched yet
db_urls = [item['url'] for item in url_list.find()]        # every collected URL
index_urls = [item['url'] for item in item_list.find()]    # URLs already scraped
x = set(db_urls)
y = set(index_urls)
rest_of_urls = x - y
# for i in rest_of_urls:
#     get_movie_info(i)
print(rest_of_urls)
print(len(link_error))
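# Pool is imported at the top but never used; a minimal parallel sketch, assuming
# 4 worker processes (note: pymongo's MongoClient is not fork-safe and is ideally
# created per-process, and the module-level link_error list is NOT shared across
# processes):
# if __name__ == '__main__':
#     pool = Pool(4)
#     pool.map(get_movie_info, list(rest_of_urls))
#     pool.close()
#     pool.join()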
# Database checks
# print(url_list.count())    # total number of stored links
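# Note: Collection.count() is deprecated in newer pymongo (removed in 4.0);
# the current way to count the stored links:
# print(url_list.count_documents({}))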

