#!/usr/bin/env python # coding: utf-8 import re import urllib2 class doubanTop10: def __init__(self): self.url = 'http://movie.douban.com/chart' self.datas = [] self._top_num = 1 print "正在爬取豆瓣新片榜...\n" def get_data(self, url): url = self.url try: page_data = urllib2.urlopen(url).read().decode('utf-8') except urllib2.URLError, e: if hasattr(e, 'code'): print "The server couldn't fulfill the request." print "Error code: %s" % e.code elif hasattr(e, 'reason'): print "We failed to reach a server. Please check your url and read the Reason." print "Reason: %s" % e.reason return page_data def find_title(self, page_data): temp_data = [] # print page_data movie_items = re.findall(r'<a.*?class="nbg".*?title="(.*?)">', page_data, re.S) for index, item in enumerate(movie_items): if item.find(" ") == -1: temp_data.append("Top" + str(self._top_num) + " " + item) self._top_num += 1 self.datas.extend(temp_data) def start_spider(self): my_page = self.get_data(self.url) self.find_title(my_page) def main(): spider = doubanTop10() spider.start_spider() for item in spider.datas: print item print "\n爬取完成!" if __name__ == '__main__': main()