Scraping Douban Books with Python

I recently wanted to learn a bit of web scraping by crawling Douban's book listings, fetching them by category and storing each category separately, so I wrote a first-pass crawler with regular expressions. For now it only scrapes the category list pages; later I plan to extend it to fetch each book's ISBN and build my own database (a rough sketch of that ISBN step follows the code below).

# -*- coding:utf-8 -*-
import urllib2
import re
import sys

# Douban tag names used to build the tag URLs (kept in Chinese, as Douban's tags are)
tags = [u'小说', u'散文', u'历史', u'爱情', u'管理', u'编程', u'生活', u'心理']
haveBooked = set()


class BookSpider:
    def __init__(self):
        # Python 2 default-encoding hack so Chinese strings can be written to files
        reload(sys)
        sys.setdefaultencoding('utf-8')
        self.start = 0      # offset into the current tag's list (20 books per page)
        self.tagIndex = 0   # index into tags
        self.param = '&filter=&type='
        self.headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64)'}
        self.filePath = 'DoubanTop250.txt'

    def GetPage(self):
        try:
            # List page of the current tag; 'start' pages through results 20 at a time
            URL = ('https://book.douban.com/tag/' + tags[self.tagIndex]
                   + '?start=' + str(self.start) + self.param)
            request = urllib2.Request(url=URL, headers=self.headers)
            response = urllib2.urlopen(request)
            page = response.read().decode('utf-8')
            pageNum = self.start / 20 + 1
            self.start += 20
            print 'Fetching page ' + str(pageNum) + ' of tag ' + tags[self.tagIndex]
            return page
        except urllib2.URLError, e:
            if hasattr(e, 'reason'):
                print 'Fetch failed, reason:', e.reason

    def WriteBookTitle(self, titleInfo, fileBook):
        # The title block may carry a subtitle wrapped in a <span>; capture both parts
        patternTitle = re.compile(u'(.*?)<span.*?>(.*?)</span>', re.S)
        result = re.match(patternTitle, titleInfo)
        if result is None:
            fileBook.write('title: ' + titleInfo + '\r\n')
        else:
            titles = re.findall(patternTitle, titleInfo)
            fileBook.write('title: ' + titles[0][0].strip() + titles[0][1].strip() + '\r\n')

    def WriteBookPubInfo(self, pubInfo, fileBook):
        # The pub line reads "author / [translator /] publisher / date / price";
        # a trailing '!' gives the last field a terminator for the regex to stop at
        pubInfo += '!'
        patternPub1 = re.compile(u'(.*?)/(.*?)/(.*?)/(.*?)/(.*?)!', re.S)
        patternPub2 = re.compile(u'(.*?)/(.*?)/(.*?)/(.*?)!', re.S)
        result = re.match(patternPub1, pubInfo)
        if result is None:
            pubs = re.findall(patternPub2, pubInfo)
            fileBook.write('author: ' + pubs[0][0].strip() + '\r\n')
            fileBook.write('pubHouse: ' + pubs[0][1].strip() + '\r\n')
            fileBook.write('pubDate: ' + pubs[0][2].strip() + '\r\n')
            fileBook.write('price: ' + pubs[0][3].strip() + '\r\n\r\n')
        else:
            pubs = re.findall(patternPub1, pubInfo)
            fileBook.write('author: ' + pubs[0][0].strip() + '\r\n')
            fileBook.write('translator: ' + pubs[0][1].strip() + '\r\n')
            fileBook.write('pubHouse: ' + pubs[0][2].strip() + '\r\n')
            fileBook.write('pubDate: ' + pubs[0][3].strip() + '\r\n')
            fileBook.write('price: ' + pubs[0][4].strip() + '\r\n\r\n')

    def GetBook(self):
        # One match per book on the tag list page: subject id, title block, pub info.
        # The HTML structure is taken from book.douban.com tag pages and may change.
        pattern = re.compile(u'<li class="subject-item">.*?'
            + u'<a href="https://book.douban.com/subject/(\\d+)/".*?>(.*?)</a>.*?'
            + u'<div class="pub">(.*?)</div>', re.S)
        try:
            # Only the first two tags for now, while the crawler is being tested
            while self.tagIndex < 2:
                fileName = "book" + tags[self.tagIndex] + ".txt"
                # Encode the filename as GBK for the Windows filesystem
                fileName = fileName.decode("utf-8").encode("gb2312")
                print fileName
                fileBook = open(fileName, 'w')
                # Only the first list page (20 books) per tag for now
                while self.start < 20:
                    page = self.GetPage()
                    books = re.findall(pattern, page)
                    for book in books:
                        fileBook.write('subject_id: ' + book[0].strip() + '\r\n')
                        self.WriteBookTitle(book[1].strip(), fileBook)
                        self.WriteBookPubInfo(book[2].strip(), fileBook)
                fileBook.close()
                self.start = 0
                self.tagIndex += 1
        except:
            print ('Failed on tag ' + tags[self.tagIndex]
                   + ', page ' + str(self.start / 20 + 1))

    def main(self):
        print 'Start crawling book data'
        self.GetBook()
        print 'Done...'


DoubanSpider = BookSpider()
DoubanSpider.main()
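
The ISBN step mentioned at the top is not implemented yet. Below is a minimal sketch, in the same urllib2 + regex style, of how it might work: fetch the detail page for a subject_id collected above and pull the ISBN out of the page's info block. The helper name GetISBN and the exact regex are my own assumptions about the current layout of book.douban.com detail pages, so treat this as a starting point rather than a tested implementation.

# Hypothetical helper, not part of the crawler above: look up the ISBN of one book.
# Assumes the detail page contains something like '<span class="pl">ISBN:</span> 9787111111111'.
def GetISBN(subject_id, headers):
    url = 'https://book.douban.com/subject/' + subject_id + '/'
    request = urllib2.Request(url=url, headers=headers)
    page = urllib2.urlopen(request).read().decode('utf-8')
    match = re.search(ur'ISBN:</span>\s*(\d{10,13})', page)
    if match:
        return match.group(1)
    return None

It could be called from GetBook right after subject_id is written, e.g. fileBook.write('isbn: ' + (GetISBN(book[0], self.headers) or '') + '\r\n'), at the cost of one extra request per book.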
