python 爬取豆瓣某一主题书单_Python爬取豆瓣的各分类书单以及近期热门电影和top250的电影...

pachon2.5.py

# -*- coding: utf-8 -*-

import urllib

import urllib2

import re

import sys

# reload(sys)

# sys.setdefaultencoding('utf-8')

class book: #豆瓣书籍的类

def __init__(self, types, page):
    """Store the settings for one Douban tag book listing.

    Args:
        types: Tag name; getContents appends it to ``baseUrl`` when it
            builds the listing URL.
        page: Paging offset. It is only stored here; the code that would
            add it to the URL as ``?start=`` is commented out in
            getContents.
    """
    # Root of the tag pages; the tag name and '/book' are appended later.
    self.baseUrl = 'http://www.douban.com/tag/'
    self.types = types
    # NOTE(review): presumably the file the scraped list is written to --
    # no visible code uses it; confirm against the rest of the class.
    self.filename = 'doubanbook.txt'
    self.page = page

def getContents(self): #爬取源代码

try:

#if self.page == 0:

url = self.baseUrl + self.types + '/book'

#else:

#url = self.baseUrl + self.types + '/book?start=' + str(self.page)

user_agent = 'Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/43.0.2357.65 Safari/537.36'

headers = { 'User-Agent' : user_agent}

request = urllib2.Request(url,headers = headers)

response = urllib2.urlopen(request)

content = response.read() #.decode('utf-8')

pattern = re.compile('(.*?).*?>(.*?)

',re.S)

你可能感兴趣的:(python,爬取豆瓣某一主题书单)