爬虫之糗事百科

# -*- coding:utf-8 -*-
import urllib
import urllib2
import re

# Fetch one page of the "hot" listing from qiushibaike.com and print, for each
# post matched by `pattern`, the five captured fields (see the group layout
# documented next to `pattern`).
#
# NOTE(review): this script was recovered from an article whose extraction
# stripped every tag-like substring out of the regex literals and collapsed
# the post-processing loop onto a single line.  What survived is the
# '"content">' fragment, the five capture groups, the re.S flag and the
# item[0]..item[4] usage.  The tag text inside `pattern`, `imgPattern` and
# `replaceBR`, as well as the `haveImg` test, are best-effort reconstructions
# -- verify them against the real page markup before relying on the output.
page = 1
url = 'http://www.qiushibaike.com/hot/page/' + str(page)
# The request carries an explicit browser-style User-Agent header.
user_agent = 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'
headers = {'User-Agent': user_agent}

# Five groups, consumed below as:
#   item[0] -> first printed field (presumably the author name -- TODO confirm)
#   item[1] -> post text, may contain <br/> tags
#   item[2] -> markup between the text and the counters; may hold an <img>
#   item[3], item[4] -> trailing fields (presumably vote / comment counts)
# re.S lets '.' match newlines so a single match can span several HTML lines.
pattern = re.compile(
    '<div class="author.*?<h2>(.*?)</h2>'
    '.*?"content">.*?<span>(.*?)</span>.*?</a>'
    '(.*?)<div class="stats'
    '.*?class="number">(.*?)</i>'
    '.*?class="number">(.*?)</i>',
    re.S)
# Extracts the image address from item[2] (reconstructed literal, see NOTE).
imgPattern = re.compile('<img src="(.*?)"')
# Line-break tag to be turned into a real newline (reconstructed literal).
replaceBR = re.compile('<br/>')

try:
    # Only the network round trip can raise URLError, so keep the try narrow.
    request = urllib2.Request(url, headers=headers)
    response = urllib2.urlopen(request)
    try:
        content = response.read().decode('utf-8')
    finally:
        # Release the connection even if reading or decoding fails.
        response.close()
except urllib2.URLError as e:
    # HTTPError carries `code`; a plain URLError carries only `reason`.
    if hasattr(e, "code"):
        print(e.code)
    if hasattr(e, "reason"):
        print(e.reason)
else:
    for item in pattern.findall(content):
        # Use a separate name so the downloaded page body is not overwritten.
        text = replaceBR.sub("\n", item[1])
        fields = [item[0], text]
        haveImg = re.search("img", item[2])
        if haveImg:
            # Guard against "img" appearing without a parsable src attribute
            # instead of indexing blindly into an empty findall() result.
            found = imgPattern.search(item[2])
            if found:
                fields.append(found.group(1))
        fields.extend([item[3], item[4]])
        # Single-argument print() keeps the space-separated output and is
        # valid syntax on both Python 2 and Python 3.
        print(u' '.join(fields))

你可能感兴趣的:(爬虫之糗事百科)