Python 爬取糗事百科

以下是面向过程版本的代码：

import urllib
import urllib2
import re

# Procedural version: fetch one "hot" page of qiushibaike.com and print every
# text fragment captured by the pattern below. Written for Python 2 (urllib2);
# the syntax used here (``except ... as``, ``print(...)`` with one argument)
# is also valid Python 3 syntax.
page = 1
url = 'http://www.qiushibaike.com/hot/page/' + str(page)
# A browser-like User-Agent is sent with the request.
user_agent = 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'
headers = {'User-Agent': user_agent}

try:
    request = urllib2.Request(url, headers=headers)
    response = urllib2.urlopen(request)
    try:
        content = response.read().decode('utf-8')
    finally:
        # Release the connection even if reading or decoding fails.
        response.close()
except urllib2.URLError as e:
    # HTTPError carries ``code``; a plain URLError carries ``reason``.
    if hasattr(e, "code"):
        print(e.code)
    if hasattr(e, "reason"):
        print(e.reason)
else:
    # NOTE(review): the opening-tag part of this pattern was lost from the
    # published listing (only ``([\s\S]+?)<\/span>`` survived). The
    # ``<div class="content">\s*<span>`` prefix is a reconstruction --
    # confirm it against the current page markup.
    pattern = re.compile(r'<div class="content">\s*<span>([\s\S]+?)<\/span>')
    items = re.findall(pattern, content)
    for item in items:
        print(item)

以下是面向对象版本的代码：

import urllib
import urllib2
import re

class QSBK:
    """Download a page and extract the text fragments matched by a pattern.

    The instance keeps the page URL and the HTTP headers passed to
    ``__init__``; the class-level empty strings are only placeholders.
    """

    url = ''
    headers = ''

    def __init__(self, url, headers):
        """Store the page URL and the request headers on the instance."""
        self.url = url
        self.headers = headers

    def request(self):
        """Open ``self.url`` with ``self.headers`` and return the response.

        The caller is responsible for closing the returned response.
        """
        # Use the instance's own URL rather than the module-level ``url``,
        # so the object works regardless of what globals exist.
        request = urllib2.Request(self.url, headers=self.headers)
        return urllib2.urlopen(request)

    def decode(self):
        """Return the response body decoded as UTF-8 text."""
        response = self.request()
        try:
            return response.read().decode('utf-8')
        finally:
            # Release the connection even if reading or decoding fails.
            response.close()

    def solve_data(self):
        """Return the list of strings captured from the decoded page."""
        # NOTE(review): the opening-tag part of this pattern was lost from
        # the published listing (only ``([\s\S]+?)<\/span>`` survived). The
        # ``<div class="content">\s*<span>`` prefix is a reconstruction --
        # confirm it against the current page markup.
        pattern = re.compile(
            r'<div class="content">\s*<span>([\s\S]+?)<\/span>')
        content = self.decode()
        return re.findall(pattern, content)

    def print_data(self):
        """Print every extracted item, one per ``print`` call."""
        for item in self.solve_data():
            # Single-argument form: same output under Python 2 and 3.
            print(item)


# Run the demo only when executed as a script, so that importing this module
# does not trigger a network request.
if __name__ == '__main__':
    page = 1
    url = 'http://www.qiushibaike.com/hot/page/' + str(page)
    user_agent = 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'
    headers = {'User-Agent': user_agent}
    test = QSBK(url, headers)
    test.print_data()

你可能感兴趣的:(python爬取糗事百科)