爬虫分页爬取糗事百科

学完Python正则表达式后写的练习,主要参考了CQC的博客:http://cuiqingcai.com/990.html。


功能:匹配出段子的作者、发表时间、点赞数、内容和配图,并支持跳转到前一页、后一页、指定页,以及退出。


修改后代码如下(20160220匹配成功):

#!/usr/bin/env python
#-*-coding:utf-8 -*-
__author__ = "PS"
"""
modified from CQC
http://cuiqingcai.com/990.html
python version : 2.7.9 
"""
import urllib
import urllib2
import re
import time
class Scrapy_qiushibaike(object):
    """Interactive console reader for the qiushibaike.com "hot" listing.

    Each listing page is downloaded, the stories on it are extracted with
    regular expressions, printed, and the user is then asked where to go
    next (next page, previous page, a given page number, or quit).

    The code targets Python 2.7 (``urllib2``, ``raw_input``).
    """

    # Matches one story. Capture groups, in order: author name, content
    # HTML, text of the HTML comment that follows the content (interpreted
    # as a Unix timestamp), the markup between that comment's enclosing div
    # and the stats section (searched for images), and the vote count.
    _ITEM_PATTERN = re.compile('<div.*?article.*?'
                               '<h2>(.*?)</h2>.*?'
                               '<div class="content">(.*?)'
                               '<!--(.*?)-->.*?'
                               'div>(.*?)class="stats".*?'
                               'class="number">(.*?)</i>', re.S)
    _IMG_PATTERN = re.compile('<img src="(.*?)"')
    _BR_PATTERN = re.compile('<br/>')
    _PROMPT = ("'n' -> next page, 'p' -> previous page, number -> that page, "
               "q/Q -> quit,others -> current page:")

    def __init__(self):
        # Number of the listing page shown next (1-based).
        self.pageIndex = 1
        self.user_agent = 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/45.0.2454.101 Safari/537.36'
        # Sent with every request so the site sees a browser user agent.
        self.headers = {'User-Agent': self.user_agent}
        # Buffer of parsed pages; each entry is the story list of one page.
        self.stories = []
        # Cleared when the user quits; stops both loading and the main loop.
        self.enable = True

    def get_page(self, pageIndex):
        """Download listing page ``pageIndex`` and return it as text.

        Returns the response body decoded as UTF-8, or None when the
        request fails with ``urllib2.URLError`` (the reason is printed).
        """
        url = 'http://www.qiushibaike.com/hot/page/' + str(pageIndex)
        try:
            request = urllib2.Request(url, headers=self.headers)
            response = urllib2.urlopen(request)
            try:
                return response.read().decode('utf-8')
            finally:
                # Release the connection even if reading or decoding fails.
                response.close()
        except urllib2.URLError as e:
            print("connect to the web error %s" % (getattr(e, "reason", e),))
            return None

    def get_page_items(self, pageIndex):
        """Fetch page ``pageIndex`` and return its stories.

        Returns a list with one entry per story, each of the form
        ``[author, text, time, image_url, support_number]`` (all strings),
        or None when the page could not be downloaded.
        """
        page_code = self.get_page(pageIndex)
        if not page_code:
            print("response failure")
            return None
        return self._parse_page(page_code)

    def _parse_page(self, page_code):
        """Extract the story records from the HTML of one listing page."""
        page_stories = []
        for author, content, stamp, media, votes in \
                self._ITEM_PATTERN.findall(page_code):
            image_urls = self._IMG_PATTERN.findall(media)
            # Separate several images with a space so each URL stays usable.
            img_url = ' '.join(image_urls) if image_urls else 'no image'
            text = self._BR_PATTERN.sub("\n", content)
            page_stories.append([author.strip(), text.strip(),
                                 self._format_time(stamp),
                                 img_url, votes.strip()])
        return page_stories

    @staticmethod
    def _format_time(raw_stamp):
        """Format a Unix timestamp string as ``YYYY-mm-dd HH:MM:SS`` (UTC).

        Text that is not a usable timestamp is returned stripped instead of
        raising, so one odd story does not abort the whole page.
        """
        try:
            moment = time.gmtime(float(raw_stamp))
        except (ValueError, OverflowError, OSError):
            return raw_stamp.strip()
        return time.strftime('%Y-%m-%d %H:%M:%S', moment)

    def load_page(self):
        """Fetch the current page into ``self.stories`` when appropriate.

        Nothing is fetched once the reader has been stopped or while the
        buffer already holds more than two pages. A page that failed to
        download or produced no stories is not added.
        """
        if not self.enable:
            return
        if len(self.stories) > 2:
            return
        page_stories = self.get_page_items(self.pageIndex)
        if page_stories:
            self.stories.append(page_stories)

    def get_one_page_story(self):
        """Print the oldest buffered page and remove it from the buffer."""
        self.load_page()
        if not self.stories:
            # Download failed or nothing matched: report it and let the
            # caller prompt again instead of failing on an empty buffer.
            print("no stories available for page %d" % self.pageIndex)
            return
        for story in self.stories.pop(0):
            print("page%d\nauthor:%s\ntime:%s\nsupport_number:%s\n%s\n%s\n" % (self.pageIndex, story[0], story[2], story[4], story[1], story[3]))

    def _handle_command(self, command):
        """Apply one navigation command; return False when the user quits.

        'n' (or the older 'f') moves to the next page, 'p' (or the older
        'b') to the previous one, a number jumps to that page and 'q'/'Q'
        quits. Anything else leaves the current page selected. The page
        number never goes below 1.
        """
        command = command.strip()
        if command in ('q', 'Q'):
            self.enable = False
            return False
        if command in ('n', 'f'):
            self.pageIndex += 1
        elif command in ('p', 'b'):
            self.pageIndex = max(1, self.pageIndex - 1)
        elif command.isdigit():
            self.pageIndex = max(1, int(command))
            print(self.pageIndex)
        return True

    def start(self):
        """Run the show-page / read-command loop until the user quits."""
        while self.enable:
            self.get_one_page_story()
            command = raw_input(self._PROMPT)
            if not self._handle_command(command):
                return None
          
if __name__ == '__main__':
    # Script entry point: build the crawler and enter its interactive loop.
    Scrapy_qiushibaike().start()

        

            


你可能感兴趣的:(python,爬虫,糗百)