python 百度贴吧爬虫(下载图片)

业余时间用 Python 写的百度贴吧爬虫程序,算是学习 Python 编程的一个练习。

本程序可以针对给定的贴吧帖子链接,把楼主的发言或图片爬取出来,目前的主要功能是下载楼主发的所有图片。爬取楼主发言的功能仅支持屏幕输出,没有保存到本地文件,有兴趣的朋友可以自行补充。仅供学习,转载请标明出处。

tieba_spider.py

#coding:utf-8
import urllib2,re,time,threading
import DownQueue

# Module-level configuration shared by the crawler class and the main block.
user_agent='Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/37.0.2062.120 Safari/537.36' # User-Agent string imitating a desktop browser
url='http://tieba.baidu.com/p/3271638607?see_lz=1&pn=' # Tieba thread URL in "only original poster" view; the page number is appended after "pn="
header={'User-Agent' : user_agent} # request headers passed to urllib2.Request

g_worker=DownQueue.down() # shared downloader thread object that receives picture URLs

class Tieba_Spider(threading.Thread):
    """Crawler thread for a single Baidu Tieba thread.

    The thread first fetches ``url`` to learn the page count and title, then
    fetches ``url + str(page)`` for every page. Depending on ``type`` it
    either prints the post bodies (0) or pushes every picture URL to the
    module-level ``g_worker`` downloader (1).

    Written for Python 2 (``urllib2``); single-argument ``print(...)`` is used
    so the output is the same as with the print statement.
    """

    def __init__(self, url, type=0):
        """Store the crawl settings.

        url  -- thread URL ending in ``pn=`` so a page number can be appended.
        type -- 0: print post text to the screen (default), 1: queue pictures
                for download. The name shadows the builtin but is kept for
                backward compatibility with keyword callers.
        """
        threading.Thread.__init__(self)
        self.url = url
        self.type = type
        self.num = 0      # number of pages, filled in by get_info()
        self.title = ''   # thread title, filled in by get_info()

    def run(self):
        """Thread entry point: run the whole crawl."""
        self.start_spider()

    def _fetch(self, page_url):
        """Return the body of *page_url* as text, or None on a URL error.

        The page is decoded as GBK with undecodable bytes replaced, so one
        bad byte does not abort the crawl. URL errors are reported on stdout.
        The response is always closed.
        """
        try:
            response = urllib2.urlopen(urllib2.Request(page_url, headers=header))
            try:
                return response.read().decode('gbk', 'replace')
            finally:
                response.close()
        except urllib2.URLError as e:
            if hasattr(e, 'code'):
                print('Error code : %s' % (e.code,))
            if hasattr(e, 'reason'):
                print('Reason : %s' % (e.reason,))
            return None

    def get_info(self):
        """Fetch the base URL and record the page count and title.

        On a URL error ``num`` stays 0 and ``title`` stays empty.
        """
        htm = self._fetch(self.url)
        if htm is None:
            return
        self.num = self.get_page_num(htm)
        print('It has %d page' % self.num)
        self.title = self.get_title(htm)
        print('It\'s title is %s' % self.title)

    def start_spider(self):
        """Crawl every page, then tell the downloader that no more work follows.

        The stop flag is set in a ``finally`` clause: if it were skipped
        because of an unexpected exception, the downloader thread would wait
        for more URLs forever.
        """
        try:
            self.get_info()
            for i in range(1, self.num + 1):
                print('start :  %s' % i)
                htm = self._fetch(self.url + str(i))
                if htm is None:
                    # Page could not be fetched (already reported); try the next one.
                    continue
                if self.type == 0:
                    self.page_deal(htm)
                elif self.type == 1:
                    self.down_pic(htm)
        finally:
            g_worker.set_flag(True)

    def get_page_num(self, htm):
        """Return the page count found in *htm*, or 0 when it is absent."""
        # \d+ (not \d*) so an empty <span> can never reach int('').
        match = re.search(r'<span class="red">(\d+)</span>', htm)
        if match:
            return int(match.group(1))
        return 0

    def get_title(self, htm):
        """Return the thread title found in *htm*, or '' when it is absent."""
        match = re.search(r'class="core_title_txt(\s+)"(\s+)title="(.*?)"', htm)
        if match:
            return match.group(3)
        print('no match title')
        return ''

    def page_deal(self, htm):
        """Print the body of every post found in *htm*."""
        posts = re.findall(r'id="post_content_(.*?)">(.*?)</div>', htm)
        if not posts:
            print('no deal')
            return
        for _post_id, body in posts:
            print('%s \n' % body)

    def down_pic(self, htm):
        """Push the URL of every post picture found in *htm* to ``g_worker``."""
        pictures = re.findall(r'<img class="BDE_Image" pic_type=(.*?)src="(.*?)"', htm)
        if not pictures:
            print('no deal')
            return
        for _pic_type, src in pictures:
            print('picture url : %s \n' % src)
            g_worker.push(src)
        
if __name__ == '__main__':
    # Mode 1 downloads the pictures; mode 0 prints the original poster's
    # posts to the screen instead.
    crawler = Tieba_Spider(url, 1)

    # Start the crawler first, then the downloader that consumes its queue.
    for worker in (crawler, g_worker):
        worker.start()

DownQueue.py

#coding:utf-8
import os
import Queue
import re
import threading
import time
import urllib2

class down(threading.Thread):
    """Worker thread that downloads queued picture URLs into ``SAVE_DIR``.

    Producers call ``push(url)`` for each picture and ``set_flag(True)`` once
    nothing more will be queued. The thread exits when the stop flag is set
    and the queue has been drained.

    Written for Python 2 (``urllib2``/``Queue``); single-argument
    ``print(...)`` is used so the output is the same as with the print
    statement.
    """

    SAVE_DIR = './spider_pic'  # directory the pictures are written to

    def __init__(self):
        threading.Thread.__init__(self)
        self.queue = Queue.Queue(1000)
        # Counts wake-ups for run(): one per pushed URL, plus one per stop request.
        self.semaphore = threading.Semaphore(0)
        self.flag = False  # True once the producer has asked the worker to stop

    def push(self, obj):
        """Queue the picture URL *obj* and wake the worker."""
        self.queue.put(obj)
        self.semaphore.release()

    def set_flag(self, f):
        """Set the stop flag; a true value also wakes the worker.

        Without the wake-up the worker could sit in ``semaphore.acquire()``
        forever when the stop request arrives while the queue is empty.
        """
        self.flag = f
        if f:
            self.semaphore.release()

    def run(self):
        """Download queued URLs until the stop flag is set and the queue is empty."""
        while True:
            self.semaphore.acquire()
            # A wake-up from set_flag() carries no URL, so check before get();
            # this thread is the only consumer, so the check cannot go stale.
            if not self.queue.empty():
                self._download(self.queue.get())
            if self.flag and self.queue.empty():
                break

    def _download(self, obj):
        """Fetch the URL *obj* and save it under its last path component.

        Failures are reported on stdout and skipped so one bad picture does
        not stop the remaining downloads.
        """
        name = obj.rsplit('/', 1)[-1]
        if not name:
            print('skip, no file name in url : %s' % obj)
            return
        try:
            response = urllib2.urlopen(obj)
            try:
                data = response.read()
            finally:
                response.close()

            print('dowing  %s' % name)
            if not os.path.isdir(self.SAVE_DIR):
                os.makedirs(self.SAVE_DIR)
            with open('%s/%s' % (self.SAVE_DIR, name), 'wb') as fd:
                fd.write(data)
        except (urllib2.URLError, IOError, OSError) as e:
            print('download failed : %s (%s)' % (obj, e))

Tieba_Spider 类为爬虫类,负责爬出楼主发言中的图片链接,并将其推入down类的队列中。down类的工作为下载图片。两个类均继承自threading.Thread。仅供学习,转载请标明出处。

你可能感兴趣的:(爬虫,python,图片)