业余时间用 Python 写的百度贴吧爬虫程序,算是学习 Python 编程的一个练习。
本程序可以针对给定的贴吧链接,把帖子楼主的发言或者图片爬取出来,目前主要功能为下载所有楼主发的图片。爬取楼主发言的功能仅支持屏幕输出,没有保存到本地文件,有兴趣的朋友可以进行补充。仅供学习,转载请标明出处。
tieba_spider.py
# coding: utf-8
# tieba_spider.py -- crawl the thread starter's posts or pictures from a
# Baidu Tieba thread.  Runs on Python 2 and Python 3.
import re
import threading
import time

try:  # Python 2
    import urllib2
except ImportError:  # Python 3: same API under the old module name
    import urllib.request as urllib2

import DownQueue

# Browser-like User-Agent so the request looks like a normal browser visit.
user_agent = 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/37.0.2062.120 Safari/537.36'
# Thread address restricted to the thread starter (see_lz=1); the page
# number is appended after "pn=".
url = 'http://tieba.baidu.com/p/3271638607?see_lz=1&pn='
header = {'User-Agent': user_agent}
g_worker = DownQueue.down()  # background picture downloader


def _report_url_error(e):
    """Print the HTTP status code and/or reason carried by a URLError."""
    if hasattr(e, 'code'):
        print('Error code : %s' % e.code)
    if hasattr(e, 'reason'):
        print('Reason : %s' % e.reason)


class Tieba_Spider(threading.Thread):
    """Thread that walks every page of one Tieba thread.

    Parameters:
        url:  thread address ending in ``pn=``; the page number is appended.
        type: 0 prints the post contents to the screen, 1 queues every
              picture URL on ``g_worker`` for download.  (The name shadows
              the builtin but is kept for caller compatibility.)
    """

    def __init__(self, url, type):
        threading.Thread.__init__(self)
        self.url = url
        self.type = type
        self.num = 0
        # Initialised here so the attribute exists even if get_info() fails.
        self.title = ''

    def run(self):
        self.start_spider()

    def _fetch(self, page_url):
        """Download ``page_url`` and return its body decoded from GBK.

        Undecodable bytes are replaced instead of raising, so one stray
        byte cannot abort the crawl.  Raises ``urllib2.URLError`` on
        network/HTTP failure.
        """
        req = urllib2.Request(page_url, headers=header)
        response = urllib2.urlopen(req)
        try:
            return response.read().decode('gbk', 'replace')
        finally:
            response.close()

    def get_info(self):
        """Fetch the base page and record the page count and the title."""
        try:
            htm = self._fetch(self.url)
        except urllib2.URLError as e:
            _report_url_error(e)
            return
        self.num = self.get_page_num(htm)
        print('It has %d page' % self.num)
        self.title = self.get_title(htm)
        print('It\'s title is %s' % self.title)

    def start_spider(self):
        """Process pages 1..num, then tell the download worker to stop."""
        try:
            self.get_info()
            for i in range(1, self.num + 1):
                print('start :  %d' % i)
                try:
                    htm = self._fetch(self.url + str(i))
                except urllib2.URLError as e:
                    # A failed page is reported and skipped.
                    _report_url_error(e)
                    continue
                if self.type == 0:
                    self.page_deal(htm)
                elif self.type == 1:
                    self.down_pic(htm)
        finally:
            # Always raise the stop flag, even on an unexpected error;
            # otherwise the download worker would wait forever.
            g_worker.set_flag(True)

    def get_page_num(self, htm):
        """Return the page count found in ``htm``, or 0 when absent."""
        # \d+ (not \d*): an empty <span> must not reach int('').
        match = re.search(r'<span class="red">(\d+)</span>', htm)
        if match:
            return int(match.group(1))
        return 0

    def get_title(self, htm):
        """Return the thread title found in ``htm``, or '' when absent."""
        match = re.search(r'class="core_title_txt(\s+)"(\s+)title="(.*?)"', htm)
        if match:
            return match.group(3)
        print('no match title')
        return ''

    def page_deal(self, htm):
        """Print the content of every post found in ``htm``."""
        match = re.findall(r'id="post_content_(.*?)">(.*?)</div>', htm)
        if not match:
            print('no deal')
            return
        for it in match:
            print('%s \n' % it[1])

    def down_pic(self, htm):
        """Queue every post picture URL found in ``htm`` on ``g_worker``."""
        match = re.findall(r'<img class="BDE_Image" pic_type=(.*?)src="(.*?)"', htm)
        if not match:
            print('no deal')
            return
        for it in match:
            print('picture url : %s \n' % it[1])
            g_worker.push(it[1])


if __name__ == '__main__':
    # 1 downloads pictures; 0 prints the thread starter's posts instead.
    spider = Tieba_Spider(url, 1)
    spider.start()
    g_worker.start()
# coding: utf-8
# DownQueue.py -- background download worker.  Runs on Python 2 and 3.
import os
import re
import threading
import time

try:  # Python 2
    import Queue
    import urllib2
except ImportError:  # Python 3: bind the same names to the renamed modules
    import queue as Queue
    import urllib.request as urllib2


class down(threading.Thread):
    """Worker thread that downloads queued URLs into a directory.

    Producers call ``push(url)`` for every file and ``set_flag(True)`` once
    nothing more will be pushed; the thread exits after the queue drains.

    Parameters:
        save_dir: directory the files are written to (created on demand).
    """

    def __init__(self, save_dir='./spider_pic'):
        threading.Thread.__init__(self)
        self.save_dir = save_dir
        self.queue = Queue.Queue(1000)
        # Counts pending wake-ups: one per pushed URL plus one per stop request.
        self.semaphore = threading.Semaphore(0)
        self.flag = False  # True once the producer asked the worker to stop

    def push(self, obj):
        """Queue the URL ``obj`` for download and wake the worker."""
        self.queue.put(obj)
        self.semaphore.release()

    def set_flag(self, f):
        """Set the stop flag; a true value also wakes an idle worker."""
        self.flag = f
        if f:
            # Without this wake-up a worker already blocked in acquire()
            # (empty queue) would never see the flag and never exit.
            self.semaphore.release()

    def run(self):
        """Download queued URLs until the queue is empty and the flag is set."""
        while True:
            self.semaphore.acquire()
            try:
                obj = self.queue.get_nowait()
            except Queue.Empty:
                # Woken by set_flag() rather than by push().
                if self.flag:
                    break
                continue
            self._download(obj)
            # Exit condition: queue drained and stop requested.
            if self.flag and self.queue.empty():
                break

    def _download(self, url):
        """Fetch ``url`` and save it under its last path component.

        Failures are reported and swallowed so one bad URL does not kill
        the worker and strand the rest of the queue.
        """
        name = url.rsplit('/', 1)[-1]
        if not name:
            print('no file name in url : %s' % url)
            return
        try:
            response = urllib2.urlopen(url)
            try:
                data = response.read()
            finally:
                response.close()
            print('dowing  %s' % name)
            if not os.path.isdir(self.save_dir):
                os.makedirs(self.save_dir)
            with open(os.path.join(self.save_dir, name), 'wb') as fd:
                fd.write(data)
        except (IOError, OSError, ValueError) as e:
            # URLError derives from IOError/OSError; ValueError covers
            # malformed URLs rejected by urlopen().
            print('download failed : %s (%s)' % (url, e))