[Python Crawler Exercise] Scraping Front-Page Post Info from Hupu's 步行街 (Bu Xing Jie) Board: Multithreaded Version

 

# -*- coding: utf-8 -*-
import queue
import threading

import requests
from bs4 import BeautifulSoup

headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.90 Safari/537.36'}
link = 'https://bbs.hupu.com/bxj'

# Fetch the board's front page; name the response `r` so it does not shadow the `re` module.
r = requests.get(link, headers=headers)
soup = BeautifulSoup(r.content, 'lxml')
soup_list = soup.find('ul', class_='for-list')  # the <ul> that holds one <li> per post

# Thread-safe work queue: one <li> (one post row) per item.
work = queue.Queue(130)

li_list = soup_list.find_all('li')
for lli in li_list:
    work.put(lli)

class myThread(threading.Thread):
    def __init__(self, name, q):
        threading.Thread.__init__(self)
        self.name = name
        self.q = q

    def run(self):
        gLock.acquire()  # serialize the startup message
        print('starting ' + self.name)
        gLock.release()
        while not self.q.empty():
            try:
                crawler(self.name, self.q)
            except Exception:
                break  # queue drained while we waited: let the thread exit
        print('exit ' + self.name)
        
gLock = threading.Lock()  # lock that keeps one post's output lines together

def crawler(threadName, q):
    gLock.acquire()  # take the lock before printing a post's fields
    try:
        lli = q.get(block=False)  # raises queue.Empty once the queue is drained
        title = lli.find('div', class_='titlelink box')
        author = lli.find('div', class_='author box')
        reply_view = lli.find('span', class_='ansour box')
        reply_view = reply_view.text.strip().split('/')  # key point: split a "23/34"-style reply/view counter
        endreply = lli.find('div', class_='endreply box')

        print('Title:           ' + title.a.string)  # key point: .string reads a NavigableString's text
        print('Title link:      ' + 'https://bbs.hupu.com' + title.a['href'])  # key point: tag['href'] reads a tag attribute
        print('Author:          ' + author.a.string)
        print('Posted at:       ' + author.contents[5].string)
        print('Replies:         ' + reply_view[0].strip())  # strip() is required, or a stray space is kept
        print('Views:           ' + reply_view[1].strip())
        print('Last reply time: ' + endreply.a.string)
        print('Last reply link: ' + 'https://bbs.hupu.com/' + endreply.a['href'])
        print('Last replier:    ' + endreply.span.string)
        print()
    except queue.Empty:
        raise  # re-raise so run() stops its loop
    except Exception as e:
        print(threadName, 'ERROR:', e)
    finally:
        gLock.release()  # always release, even on error, so other threads never deadlock

threadlist = ['thread1', 'thread2', 'thread3', 'thread4', 'thread5']

threads = []

# Start five worker threads that share the same queue.
for tname in threadlist:
    thread = myThread(tname, work)
    thread.start()
    threads.append(thread)

# Wait for every worker to finish before the main thread exits.
for t in threads:
    t.join()
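
The three "key point" lookups above (reading text through .string, reading a link through tag['href'], and splitting a "23/34"-style counter) can be tried in isolation. A minimal sketch; the HTML snippet is invented to mimic one Hupu list row, not copied from the live page:

from bs4 import BeautifulSoup

# Hypothetical markup imitating a single <li> from the board listing.
html = '''
<li>
  <div class="titlelink box"><a href="/12345.html">Example title</a></div>
  <span class="ansour box"> 23 / 34 </span>
</li>
'''
demo = BeautifulSoup(html, 'lxml')
title = demo.find('div', class_='titlelink box')
print(title.a.string)    # .string -> 'Example title' (NavigableString text)
print(title.a['href'])   # tag['href'] -> '/12345.html' (attribute lookup)
counts = demo.find('span', class_='ansour box').text.split('/')
print(counts[0].strip(), counts[1].strip())  # '23' '34' after strip()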

 

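As a closing design note: the hand-rolled Thread subclass plus explicit Queue and Lock is the classic pattern, but the standard library's concurrent.futures can express the same fan-out more compactly. A minimal sketch, assuming the li_list parsed above and a hypothetical parse_title() helper standing in for the field extraction in crawler():

from concurrent.futures import ThreadPoolExecutor

def parse_title(lli):
    # Hypothetical helper: pull only the title text and link from one <li>.
    title = lli.find('div', class_='titlelink box')
    return title.a.string, 'https://bbs.hupu.com' + title.a['href']

with ThreadPoolExecutor(max_workers=5) as pool:
    # map() spreads the rows across 5 threads and yields results in order,
    # so no manual Lock is needed to keep the output tidy.
    for text, url in pool.map(parse_title, li_list):
        print(text, url)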