Python学习爬虫(9)--实战高阶:爬取豆瓣书名(多线程)

作者:IT小样
如果以单线程来爬取豆瓣所有页的书名,运行时间会非常长,因此考虑使用多线程来增加并发,缩短爬取时长。实现方案如下:

import requests
import threading
from bs4 import BeautifulSoup
import queue
import random,time

# Number of crawler (download) threads that main() starts.
count_crawel = 3
# Number of parser threads that main() starts.
count_parse = 3

class Thread_Crawel(threading.Thread):
    """Worker thread that downloads pages and hands their HTML to the parsers.

    Each worker repeatedly takes a URL from ``url_queue``, fetches it with
    ``requests.get`` and, on an HTTP 200 response, puts the response text on
    ``outcome_queue``. The thread ends once ``url_queue`` is empty.
    """

    def __init__(self, url_queue, outcome_queue, number):
        """Store the shared queues and this worker's display number.

        Args:
            url_queue: ``queue.Queue`` of URLs still to be fetched.
            outcome_queue: ``queue.Queue`` that receives the fetched HTML text.
            number: Identifier printed in this worker's log lines.
        """
        super().__init__()
        self.url_queue = url_queue
        self.outcome_queue = outcome_queue
        self.number = number
        self.header = {'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.89 Safari/537.36'}

    def run(self):
        """Fetch URLs until the URL queue is drained."""
        print("启动采集线程", self.number)
        while True:
            # A qsize() check followed by a blocking get() can hang when
            # another worker takes the last URL in between; get_nowait()
            # makes "take one or stop" a single atomic step.
            try:
                url = self.url_queue.get_nowait()
            except queue.Empty:
                break
            print(self.number, "线程采集url:", url)
            # Random 0.5-1.5 s pause between requests.
            time.sleep(random.randint(5, 15) / 10)
            try:
                # NOTE(review): verify=False disables TLS certificate
                # verification; kept from the original, confirm it is needed.
                # The timeout keeps a stalled connection from blocking this
                # worker (and therefore the parsers) indefinitely.
                response = requests.get(
                    url, headers=self.header, verify=False, timeout=10
                )
            except requests.RequestException as exc:
                # One failed page should not kill the worker; report and move on.
                print(self.number, "线程采集失败:", url, exc)
                continue
            if response.status_code == 200:
                # Enqueue the page; assigning to self.outcome_queue would
                # replace the queue reference and the parsers would get nothing.
                self.outcome_queue.put(response.text)

class Thread_Parse(threading.Thread):
    """Worker thread that parses fetched HTML and appends results to a file.

    The thread keeps taking page HTML from ``outcome_queue`` and passing it to
    :meth:`parse`. It stops once none of the threads in ``req_thread`` is
    alive and ``outcome_queue`` is empty.
    """

    # Shared by every parser thread so that appends to result.txt from
    # different threads cannot interleave.
    _write_lock = threading.Lock()

    def __init__(self, number, outcome_queue, req_thread):
        """Store this worker's number, the HTML queue and the crawler threads.

        Args:
            number: Identifier printed in this worker's log lines.
            outcome_queue: ``queue.Queue`` holding fetched HTML text.
            req_thread: Iterable of crawler threads whose liveness decides
                when parsing may stop.
        """
        super().__init__()
        self.number = number
        self.outcome_queue = outcome_queue
        self.req_thread = req_thread
        self.is_parse = True

    def run(self):
        """Parse queued pages until the crawlers are done and the queue is empty."""
        print("启动解析线程", self.number)
        while True:
            crawlers_done = not any(t.is_alive() for t in self.req_thread)
            if crawlers_done and self.outcome_queue.qsize() == 0:
                self.is_parse = False
            if not self.is_parse:
                break
            try:
                # The timeout lets the loop re-check the exit condition
                # instead of blocking forever on an empty queue.
                data = self.outcome_queue.get(timeout=3)
            except queue.Empty:
                continue
            if data is not None:
                self.parse(data)
        print("退出解析线程", self.number)

    def parse(self, data):
        """Extract one line per book from a page and append it to result.txt.

        For every ``li.subject-item`` element, the text of its ``h2`` and of
        its ``div.pub`` child are cleaned of newlines and double spaces,
        joined with a single space and written as one line.

        Args:
            data: HTML text of one listing page.
        """
        # Name the parser explicitly so the result does not depend on which
        # optional parsers happen to be installed.
        soup = BeautifulSoup(data, "html.parser")
        lines = []
        for li in soup.find_all("li", attrs={"class": "subject-item"}):
            pub = li.find("div", attrs={"class": "pub"})
            if li.h2 is None or pub is None:
                # Skip items that lack the expected elements instead of
                # crashing the whole thread on an AttributeError.
                continue
            title = li.h2.get_text().replace('  ', ' ').replace('\n', ' ')
            author = pub.get_text().replace('  ', '').replace('\n', '')
            lines.append(title + ' ' + author + '\n')
        if not lines:
            return
        # Referenced through the class so the module-level alias below also
        # works when called with self=None.
        with Thread_Parse._write_lock:
            with open("result.txt", "a+", encoding='utf-8') as f:
                f.writelines(lines)


# Module-level name retained so callers using ``parse(self, data)`` keep working.
parse = Thread_Parse.parse

def main():
    """Queue ten listing-page URLs, then run the crawler and parser threads.

    The URLs use ``start`` offsets 0, 20, ..., 180. ``count_crawel`` crawler
    threads and ``count_parse`` parser threads are started, numbered from 1,
    and the function returns after all of them have finished.
    """
    page_size = 20
    page_count = 10
    listing_url = "https://book.douban.com/tag/%E6%BC%AB%E7%94%BB?start={}&type=T"

    pending_urls = queue.Queue()
    fetched_pages = queue.Queue()
    for page in range(page_count):
        pending_urls.put(listing_url.format(page * page_size))

    # Crawlers start first; the parsers receive this list to watch liveness.
    crawlers = []
    for idx in range(1, count_crawel + 1):
        worker = Thread_Crawel(pending_urls, fetched_pages, idx)
        worker.start()
        crawlers.append(worker)

    parsers = []
    for idx in range(1, count_parse + 1):
        worker = Thread_Parse(idx, fetched_pages, crawlers)
        worker.start()
        parsers.append(worker)

    # Wait for the crawlers first, then the parsers.
    for worker in crawlers + parsers:
        worker.join()

# Run the crawler only when this file is executed as a script, not on import.
if __name__ ==  "__main__":
    main()

上一篇:实战中阶

你可能感兴趣的:(Python爬虫)