Queues
import queue

# Create a FIFO queue
# maxsize: the maximum number of items the queue may hold
queue_obj = queue.Queue(maxsize=40)
for i in range(0, 40):
    queue_obj.put(i)
# Take a value off the queue
print(queue_obj.get())
# Get the current queue size
print(queue_obj.qsize())
# Check whether the queue is full
print(queue_obj.full())
# Check whether the queue is empty
print(queue_obj.empty())
# Note: queue.Queue is thread-safe; list and dict are not (protect them with a thread lock)
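Following that note, here is a minimal sketch (with hypothetical names results and safe_append) of the recommended lock pattern for a plain list shared between threads:

import threading

results = []                       # plain list shared by several threads
results_lock = threading.Lock()

def safe_append(item):
    # Hold the lock for the duration of the mutation; released automatically
    with results_lock:
        results.append(item)

threads = [threading.Thread(target=safe_append, args=(i,)) for i in range(4)]
for t in threads:
    t.start()
for t in threads:
    t.join()
print(results)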
Multithreaded crawler example
Multithreading
import queue, threading, requests, json
from lxml import etree
from fake_useragent import UserAgent

def download_article_list_url(task_queue, data_queue):
    '''Fetch the article-list page for each paging URL in the task queue'''
    req_header = {'User-Agent': UserAgent().random}
    while not task_queue.empty():
        req_url = task_queue.get()
        response = requests.get(url=req_url, headers=req_header)
        if response.status_code == 200:
            # Store the page source in the data queue
            print(response.url, 'fetched successfully')
            data_queue.put(response.text)

def parse_data_by_queue(data_queue, threadLock):
    '''Parse the page sources held in the data queue'''
    while not data_queue.empty():
        html = data_queue.get()
        html_element = etree.HTML(html)
        # Get the article list
        article_divs = html_element.xpath('//div[@class="post floated-thumb"]')
        for article_div in article_divs:
            article_dict = {}
            # Collect the article info
            article_dict['title'] = article_div.xpath('.//a[@class="archive-title"]/text()')[0]
            article_dict['publish'] = article_div.xpath('.//div[@class="post-meta"]/p[1]/text()')[0]
            article_dict['catotry'] = article_div.xpath('.//div[@class="post-meta"]/p[1]/a[2]/text()')[0]
            commentnum = article_div.xpath('.//div[@class="post-meta"]/p[1]/a[3]/text()')
            if len(commentnum) > 0:
                article_dict['commentnum'] = commentnum[0]
            article_dict['content'] = article_div.xpath('.//span[@class="excerpt"]/p/text()')[0]
            # Write the data to a local file (serialized by the lock)
            threadLock.acquire()
            with open('jobboleartcle.json', 'a+', encoding='utf-8') as file:
                print('writing', article_dict['title'], threading.current_thread().name)
                json_str = json.dumps(article_dict, ensure_ascii=False)
                file.write(json_str + '\n')
            threadLock.release()

if __name__ == '__main__':
    task_queue = queue.Queue()
    data_queue = queue.Queue()
    # Lock used to serialize file writes
    threadLock = threading.Lock()
    # Build the task queue
    for i in range(1, 30):
        full_url = 'http://blog.jobbole.com/all-posts/page/{}/'.format(i)
        task_queue.put(full_url)
    # -------------
    # Create the crawl threads (4 of them)
    crawlThreadName = ['崔一', '崔二', '崔三', '崔四']
    crawlThread = []
    for name in crawlThreadName:
        # Hand both queues to the worker
        thread = threading.Thread(target=download_article_list_url, name=name, args=(task_queue, data_queue))
        thread.start()
        crawlThread.append(thread)
    # join() so that every request in the task queue finishes before parsing starts
    for thread in crawlThread:
        # join(timeout) would cap how long we block on each thread
        thread.join()
    # -------------
    parseThreadName = ['崔1', '崔2', '崔3', '崔4']
    parseThread = []
    for name in parseThreadName:
        thread = threading.Thread(target=parse_data_by_queue, name=name, args=(data_queue, threadLock))
        thread.start()
        parseThread.append(thread)
    for thread in parseThread:
        thread.join()
    print('Spider_over')
    print('Main_Thread_over')
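One caveat with the "while not queue.empty(): ... queue.get()" loop used by the workers above: between the empty() check and the get() call another thread may take the last item, and get() then blocks forever. A minimal sketch of the non-blocking alternative (worker body only; the print is a stand-in for the real download logic):

import queue

def worker(task_queue):
    while True:
        try:
            # get_nowait() raises queue.Empty instead of blocking when the queue is drained
            req_url = task_queue.get_nowait()
        except queue.Empty:
            break  # no more work, let the thread exit
        print('processing', req_url)  # stand-in for the real download logic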
Custom thread classes
import queue, threading, requests, json
from lxml import etree
from fake_useragent import UserAgent

class CrawlThreadt(threading.Thread):
    def __init__(self, task_queue, data_queue, threadName):
        super(CrawlThreadt, self).__init__()
        self.task_queue = task_queue
        self.data_queue = data_queue
        self.threadName = threadName
        self.header = {'User-Agent': UserAgent().random}

    def run(self):
        while not self.task_queue.empty():
            print(self.threadName, 'downloading')
            req_url = self.task_queue.get()
            response = requests.get(url=req_url, headers=self.header)
            if response.status_code == 200:
                print(response.url, 'fetched successfully')
                self.data_queue.put(response.text)

class ParserThreadt(threading.Thread):
    def __init__(self, data_queue, threadLock, threadName):
        super(ParserThreadt, self).__init__()
        self.data_queue = data_queue
        self.threadLock = threadLock
        self.threadName = threadName

    def run(self):
        while not self.data_queue.empty():
            print(self.threadName, 'parsing')
            html = self.data_queue.get()
            html_element = etree.HTML(html)
            article_divs = html_element.xpath('//div[@class="post floated-thumb"]')
            for article_div in article_divs:
                # Collect the article info
                article_dict = {}
                article_dict['title'] = article_div.xpath('.//a[@class="archive-title"]/text()')[0]
                article_dict['publish'] = article_div.xpath('.//div[@class="post-meta"]/p[1]/text()')[0]
                article_dict['catotry'] = article_div.xpath('.//div[@class="post-meta"]/p[1]/a[2]/text()')[0]
                commentnum = article_div.xpath('.//div[@class="post-meta"]/p[1]/a[3]/text()')
                if len(commentnum) > 0:
                    article_dict['commentnum'] = commentnum[0]
                article_dict['content'] = article_div.xpath('.//span[@class="excerpt"]/p/text()')[0]
                # Acquire the lock before writing to the shared file
                self.threadLock.acquire()
                with open('jobboleartcl2e.json', 'a+', encoding='utf-8') as file:
                    print('writing', article_dict['title'], threading.current_thread().name)
                    json_str = json.dumps(article_dict, ensure_ascii=False)
                    file.write(json_str + '\n')
                self.threadLock.release()

if __name__ == '__main__':
    task_queue = queue.Queue()
    data_queue = queue.Queue()
    for i in range(1, 100):
        full_url = 'http://blog.jobbole.com/all-posts/page/{}/'.format(i)
        task_queue.put(full_url)
    # -------------
    crawlThreadName = ['吕1', '吕2', '吕3', '吕4']
    crawlThread = []
    for name in crawlThreadName:
        thread = CrawlThreadt(task_queue, data_queue, name)
        thread.start()
        crawlThread.append(thread)
    for thread in crawlThread:
        thread.join()
    # -------------------
    threadLock = threading.Lock()
    # -------------------
    parseThreadName = ['娜1', '娜2', '娜3', '娜4']
    parseThread = []
    for name in parseThreadName:
        thread = ParserThreadt(data_queue, threadLock, name)
        thread.start()
        parseThread.append(thread)
    for thread in parseThread:
        thread.join()
    print('over')
    print('main_over')
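A small design note on the subclasses above: threading.Thread already carries a name, so instead of storing a separate threadName attribute the constructor could forward the name to the base class. A sketch of that variant (the class name NamedCrawlThread is hypothetical):

import threading

class NamedCrawlThread(threading.Thread):
    def __init__(self, task_queue, data_queue, name):
        # Forwarding name= makes threading.current_thread().name match inside run()
        super().__init__(name=name)
        self.task_queue = task_queue
        self.data_queue = data_queue

    def run(self):
        print(self.name, 'started')   # self.name is provided by threading.Thread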
Thread pools
# Thread pool
from concurrent.futures import ThreadPoolExecutor
from lxml import etree
from fake_useragent import UserAgent
import threading, requests, json

def download_list(req_url):
    print(req_url, threading.current_thread().name)
    req_header = {'User-Agent': UserAgent().random}
    response = requests.get(url=req_url, headers=req_header)
    if response.status_code == 200:
        # Request succeeded; hand the page source to the callback for parsing
        # -------------
        return response.text, response.status_code

def download_done(future):
    result = future.result()
    if result is None:
        # download_list returns None for non-200 responses, so there is nothing to parse
        return
    html, code = result
    # -------------
    print(html, code)
    # Parse the page source
    html_element = etree.HTML(html)
    article_divs = html_element.xpath('//div[@class="post floated-thumb"]')
    for article_div in article_divs:
        article_dict = {}
        # Collect the article info
        article_dict['title'] = article_div.xpath('.//a[@class="archive-title"]/text()')[0]
        article_dict['catotry'] = article_div.xpath('.//div[@class="post-meta"]/p[1]/a[2]/text()')[0]
        commentnum = article_div.xpath('.//div[@class="post-meta"]/p[1]/a[3]/text()')
        if len(commentnum) > 0:
            article_dict['commentnum'] = commentnum[0]
        article_dict['content'] = article_div.xpath('.//span[@class="excerpt"]/p/text()')[0]
        # Write the data to a local file
        with open('jobboleartcle2.json', 'a+', encoding='utf-8') as file:
            print('writing', article_dict['title'], threading.current_thread().name)
            json_str = json.dumps(article_dict, ensure_ascii=False)
            file.write(json_str + '\n')

if __name__ == '__main__':
    # Create the thread pool; max_workers is the maximum number of worker threads
    # -------------
    pool = ThreadPoolExecutor(max_workers=10)
    for i in range(1, 50):
        full_url = 'http://blog.jobbole.com/all-posts/page/{}/'.format(i)
        handler = pool.submit(download_list, full_url)
        handler.add_done_callback(download_done)
    # shutdown() waits for all submitted tasks to finish (an implicit join())
    pool.shutdown()
    # -------------
    print('Spider finished')
    print('Main thread finished')
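Callbacks are one way to collect results from the pool; concurrent.futures also provides as_completed, which yields futures as they finish. A minimal sketch of the same submit loop written that way (fetch is a hypothetical stand-in for download_list):

from concurrent.futures import ThreadPoolExecutor, as_completed
import requests

def fetch(url):
    # Hypothetical downloader: return the page text and the status code
    response = requests.get(url)
    return response.text, response.status_code

urls = ['http://blog.jobbole.com/all-posts/page/{}/'.format(i) for i in range(1, 5)]

with ThreadPoolExecutor(max_workers=4) as pool:
    futures = [pool.submit(fetch, url) for url in urls]
    for future in as_completed(futures):
        html, code = future.result()
        print(code, len(html))
# leaving the with-block calls shutdown(), which waits for all submitted tasks

One advantage of this form is that result handling runs in the main thread, so writing to a shared file would not need a lock the way the callback version might.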