Crawler Basics 2: Multithreaded Scraping of 51job Job Listings

Learning notes:
1. requests mind map, recording its basic usage
2. Python's threading module for multithreading
3. The Queue module for thread-safe queues
A minimal sketch combining all three follows below.

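Before diving into the full crawler, here is a minimal warm-up sketch (my own, not part of the original post) tying the three topics together: a requests GET, a queue.Queue holding work items, and threading.Thread workers draining it. The httpbin.org URL is just a placeholder endpoint for illustration.

# -*- coding=utf-8 -*-
import threading
from queue import Queue, Empty

import requests


def worker(q, results):
    # Pull URLs until the queue is empty, then exit
    while True:
        try:
            url = q.get(block=False)
        except Empty:
            break
        resp = requests.get(url, timeout=10)
        results.append((url, resp.status_code))


q = Queue()
results = []
for n in (1, 2, 3):
    q.put("https://httpbin.org/get?page={}".format(n))

threads = [threading.Thread(target=worker, args=(q, results)) for _ in range(2)]
for t in threads:
    t.start()
for t in threads:
    t.join()
print(results)  # [(url, status_code), ...]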

# -*- coding=utf-8 -*-
import threading
import time
# queue.Queue is the thread-safe queue for threads within one process;
# multiprocessing.Queue is meant for separate processes
from queue import Queue, Empty

import requests
from lxml import etree


class CrawlPage(threading.Thread):
    def __init__(self, page_queue, data_queue, thread_name):
        super(CrawlPage, self).__init__()
        self.page_queue = page_queue
        self.data_queue = data_queue
        self.thread_name = thread_name
        # Default request headers
        self.header = {
            "Accept":
                "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8",
            "Accept-Encoding": "gzip, deflate, br",
            "Accept-Language": "zh-CN,zh;q=0.9",
            "Connection": "keep-alive",
            "Host": "search.51job.com",
            "Upgrade-Insecure-Requests": "1",
            "User-Agent": "Mozilla/5.0 (Windows NT 6.3; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3610.2 Safari/537.36",
        }

    def run(self):
        print("Page-crawling thread running: {}".format(self.thread_name))
        # 1. Keep pulling page numbers from page_queue until it is drained
        # 2. Fetch each page with requests and push the raw HTML into data_queue
        while not page_flag:
            try:
                page = self.page_queue.get(block=False)
            except Empty:
                # Queue is momentarily empty; loop again until page_flag is set
                continue
            page_url = 'https://search.51job.com/list/000000,000000,0000,00,9,99,python,2,{}.html'.format(page)
            try:
                response = requests.get(url=page_url, headers=self.header, timeout=10)
            except requests.RequestException as e:
                print("Request for {} failed: {}".format(page_url, e))
                continue
            print("Fetched URL: {}".format(page_url), "response status code: {}".format(response.status_code))
            # 51job pages are GBK-encoded, so set the encoding before reading .text
            response.encoding = 'gbk'
            self.data_queue.put(response.text)


class CrawlData(threading.Thread):
    def __init__(self, thread_name, data_queue):
        super(CrawlData, self).__init__()
        self.thread_name = thread_name
        self.data_queue = data_queue

    def run(self):
        print("Text-processing thread running: {}".format(self.thread_name))
        while not data_flag:
            try:
                text = self.data_queue.get(block=False)
            except Empty:
                # Nothing to parse yet; loop again until data_flag is set
                continue
            print("Pages still waiting in data_queue: {}".format(self.data_queue.qsize()))
            html = etree.HTML(text)
            all_div = html.xpath("//div[@id='resultList']//div[@class='el']")
            info_list = []
            for item in all_div:
                info = {}
                # xpath() returns a list, so take the element at index 0
                info['job_name'] = item.xpath("./p/span/a/@title")[0]
                info['company_name'] = item.xpath(".//span[@class='t2']/a/@title")[0]
                info['company_address'] = item.xpath(".//span[@class='t3']/text()")[0]
                # The salary field may be missing, so guard it with try-except
                try:
                    info['money'] = item.xpath(".//span[@class='t4']/text()")[0]
                except IndexError:
                    info['money'] = 'no data'
                info['date'] = item.xpath(".//span[@class='t5']/text()")[0]
                info_list.append(info)
            print("Thread {} parsed: {}".format(self.thread_name, info_list))
            time.sleep(1)


# Two global flags; once a queue is fully drained, the matching while loop exits
page_flag = False
data_flag = False


def main():
    # Build the queues that hold page numbers and raw HTML text
    page_queue = Queue()
    data_queue = Queue()

    # Enqueue the page numbers (pages 1 through 9)
    for page in range(1, 10):
        page_queue.put(page)
    print("Page numbers currently queued: {}".format(page_queue.qsize()))

    # Start the page-crawling threads
    global page_flag
    page_thread_name = ['page-crawler-1', 'page-crawler-2', 'page-crawler-3']
    page_crawl_list = []
    for page_thread in page_thread_name:
        page_crawl = CrawlPage(page_queue, data_queue, page_thread)
        page_crawl.start()
        page_crawl_list.append(page_crawl)

    # Block the main thread until every page number has been taken from page_queue
    while not page_queue.empty():
        time.sleep(0.1)  # sleep briefly instead of busy-spinning

    # page_queue is drained: flip the flag so the while loop in CrawlPage.run() exits
    page_flag = True
    for page_crawl_join in page_crawl_list:
        page_crawl_join.join()
        print(page_crawl_join.thread_name + " finished crawling pages!")
    print("Total items now in data_queue: {}".format(data_queue.qsize()))

    # Start three text-processing threads
    crawl_thread_name = ['text-parser-1', 'text-parser-2', 'text-parser-3']
    crawl_data_list = []
    for crawl_data_name in crawl_thread_name:
        crawl_data = CrawlData(crawl_data_name, data_queue)
        crawl_data.start()
        crawl_data_list.append(crawl_data)


    # Block the main thread until data_queue has been fully drained
    while not data_queue.empty():
        time.sleep(0.1)

    global data_flag
    # All data consumed: signal the parser threads to exit
    data_flag = True
    for crawl_data_join in crawl_data_list:
        crawl_data_join.join()
        print("Thread {} finished".format(crawl_data_join.thread_name))

    # Queue size is 0: all HTML text has been parsed
    print("Items left in data_queue: {}".format(data_queue.qsize()))


if __name__ == '__main__':
    main()
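A design note on the pattern above: the global flags plus polling queue.empty() work, but queue.Queue already provides a built-in handshake via task_done() and join(). Below is a minimal sketch of the same producer/consumer shape using that handshake (my own variant, not from the original post):

import threading
from queue import Queue


def consume(q):
    while True:
        item = q.get()       # blocks until an item arrives
        if item is None:     # sentinel: no more work, exit the loop
            q.task_done()
            break
        print("processing page {}".format(item))
        q.task_done()        # mark this work item as finished


q = Queue()
workers = [threading.Thread(target=consume, args=(q,)) for _ in range(3)]
for w in workers:
    w.start()
for page in range(1, 10):
    q.put(page)
q.join()                     # returns once every put() has a matching task_done()
for _ in workers:
    q.put(None)              # one sentinel per worker
for w in workers:
    w.join()

Here q.join() replaces the empty() polling loop, and the None sentinels replace the global flags, so no thread ever spins waiting on a flag.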

The code is my own; I wrote it with reference to other learning sites.
