Web scraper: Tencent Careers (腾讯招聘), multithreaded

# Producer-consumer pattern
# Queues
import threading
import requests
import pymongo
from queue import Queue, Empty

# Producer: fetches a page of the careers API and extracts its data
class Producer(threading.Thread):
    def __init__(self, page_queue, data_queue):
        # Initialize the parent Thread class
        threading.Thread.__init__(self)
        self.page_queue = page_queue
        self.data_queue = data_queue

    # Override run(): keep pulling URLs until the page queue is drained
    def run(self):
        while True:
            # Exit once every page URL has been taken
            if self.page_queue.empty():
                break
            # Take a URL and fetch its data
            url = self.page_queue.get()
            self.get_content(url)

    # Fetch one page and push each post onto the data queue
    def get_content(self, url):
        # headers is defined at module level in the main block
        response = requests.get(url=url, headers=headers)
        posts = response.json()['Data']['Posts']
        for post in posts:
            dic = {
                'RecruitPostName': post['RecruitPostName'],  # job title
                'syq': post['BGName'],                       # business group (事业群)
                'LocationName': post['LocationName'],        # location
                'LastUpdateTime': post['LastUpdateTime'],    # date posted
            }
            # Hand the record off to the consumers
            self.data_queue.put(dic)

# Consumer: takes records off the data queue and saves them to MongoDB
class Consumer(threading.Thread):
    def __init__(self, data_queue, page_queue):
        threading.Thread.__init__(self)
        self.data_queue = data_queue
        self.page_queue = page_queue

    # Override run(): keep consuming until both queues are empty
    # and the producers have finished (switch is set in the main block)
    def run(self):
        while True:
            if self.data_queue.empty() and self.page_queue.empty() and switch == 1:
                break
            try:
                data = self.data_queue.get(timeout=10)
            except Empty:
                # Nothing arrived within 10s: the producers have stopped
                break
            print(data)
            self.save(data)

    # Insert one record into MongoDB; insert_one() replaces the
    # insert() method that was removed in PyMongo 4
    def save(self, data):
        col.insert_one(data)

if __name__ == '__main__':
    # Shutdown flag: flipped to 1 once all producers have finished
    switch = 0
    # Connect to MongoDB and select the database and collection
    client = pymongo.MongoClient(host='127.0.0.1', port=27017)
    db = client['tencent']
    col = db['zhaopin']

    # Request headers
    headers = {
        'user-agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.83 Safari/537.36'
    }
    # Base URL of the careers API; pageIndex is filled in per page
    base_url = 'https://careers.tencent.com/tencentcareer/api/post/Query?pageIndex={}&pageSize=10'

    # Queue for scraped records
    data_queue = Queue(1000)
    # Queue of page URLs to fetch
    page_queue = Queue(100)

    # Enqueue the URLs for pages 1-50
    for i in range(1, 51):
        page_queue.put(base_url.format(i))

    # Start three producer threads
    p_list = []
    for i in range(3):
        p = Producer(page_queue, data_queue)
        p.start()
        p_list.append(p)

    # Start three consumer threads
    c_list = []
    for i in range(3):
        c = Consumer(data_queue, page_queue)
        c.start()
        c_list.append(c)

    # Wait for the producers to finish, then signal the consumers
    for p in p_list:
        p.join()
    switch = 1

    # Wait for the consumers to drain the queue, then close the connection
    for c in c_list:
        c.join()
    client.close()
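
A quick way to sanity-check the endpoint before launching six threads is to fetch a single page and inspect the response. A minimal sketch, assuming the API still serves the Data/Posts JSON shape the crawler relies on:

import requests

url = 'https://careers.tencent.com/tencentcareer/api/post/Query?pageIndex=1&pageSize=10'
headers = {
    'user-agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.83 Safari/537.36'
}
resp = requests.get(url, headers=headers, timeout=10)
posts = resp.json()['Data']['Posts']
print(len(posts))                    # expect 10 records per page
print(posts[0]['RecruitPostName'])   # one of the fields the crawler stores

After a full run you can check the result with db['zhaopin'].count_documents({}); with 50 pages of 10 posts each, roughly 500 documents should be present.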

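The module-level switch integer works here because the main block flips it exactly once, but threading.Event is the more idiomatic way to signal shutdown across threads. A minimal sketch of the same hand-off, with the names done and consume chosen for illustration:

import threading
from queue import Queue, Empty

done = threading.Event()  # set by the main thread once all producers finish

def consume(data_queue):
    while True:
        try:
            item = data_queue.get(timeout=1)
        except Empty:
            if done.is_set():  # queue drained and producers finished
                break
            continue           # producers may still be working; retry
        print(item)            # the real crawler would save to MongoDB here

# In the main thread, after joining the producer threads:
# done.set()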