Python Crawler in Practice: Multi-threaded Scraping of 前程无忧 (51job) Job Listings


import requests
import re
import threading
import time
from queue import Queue

HEADERS = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36"
}
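

# Editorial addition, not in the original post: the requests.get calls below
# pass no timeout, so a single hung connection can stall a worker thread
# forever. A minimal fetch helper with a timeout and simple retries could look
# like this; it is a sketch and nothing in the original script calls it.
def fetch(url, retries=3, timeout=10):
    """Fetch a page and return its gbk-decoded text, or None on failure."""
    for _ in range(retries):
        try:
            resp = requests.get(url, headers=HEADERS, timeout=timeout)
            resp.encoding = "gbk"
            return resp.text
        except requests.RequestException:
            time.sleep(1)  # brief back-off before retrying
    return None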


# Custom thread -- producer: fetches listing pages and parses them
class Producer(threading.Thread):
    # Initialise with the page-URL queue and the queue that collects scraped data
    def __init__(self, pageurl_queue, jobinfo_queue, *args, **kwargs):
        super(Producer, self).__init__(*args, **kwargs)
        self.pageurl_queue = pageurl_queue
        self.jobinfo_queue = jobinfo_queue

    # Override the run method
    def run(self):
        while True:
            # Exit the loop once the URL queue is empty
            if self.pageurl_queue.empty():
                break
            # Take a listing-page URL off the queue
            url = self.pageurl_queue.get()
            # Parse it
            self.parse_page(url)
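
    # Note: empty() followed by get() is not atomic -- with many worker
    # threads, another thread can drain the queue between the two calls and
    # this one then blocks forever on get(). A safer variant (sketch,
    # assuming "from queue import Empty" is added to the imports):
    #
    #     try:
    #         url = self.pageurl_queue.get(block=False)
    #     except Empty:
    #         break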

    # Parse a listing page
    def parse_page(self, url):
        # Request the page; 51job serves gbk-encoded HTML
        resp = requests.get(url, headers=HEADERS)
        resp.encoding = "gbk"
        text = resp.text
        # Extract each posting's detail-page URL with a regex.
        # NOTE: the HTML fragments inside this script's regexes were lost when
        # the post was archived. The patterns below are reconstructions based
        # on 51job's markup at the time and may need adjusting against the
        # live pages.
        jobs_url = re.findall(r'<p class="t1 ">.*?href="(.*?)".*?</p>', text, re.DOTALL)
        for x in jobs_url:
            # Parse each individual posting
            self.parse_job_info(x)

    # Parse one job posting's detail page
    def parse_job_info(self, url):
        # Request the page; 51job serves gbk-encoded HTML
        resp = requests.get(url, headers=HEADERS)
        resp.encoding = "gbk"
        text = resp.text
        # Pull out the summary line ("city | experience | education | headcount | post date")
        info = re.findall(r'<p class="msg ltype".*?>(.*?)</p>', text, re.DOTALL)
        if len(info) > 0:
            # Strip the &nbsp; padding around the "|" separators
            all_info = re.sub("&nbsp;", "", info[0])
            infos = all_info.split("|")
            # Keep the record only if the fifth field is a publish date ("…发布")
            if len(infos) >= 5 and infos[4].find("发布") >= 0:
                jobname = re.findall(r'<h1 title="(.*?)"', text, re.DOTALL)[0]
                companyname = re.findall(r'<p class="cname">.*?title="(.*?)"', text, re.DOTALL)[0]
                if companyname == "":
                    companyname = "null"
                companytype = re.findall(r'<div class="com_tag">.*?<p class="at" title="(.*?)"', text, re.DOTALL)[0]
                companysize = re.findall(r'<div class="com_tag">.*?</p>.*?<p class="at" title="(.*?)"', text, re.DOTALL)[0]
                if companysize == "":
                    companysize = "null"
                companysalary = re.findall(r'<div class="cn">.*?<strong>(.*?)<', text, re.DOTALL)[0]
                if companysalary == "":
                    companysalary = "null"
                companycity = infos[0]
                workingExp = infos[1]
                edulevel = infos[2]
                needperson = infos[3]
                createdata = infos[4]
                # Welfare tags: strip the HTML and join the items with commas
                welfare = re.findall(r'<div class="t1">(.*?)</div>', text, re.DOTALL)[0]
                welfare = re.sub(r"<.*?>", "/", welfare)  # tags -> "/" separators
                welfare = re.sub("//", ",", welfare)      # adjacent tags -> one comma
                welfare = re.sub("/", "", welfare)        # drop leftover slashes
                welfare = re.sub("\r", "", welfare)       # drop carriage returns
                # Push the scraped record onto the data queue
                self.jobinfo_queue.put((jobname, companyname, companytype, companysize,
                                        companycity, companysalary, edulevel, workingExp,
                                        welfare, needperson, createdata))


# Custom thread -- consumer: drains the data queue and writes records to disk
class Consumer(threading.Thread):
    # Initialise with the page-URL queue and the queue that collects scraped data
    def __init__(self, pageurl_queue, jobinfo_queue, *args, **kwargs):
        super(Consumer, self).__init__(*args, **kwargs)
        self.pageurl_queue = pageurl_queue
        self.jobinfo_queue = jobinfo_queue

    # Override the run method
    def run(self):
        while True:
            # Exit once both the URL queue and the data queue are empty
            if self.jobinfo_queue.empty() and self.pageurl_queue.empty():
                break
            # Field order: jobname, companyname, companytype, companysize,
            # companycity, companysalary, edulevel, workingExp, welfare,
            # needperson, createdata
            values = self.jobinfo_queue.get()
            # Append the record to a text file, fields separated by \001
            with open("qcwy.txt", "a+", encoding="utf-8", newline="") as f:
                f.write('\001'.join(values) + "\n")
            print("完成")  # "done"


# Return the total number of result pages for a search URL
def return_pages(url):
    resp = requests.get(url, headers=HEADERS)
    resp.encoding = "gbk"
    text = resp.text
    # The pager shows "current / total"; capture the total (reconstructed pattern)
    page = re.findall(r'<div class="p_in">.*? / (.*?)<', text, re.DOTALL)[0]
    return page.strip()


def main():
    # Create the queues
    pageurl_queue = Queue(200000)
    jobinfo_queue = Queue(200000)
    start_url = "https://search.51job.com/list/{},000000,0000,00,9,99,%2520,2,1.html"
    info_url = "https://search.51job.com/list/{},000000,0000,00,9,99,%2520,2,{}.html"
    # 51job area codes for the provinces and cities to crawl
    city_code = ['010000', '020000', '030000', '050000', '060000', '070000', '080000',
                 '090000', '100000', '110000', '120000', '130000', '140000', '150000',
                 '160000', '170000', '180000', '190000', '200000', '210000', '220000',
                 '230000', '240000', '250000', '260000', '270000', '280000', '290000',
                 '300000', '310000', '320000', '110200', '030200', '040000', '080200',
                 '180200', '200200', '070200', '090200', '030800', '230300', '230200',
                 '080300', '170200', '070300', '250200', '190200', '150200', '120300',
                 '120200', '220200', '240200']
    # For each area, enqueue the URL of every result page
    for x in city_code:
        for y in range(1, int(return_pages(start_url.format(x))) + 1):
            u = info_url.format(x, y)
            pageurl_queue.put(u)
    # Start 100 producer threads
    for x in range(100):
        t = Producer(pageurl_queue, jobinfo_queue)
        t.start()
    # Crude synchronisation: give the producers a head start so the consumers
    # don't see two empty queues and exit immediately
    time.sleep(8)
    # Start 100 consumer threads
    for x in range(100):
        t = Consumer(pageurl_queue, jobinfo_queue)
        t.start()


if __name__ == '__main__':
    main()
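
Two follow-up sketches, neither part of the original script. First, reading the scraped file back: Consumer writes one record per line with the eleven fields separated by \001, so the file can be loaded like this (field names taken from the comment in Consumer.run):

FIELDS = ["jobname", "companyname", "companytype", "companysize", "companycity",
          "companysalary", "edulevel", "workingExp", "welfare", "needperson",
          "createdata"]

with open("qcwy.txt", encoding="utf-8") as f:
    for line in f:
        # Split on the \001 separator and pair each value with its field name
        record = dict(zip(FIELDS, line.rstrip("\n").split("\001")))
        print(record["jobname"], record["companysalary"])

Second, shutdown: the script relies on time.sleep(8) plus "both queues empty" checks, which can both strand a thread on a blocking get() and let consumers exit too early. A more robust pattern is one sentinel per consumer, pushed once the producers have finished; a minimal standalone sketch:

import threading
from queue import Queue

SENTINEL = None

def consumer(q):
    while True:
        item = q.get()
        if item is SENTINEL:  # producers are done; exit cleanly
            break
        print("processing", item)

q = Queue()
workers = [threading.Thread(target=consumer, args=(q,)) for _ in range(4)]
for w in workers:
    w.start()
for item in ["a", "b", "c"]:
    q.put(item)
for _ in workers:
    q.put(SENTINEL)  # one sentinel per consumer thread
for w in workers:
    w.join()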
