换坑季-51Job前程无忧 Python爬虫

写了个简易的Python爬虫,实现对目的工作的分析。
说明:只用了正则 re 库进行数据处理,用 requests 发送请求,并开了 4 个简易的线程分别执行抓取函数。
url是以下界面的url:
换坑季-51Job前程无忧 Python爬虫_第1张图片
主要实现了以下CSV功能:
换坑季-51Job前程无忧 Python爬虫_第2张图片
全部代码:

import requests
import re
import csv
from threading import Thread


def req(i):
    """Worker: fetch a chunk of 51job search-result pages and append rows to CSV.

    :param i: iterable of fully-formatted search-result page URLs.

    For every job row found on a listing page, the detail page is fetched
    and one CSV row (title, url, description, company, area, salary, date)
    is appended to ./前程无忧.csv.

    NOTE(review): the regular expressions in the original post were garbled
    when the code was extracted from HTML (the tag markup inside the
    raw-string literals was stripped).  The patterns below are best-effort
    reconstructions of the 2018-era 51job markup — verify them against the
    live page source before relying on the output.
    """
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.110 Safari/537.36'}
    count = 1
    for url in i:
        try:
            response = requests.get(url=url, headers=headers, timeout=5)
            # 51job served GBK-encoded pages at the time this was written.
            response.encoding = 'GBK'
            # One tuple per job row:
            # (title, detail-url, company, area, salary, publish-date).
            # TODO confirm pattern — the original was lost in extraction.
            content = re.findall(
                r'<p class="t1\s*">.*?<a target="_blank" title="(.*?)" href="(.*?)".*?'
                r'<span class="t2"><a target="_blank" title="(.*?)".*?'
                r'<span class="t3">(.*?)</span>.*?'
                r'<span class="t4">(.*?)</span>.*?'
                r'<span class="t5">(.*?)</span>',
                response.text, re.S)
            for title, this_url, company, area, salary, date in content:
                try:
                    detail = requests.get(url=this_url, headers=headers, timeout=5)
                    detail.encoding = 'GBK'
                    # Job-description block; pattern reconstructed — TODO confirm.
                    text = re.findall(
                        r'<div class="bmsg job_msg inbox">(.*?)</div>',
                        detail.text, re.S)
                    # Strip tags then whitespace in two regex passes instead of
                    # the original long chain of .replace() calls.
                    final = re.sub(r'<[^>]+>', '', ''.join(text))
                    final = re.sub(r'\s+', '', final).replace('&nbsp;', '')
                    row = [title, this_url, final, company, area, salary, date]
                    # utf-8-sig so Excel opens the Chinese text correctly.
                    with open(r'./前程无忧.csv', 'a', newline='', encoding='utf-8-sig') as csvf:
                        csv.writer(csvf).writerow(row)
                    count = count + 1
                except requests.RequestException:
                    # Narrowed from the original bare ``except``.
                    print("此次请求详情失败!!!" + this_url)
        except requests.RequestException:
            # Original bugs fixed here: the whole ``for`` sat inside a single
            # try, so one failed page aborted the rest of the chunk, and the
            # message concatenated a *list* onto the string (TypeError).
            print('请求首页失败!' + url)
    print("共%d多少条信息~" % count)


if __name__ == '__main__':
    # Search-result URL template; the page number is substituted via .format().
    # Original bug fixed: '&degreefrom' had been mojibake'd into '°reefrom'.
    url = 'https://search.51job.com/list/090200,000000,0000,00,9,99,%25E8%25BF%2590%25E7%25BB%25B4%25E5%25B7%25A5%25E7%25A8%258B%25E5%25B8%2588,2,{}.html?lang=c&stype=1&postchannel=0000&workyear=99&cotype=99&degreefrom=99&jobterm=99&companysize=99&lonlat=0%2C0&radius=-1&ord_field=0&confirmdate=9&fromType=&dibiaoid=0&address=&line=&specialarea=00&from=&welfare='
    # Pages 1-23 split into four chunks, one per worker thread.
    # Original bug fixed: the fourth range appended to txt3 and txt4 stayed
    # empty, so thread t4 did no work while t3 carried a double share.
    txt1 = [url.format(page) for page in range(1, 6)]
    txt2 = [url.format(page) for page in range(6, 12)]
    txt3 = [url.format(page) for page in range(12, 18)]
    txt4 = [url.format(page) for page in range(18, 24)]
    for num, chunk in enumerate((txt1, txt2, txt3, txt4), start=1):
        Thread(target=req, args=(chunk,)).start()
        print('t%d线程开始!' % num)
  • 上面这个开了 4 个线程的代码其实对目标网站不太友好(并发请求过快,容易触发反爬)。
    以下代码演示了对单个职位详情页的爬取;批量抓取时建议改用生产者-消费者模式来控制请求节奏。

    # --- Single job-detail page scraper (producer/consumer candidate) ---
    # NOTE(review): every regular expression in the original post was garbled
    # when extracted from HTML (the tag markup inside the raw-string literals
    # was stripped).  The patterns below are best-effort reconstructions of
    # the 2018-era 51job markup — verify against the live page source.
    import re
    import requests


    url = 'https://jobs.51job.com/chengdu-jjq/114069603.html?s=01&t=0'
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.110 Safari/537.36'}
    response = requests.get(url=url, headers=headers, timeout=5)
    # 51job served GBK-encoded pages at the time this was written.
    response.encoding = 'GBK'

    # 工作职位 (job title) — TODO confirm pattern
    job = re.findall(r'<h1 title="(.*?)">', response.text)
    print(job)

    # 薪资 (salary) — TODO confirm pattern
    salary = re.findall(r'<div class="cn">.*?<strong>(.*?)</strong>', response.text, re.S)
    print(salary)

    # 公司名称 (company name) — this pattern survived extraction intact
    company = re.findall(r'target="_blank" title="(.*?)" class="catn">', response.text)
    print(company)

    # 公司性质 (company type, e.g. 民营公司) — TODO confirm pattern
    flag = re.findall(r'<p class="at"><em class="i_flag"></em>(.*?)</p>', response.text)
    print(flag)

    # 公司规模 (company size) — TODO confirm pattern
    people = re.findall(r'<p class="at"><em class="i_people"></em>(.*?)</p>', response.text)
    print(people)

    # 职位详细的内容 (meta line: city | experience | education | headcount | date)
    content = re.findall(r'<p class="msg ltype" title="(.*?)">', response.text, re.S)
    # Join the matches into one string for the field-level extractions below.
    content_str = ''.join(content)

    # 城市 (city) — the redundant ``.*?(X).*?`` wrappers around every group in
    # the original added nothing to findall and were dropped throughout.
    txt_city = re.findall(r'(成都.*?)\s', content_str)
    print(txt_city)

    # 招多少人 (headcount)
    txt_count = re.findall(r'(招\d人)', content_str)
    print(txt_count)

    # 经验 (experience): try "no experience" first, then "N years".
    txt_experience = re.findall(r'(无工作经验)', content_str)
    if len(txt_experience) == 0:
        txt_experience = re.findall(r'(\d+年经验)', content_str)
    print(txt_experience)

    # 发布日期 (publish date)
    txt_date = re.findall(r'(\d+-\d+发布)', content_str)
    print(txt_date)

    # 学历要求 (education; only 本科/大专 are matched).
    # Original bug fixed: the if/else was inverted — a successful 本科 match
    # was discarded and replaced by the literal '无学历要求'.
    txt_education = re.findall(r'(本科)', content_str)
    if len(txt_education) == 0:
        txt_education = re.findall(r'(大专)', content_str)
    if len(txt_education) == 0:
        txt_education = '无学历要求'
    print(txt_education)

    # 职位招聘要求内容描述 (description) — tag/whitespace stripping is done in
    # two regex passes instead of the original chain of .replace() calls.
    descrition = re.findall(r'<div class="bmsg job_msg inbox">(.*?)</div>', response.text, re.S)
    descrition = re.sub(r'<[^>]+>', '', ''.join(descrition))
    descrition = re.sub(r'\s+', '', descrition).replace('&nbsp;', '')
    print(descrition)
  • 你可能感兴趣的:(爬虫)