# Python crawler: scrape a company's job postings from 51job (qianchengwuyou / jobs.51job.com)

# -*- coding:utf-8 -*-
import requests
import re
import random
import time
import pandas as pd
from bs4 import BeautifulSoup
import lxml

# Every request below is made with verify=False, which would otherwise print an
# InsecureRequestWarning per call; silence that specific warning category here.
from requests.packages.urllib3.exceptions import InsecureRequestWarning
requests.packages.urllib3.disable_warnings(InsecureRequestWarning)  ### suppress SSL certificate warnings

def clear(val_list):
    """Strip whitespace/control characters (space, newline, tab, CR, FF) from a string.

    Args:
        val_list: the raw text extracted from the page (a str, despite the name).

    Returns:
        The text with all of the characters above removed.
    """
    # BUG FIX: the original loop did `val = re.sub(i, '', val_list)`, re-reading
    # the untouched input every iteration, so only the LAST character in the
    # list ('\f') was ever removed.  A single character-class substitution
    # removes them all in one pass (the list also contained '\r' twice and a
    # literal tab alongside '\t').
    return re.sub(r'[ \n\t\r\f]', '', val_list)




class job(object):
    """Scraper for one company's job-listing pages on jobs.51job.com.

    Workflow: ``load()`` -> ``getdata()`` (paginated listing table) ->
    ``getdetails()`` per posting URL -> write everything to ``51job.xlsx``.
    """

    def __init__(self, url):
        """
        Args:
            url: the company listing page, e.g.
                 ``https://jobs.51job.com/all/co4148005.html#syzw``.
        """
        self.url = url
        # One Session reuses the TCP connection and carries headers/cookies
        # across the listing pages and every detail-page request.
        self.s = requests.session()
        headers = {
                'Accept':'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
                'Accept-Encoding':'gzip, deflate, br',
                'Accept-Language':'zh-CN,zh;q=0.9',
                'Cache-Control':'max-age=0',
                'Connection':'keep-alive',
                'Host':'jobs.51job.com',
                'Upgrade-Insecure-Requests':'1',
                'User-Agent':'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.15 Safari/537.36',
                   }
        self.s.headers.update(headers)  # browser-like headers for every request

    def getdata(self):
        """Fetch listing pages 1-15 and return one DataFrame of job rows.

        Returns:
            DataFrame with columns 职位/链接/要求/地区/薪酬/更新日期.

        NOTE(review): page count (15) and 'hidTotal' (1060) are hard-coded for
        this particular company — confirm against the live page when reusing.
        """
        frames = []  # avoid shadowing the builtin `all`
        for page in range(1, 16):
            time.sleep(0.3)  # throttle so we do not hammer the server
            data = {
                'pageno': str(page),
                'hidTotal': '1060',
                'type': 'undefined',
                'code': 'undefined',
            }

            req = self.s.post(url=self.url, data=data, verify=False).text
            title = re.findall('title="(.*?)">', req)  # job title
            # BUG FIX: '[https]' was a character CLASS matching one letter,
            # not the scheme; 'https?' matches the intended http/https prefix.
            href = re.findall('href="(https?.*?)"', req)  # detail-page link
            t2 = re.findall('class="t2">(.*?)<', req)  # requirements
            t3 = re.findall('class="t3">(.*?)<', req)  # location
            t4 = re.findall('class="t4">(.*?)<', req)  # salary
            t5 = re.findall('class="t5">(.*?)<', req)  # date updated
            frames.append(pd.DataFrame({
                '职位': title,
                '链接': href,
                '要求': t2,
                '地区': t3,
                '薪酬': t4,
                '更新日期': t5,
            }))
        return pd.concat(frames, ignore_index=True)

    def getdetails(self, url):
        """Fetch one posting's detail page and extract its text sections.

        Args:
            url: absolute URL of a single job posting.

        Returns:
            Tuple (job_information, address, departmental, company); the
            department section is '' when the page has only three boxes.
        """
        req = self.s.get(url=url, verify=False).text
        html = BeautifulSoup(req, 'lxml')
        tCompany_main = html.find(class_='tCompany_main')
        tBorderTop_box = tCompany_main.find_all(class_='tBorderTop_box')
        # BUG FIX: the original tested `len(...) != 3` and then indexed [3],
        # which raised IndexError whenever fewer than 3 boxes were present
        # (and wrongly took the 4-box path for 2 boxes).  Only index [3] when
        # there really are at least 4 sections.
        if len(tBorderTop_box) >= 4:
            job_information = clear(tBorderTop_box[0].get_text().strip())  # job description
            address = clear(tBorderTop_box[1].get_text().strip())          # contact / address
            departmental = clear(tBorderTop_box[2].get_text().strip())     # department info
            company = clear(tBorderTop_box[3].get_text().strip())          # company info
        else:
            job_information = clear(tBorderTop_box[0].get_text().strip())  # job description
            address = clear(tBorderTop_box[1].get_text().strip())          # contact / address
            departmental = ''                                              # no department section
            company = clear(tBorderTop_box[2].get_text().strip())          # company info

        return job_information, address, departmental, company

    def load(self):
        """Crawl the listing plus every detail page and write 51job.xlsx."""
        df = self.getdata()
        df.loc[:, "职位信息"] = ''
        df.loc[:, "地址"] = ''
        df.loc[:, "部门信息"] = ''
        urls = df['链接'].values.tolist()
        for i, link in enumerate(urls):
            details = self.getdetails(link)
            df.at[i, '职位信息'] = details[0]
            df.at[i, '地址'] = details[1]
            df.at[i, '部门信息'] = details[2]

        # `encoding=` was dropped: xlsx writers ignored it and the parameter
        # was removed from to_excel in modern pandas; index=0 -> index=False.
        df.to_excel('51job.xlsx', index=False)


if __name__ == '__main__':
    # Company listing page to crawl; the co<id> path segment selects the employer.
    target = 'https://jobs.51job.com/all/co4148005.html#syzw'
    scraper = job(target)
    scraper.load()

 

# (blog-export footer, originally: "You may also be interested in: python, crawlers")