Crawling 51job Job Listings

Explanation:

First, get the key (code) for every city: locate the JSON string that holds all the cities and request the JS file it lives in. Then request page 1 of the search results to read the total page count, and loop over every page. On each page, collect the detail-page URL of every job listing, visit each detail page, and extract the job information we want.

Code:

import sqlite3,re,json
from urllib.request import urlopen, Request, urlretrieve
from urllib.error import HTTPError


def crawler_city_num():
    """
    请求并解析城市编码函数
    :return: 返回一个字典
    """
    # decode()函数默认使用utf8转化字节,但是51job网页采用的是gbk编码() charset属性就是用来设置网页的编码方式的,所以需要使用decode('gbk')将bytes转化成str,与网页编码方式同步!
    js_connect=urlopen('https://js.51jobcdn.com/in/js/2016/layer/area_array_c.js?20180319').read().decode('gbk')
    str_list = js_connect.split('=')
    str_dict = str_list[1].replace(';', '')
    dic = json.loads(str_dict)  # convert the JSON string into a dict
    new_dict = {}
    for key, value in dic.items():
        new_dict[value] = key
    return new_dict
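# A minimal sketch of what the parsing above does, assuming the JS file holds
# something like:  area={"010000":"北京","020000":"上海", ...};
#   js_connect.split('=')[1]   ->  '{"010000":"北京", ...};'
#   .replace(';', '')          ->  a valid JSON object string
#   json.loads(...)            ->  {"010000": "北京", ...}
#   inverting keys and values  ->  {"北京": "010000", ...}
# (the sample entries here are illustrative; the real file lists every city)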



class Crawler(object):
    def __init__(self,city,keys,city_dict):
        # city: the cities the user wants to search (from user input)
        # keys: the job keyword to search for
        # city codes: 郑州: 170200  北京: 010000  上海: 020000
        # https://search.51job.com/list/170200%252C010000%252C020000,000000,0000,00,9,99,Python,2,10.html
        city_code=''
        for key in city:
            city_code+=city_dict[key]
            if key!=city[-1]:
                city_code+='%252C'
        self.URL='https://search.51job.com/list/' + city_code + ',000000,0000,00,9,99,' + keys + ',2,{}.html'
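        # For example, with the codes above, searching 郑州 + 北京 yields
        # city_code == '170200%252C010000', producing the URL shown in the
        # comment above. '%252C' is a double URL-encoded comma
        # (',' -> '%2C' -> '%252C'), which is how 51job joins several city
        # codes in the path; {} is later filled in with the page number.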
    def get_html(self,page_num):
        """
        请求列表页url,获取网页源代码
        :param page_num: 当前页码
        :return: 返回网页源代码,交给下一个函数 进行解析
        """
        list_url=self.URL.format(page_num)
        try:
            response=urlopen(list_url).read().decode('gbk')
        except HTTPError as e:
            print("url: {} error: {}".format(list_url,e))
            return None
        else:
            return response
    def get_page_num(self):
        """
        获取搜索结果的总页数
        :return:
        """
        list_url = urlopen(self.URL.format(1)).read().decode('gbk')
        total_pattern=re.compile(r'共(.*?)页',re.S)
        total_num=int(re.findall(total_pattern,list_url)[0])
        return total_num
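    # A quick illustration of the pattern above: the list page contains text
    # like '共55页', so re.findall(r'共(.*?)页', html) returns ['55'] and
    # int(...) gives 55. (The sample value here is illustrative.)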

    def parse_html(self, html):
        """
        Parse the list-page source and collect each job's detail-page URL.
        :param html: list-page source code
        :return: list of detail-page URLs
        """
        if html:
            # each job row on the list page sits in a <div class="el">, with
            # the detail link inside <p class="t1"> (page markup as of 2018)
            detail_url_pattern = re.compile(
                r'<div class="el">.*?<p class="t1.*?'
                r'<a target="_blank" title=".*?" href="(.*?)"',
                re.S)  # extract the detail-page urls
            detail_url = re.findall(detail_url_pattern, html)
            return detail_url
        else:
            print('None')
            return []  # empty list so the caller's loop does not break

    def get_detail_html(self, detail_url):
        """
        Request a detail page.
        :param detail_url:
        :return: the detail-page source, handed to the next function for extraction
        """
        try:
            response = urlopen(detail_url).read().decode('gbk')
        except HTTPError as e:
            print("detail-page request failed: url={}, error={}".format(detail_url, e))
            return None
        else:
            return response

    def parse_detail(self, detail_html, detail_url):
        """
        Extract the data from a detail page.
        :return:
        """
        if not detail_html:
            return
        if 'jobs.51job.com' in detail_url:
            # the common page layout: title in <h1>, salary in <strong>,
            # company name in the <p class="cname"> block, description in
            # <div class="bmsg job_msg inbox"> (markup as of 2018)
            pattern = re.compile(
                r'<h1 title="(.*?)".*?<strong>(.*?)</strong>.*?'
                r'<p class="cname">.*?title="(.*?)".*?'
                r'<div class="bmsg job_msg inbox">(.*?)</div>', re.S)
            result = re.findall(pattern, detail_html)
            if result:
                print(result[0])
        elif '51rz.51job.com' in detail_url:
            print('51rz.51job.com', detail_url)
        else:
            print('other layout', detail_url)


if __name__ == '__main__':
    city_dict = crawler_city_num()
    city = input('Enter the cities to search (separate multiple cities with commas): ')
    # works like a ternary expression: take the first value if the condition
    # holds, otherwise the second
    city_list = city.split(',') if ',' in city else [city]
    kw = input('Enter the job keyword: ')
    obj = Crawler(city=city_list, keys=kw, city_dict=city_dict)
    total_num = obj.get_page_num()  # total number of result pages
    for number in range(1, total_num + 1):  # +1 so the last page is included
        print('Fetching page {}...'.format(number))
        list_html = obj.get_html(number)
        detail_urls = obj.parse_html(list_html)
        print(detail_urls)
        for detail_url in detail_urls:
            # hand each detail_url to the next function to request
            detail_html = obj.get_detail_html(detail_url)
            obj.parse_detail(detail_html, detail_url)
