Web Crawler: urllib Module Application 7 -- Lagou

# Target url: https://www.lagou.com/jobs/positionAjax.json?needAddtionalResult=false
# Form data to submit with the POST request
from urllib import request,parse
import json,pymysql,time



def lagouspider(url, formdata):
    # Send the request and get the response body
    response_data = load_page_data(url, formdata)
    # The response body is a JSON string; parse it into Python data
    data = json.loads(response_data)
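    # The lookups below assume a response shaped roughly like this
    # (a sketch reconstructed from the keys this script reads,
    # not Lagou's full payload):
    # {
    #     "success": true,
    #     "content": {
    #         "pageNo": 1,
    #         "pageSize": 15,
    #         "positionResult": {
    #             "totalCount": 300,
    #             "result": [ {...one job dict per posting...} ]
    #         }
    #     }
    # }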
    if data['success']:
        print('Request succeeded')
        # Pull out the job postings
        positionJobs = data['content']['positionResult']['result']
        for jobinfo in positionJobs:
            jobdata = {}
            jobdata['positionName'] = jobinfo['positionName']
            jobdata['publishTime'] = jobinfo['formatCreateTime']
            jobdata['companyname'] = jobinfo['companyShortName']
            jobdata['salary'] = jobinfo['salary']
            jobdata['workYear'] = jobinfo['workYear']
            jobdata['education'] = jobinfo['education']
            jobdata['industry'] = jobinfo['industryField']
            jobdata['stage'] = jobinfo['financeStage']
            jobdata['companySize'] = jobinfo['companySize']
            jobdata['fuli'] = ','.join(jobinfo['companyLabelList'])
            # positionAdvantage is a plain string, so assign it directly;
            # ','.join() on a string would put a comma between every character
            jobdata['positionAdvantage'] = jobinfo['positionAdvantage']
            # Store the record
            save_data_to_db(jobdata)
        # Decide whether to request the next page
        # Current page number
        cur_page = int(data['content']['pageNo'])
        # Results per page
        page_size = int(data['content']['pageSize'])
        # Total number of positions
        totalcount = int(data['content']['positionResult']['totalCount'])
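        # Worked example: with pageNo=1, pageSize=15 and totalCount=300,
        # 1 * 15 = 15 < 300, so page 2 is requested next; the recursion
        # stops once cur_page * page_size reaches totalcount.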
        if cur_page * page_size < totalcount:
            next_page = cur_page + 1
            print('Requesting page ' + str(next_page))
            formdata['pn'] = next_page
            lagouspider(url, formdata)
    else:
        print('Request failed, retrying shortly')
        time.sleep(10)
        # formdata['pn'] is an int, so cast it before concatenating
        print('Retrying request for page ' + str(formdata['pn']))
        lagouspider(url, formdata)

def load_page_data(url, formdata):
    """
    Send the request
    :param url:
    :param formdata:
    :return:
    """
    # Encode the form dict into the url-encoded bytes a web server expects
    form_data = parse.urlencode(formdata).encode('utf-8')
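    # e.g. {'first': 'true', 'pn': 1, 'kd': 'c++'} encodes to
    # b'first=true&pn=1&kd=c%2B%2B' (the '+' in 'c++' becomes %2B)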
    req_header = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:64.0) Gecko/20100101 Firefox/64.0',
        'Referer': 'https://www.lagou.com/jobs/list_c%2B%2B?labelWords=&fromSearch=true&suginput=',
    }
    # Build a Request object; pass the encoded bytes, not the raw dict
    req = request.Request(url, headers=req_header, data=form_data)

    # Send the request described by the Request object
    response = request.urlopen(req)
    if response.status == 200:
        return response.read().decode('utf-8')

def save_data_to_db(jobdata):
    """
    Store one record
    :param jobdata:
    :return:
    """
    # Build the column list from jobdata's keys and one %s placeholder per value
    sql = """
    INSERT INTO lagou(%s)
    VALUES (%s)
    """ % (','.join(jobdata.keys()), ','.join(['%s'] * len(jobdata)))
    try:
        cursor.execute(sql,list(jobdata.values()))
        mysql_client.commit()
    except Exception as err:
        print(err)
        mysql_client.rollback()
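

def create_lagou_table():
    """
    A minimal sketch, assuming a simple all-VARCHAR schema (the original
    post never shows the table definition): create the `lagou` table this
    script inserts into, one column per key collected in lagouspider.
    Call it once before lagouspider() if the table does not exist yet.
    """
    columns = ['positionName', 'publishTime', 'companyname', 'salary',
               'workYear', 'education', 'industry', 'stage',
               'companySize', 'fuli', 'positionAdvantage']
    sql = 'CREATE TABLE IF NOT EXISTS lagou (%s)' % ','.join(
        '`%s` VARCHAR(200)' % col for col in columns)
    cursor.execute(sql)
    mysql_client.commit()
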
if __name__ == '__main__':

    # Create the database connection; pymysql 1.0+ requires keyword
    # arguments (the relevant defaults are:
    #  host=None, user=None, password="", database=None, port=0,
    #  unix_socket=None, charset='')
    mysql_client = pymysql.Connect(host='127.0.0.1', user='root',
                                   password='18603503110', database='1712B',
                                   port=3306, charset='utf8')
    # Create a cursor (used to execute the MySQL statements)
    cursor = mysql_client.cursor()
    # Target url
    url = 'https://www.lagou.com/jobs/positionAjax.json?needAddtionalResult=false'

    # Parameters to submit: pn is the page number, kd the search keyword
    formdata = {
        'first': 'true',
        'pn': 1,
        'kd': 'c++',
    }
    lagouspider(url, formdata)

