Scraping Lagou job data with Python, saving the results to a file or a database

Scraping Lagou data with Python, part 1: saving the data to a file

The script below POSTs search queries to Lagou's positionAjax.json endpoint, pulls the fields of interest out of the JSON response, and writes one position per line to lagou.txt.

 

import requests
import time
import json


def get_data(url, page, lang_name):
    header = {
        'Content-Language': 'zh-CN',
        'Content-Type': 'application/json;charset=UTF-8',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.102 Safari/537.36',
        'Referer': 'https://www.lagou.com/jobs/list_Hadoop?px=default&city=%E5%85%A8%E5%9B%BD'}
    # session-specific cookies copied from a logged-in browser session;
    # replace them with fresh values from your own session, stale cookies are typically rejected
    Cookies = {
    'Cookie': '_ga=GA1.2.1864083170.1542538584; user_trace_token=20181118185622-96224099-eb20-11e8-a648-525400f775ce; LGUID=20181118185622-96224871-eb20-11e8-a648-525400f775ce; sensorsdata2015jssdkcross=%7B%22distinct_id%22%3A%221672676ed41274-09ce2d0583d289-3f674604-2073600-1672676ed42739%22%2C%22%24device_id%22%3A%221672676ed41274-09ce2d0583d289-3f674604-2073600-1672676ed42739%22%7D; showExpriedIndex=1; showExpriedCompanyHome=1; showExpriedMyPublish=1; index_location_city=%E6%B7%B1%E5%9C%B3; WEBTJ-ID=20181207230154-1678930781a414-046c47d4f0aadf-3f674604-2073600-1678930781b154; _gid=GA1.2.538058677.1544194914; Hm_lvt_4233e74dff0ae5bd0a3d81c6ccf756e6=1543851889,1543853652,1544080965,1544194915; LGSID=20181207230155-098306d9-fa31-11e8-8ce7-5254005c3644; PRE_UTM=m_cf_cpt_baidu_pc; PRE_HOST=www.baidu.com; PRE_SITE=https%3A%2F%2Fwww.baidu.com%2Fs%3Fwd%3D%25E6%258B%2589%25E9%2592%25A9%25E7%25BD%2591%26rsv_spt%3D1%26rsv_iqid%3D0xa0da035100015b9c%26issp%3D1%26f%3D8%26rsv_bp%3D0%26rsv_idx%3D2%26ie%3Dutf-8%26tn%3Dbaiduhome_pg%26rsv_enter%3D1%26rsv_sug3%3D6%26rsv_sug1%3D5%26rsv_sug7%3D100; PRE_LAND=https%3A%2F%2Fwww.lagou.com%2Flp%2Fhtml%2Fcommon.html%3Futm_source%3Dm_cf_cpt_baidu_pc; JSESSIONID=ABAAABAAAGGABCB09F792E3C88B0DF4712407BBD57C7D1A; _putrc=3CC2AD8D77BBA3E1; login=true; unick=%E6%9D%A8%E8%83%9C; hasDeliver=390; gate_login_token=c200afceae9db4a3f7c79f4414fb4daccc0054486437ba5b; TG-TRACK-CODE=index_navigation; SEARCH_ID=9d6b719d888940838e12dfaa67393bf8; _gat=1; Hm_lpvt_4233e74dff0ae5bd0a3d81c6ccf756e6=1544195025; LGRID=20181207230345-4b26a2a0-fa31-11e8-8ce7-5254005c3644'}

    data = {'first': 'true', 'pn': page, 'kd': lang_name, 'city': '全国'}

    response = requests.post(url, data, headers=header, cookies=Cookies).json()
    list_con = response['content']['positionResult']['result']
    # Field names come straight from Lagou's JSON payload; note that
    # 'industryLables' and 'linestaion' are the API's own spellings.
    fields = ['companyId', 'companyFullName', 'companyShortName', 'companySize',
              'createTime', 'district', 'education', 'financeStage', 'firstType',
              'industryField', 'industryLables', 'jobNature', 'linestaion',
              'positionName', 'salary', 'secondType', 'workYear', 'positionAdvantage']
    info_list = [[i[field] for field in fields] for i in list_con]
    return info_list

def main():
    lang_names = ['hadoop', '大数据开发', 'java工程师', 'ETL', '数据分析', 'BI工程师']
    url = 'https://www.lagou.com/jobs/positionAjax.json?px=default&needAddtionalResult=false'
    info_result = []
    # 'kd' takes a single search keyword, so query each keyword page by page
    for lang_name in lang_names:
        for page in range(1, 10):
            info_result += get_data(url, page, lang_name)
    # write the results to lagou.txt, one position per line
    with open('lagou.txt', 'w', encoding='utf-8') as f:
        for row in info_result:
            f.write(str(row) + '\n')


if __name__ == '__main__':
    main()
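Writing str(row) produces Python list literals that are awkward to parse back later. As a minimal alternative sketch, assuming the same info_result produced by main() above, the rows can be written as a proper CSV with a header (the columns list here simply mirrors the fields list in get_data()):

import csv

columns = ['companyId', 'companyFullName', 'companyShortName', 'companySize',
           'createTime', 'district', 'education', 'financeStage', 'firstType',
           'industryField', 'industryLables', 'jobNature', 'linestaion',
           'positionName', 'salary', 'secondType', 'workYear', 'positionAdvantage']

with open('lagou.csv', 'w', newline='', encoding='utf-8-sig') as f:
    writer = csv.writer(f)
    writer.writerow(columns)       # header row
    writer.writerows(info_result)  # one row per position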

 

Scraping Lagou data with Python, part 2: saving the data to a MySQL database

This variant pages through up to 30 result pages per keyword and inserts each parsed position into MySQL as it goes.
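The original post does not show the schema of the data table the script inserts into, so the following is only a minimal sketch of a table that would accept the 21 values passed to add_mysql() below; the column types and lengths are assumptions and may need adjusting:

import pymysql

# Hypothetical DDL inferred from the insert statement in add_mysql();
# the types and lengths are guesses, widen them as needed for real data.
CREATE_TABLE = '''
CREATE TABLE IF NOT EXISTS data (
    id                INT PRIMARY KEY,
    companyId         INT,
    companyFullName   VARCHAR(255),
    positionId        INT,
    positionName      VARCHAR(255),
    workYear          VARCHAR(64),
    education         VARCHAR(64),
    salary            VARCHAR(64),
    jobNature         VARCHAR(64),
    city              VARCHAR(64),
    companySize       VARCHAR(64),
    companyShortName  VARCHAR(255),
    firstType         VARCHAR(128),
    secondType        VARCHAR(128),
    thirdType         VARCHAR(128),
    createTime        VARCHAR(64),
    formatCreateTime  VARCHAR(64),
    financeStage      VARCHAR(64),
    industryField     VARCHAR(255),
    positionAdvantage VARCHAR(512),
    companyLogo       VARCHAR(512)
) DEFAULT CHARSET=utf8mb4
'''

conn = pymysql.connect(host='192.168.94.150', user='root', password='123456',
                       database='lagou', charset='utf8mb4')
with conn.cursor() as cur:
    cur.execute(CREATE_TABLE)
conn.commit()
conn.close()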

import requests
import random
import pymysql
import time
import json

count = 0
url = 'https://www.lagou.com/jobs/positionAjax.json?px=default&needAddtionalResult=false'
headers = {
    'Accept': 'application/json, text/javascript, */*; q=0.01',
    'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
    'Origin': 'https://www.lagou.com',
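    # session-specific cookies copied from a logged-in browser; replace with your own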
    'Cookie': '_ga=GA1.2.1864083170.1542538584; user_trace_token=20181118185622-96224099-eb20-11e8-a648-525400f775ce; LGUID=20181118185622-96224871-eb20-11e8-a648-525400f775ce; sensorsdata2015jssdkcross=%7B%22distinct_id%22%3A%221672676ed41274-09ce2d0583d289-3f674604-2073600-1672676ed42739%22%2C%22%24device_id%22%3A%221672676ed41274-09ce2d0583d289-3f674604-2073600-1672676ed42739%22%7D; index_location_city=%E6%B7%B1%E5%9C%B3; LG_LOGIN_USER_ID=03ba7c800122a93fc9a30c04c636a39968228a2acdbe0721; JSESSIONID=ABAAABAAAGFABEF38A87E81C9125A8B1F61050C1399F77A; Hm_lvt_4233e74dff0ae5bd0a3d81c6ccf756e6=1545899342,1546053937,1546054187,1546671741; _gat=1; _gid=GA1.2.1058771725.1546671741; LGSID=20190105150212-d3be7ffe-10b7-11e9-b0eb-5254005c3644; PRE_UTM=; PRE_HOST=; PRE_SITE=; PRE_LAND=https%3A%2F%2Fwww.lagou.com%2F; TG-TRACK-CODE=index_navigation; LGRID=20190105150303-f1a7cbc3-10b7-11e9-b0eb-5254005c3644; Hm_lpvt_4233e74dff0ae5bd0a3d81c6ccf756e6=1546671791; SEARCH_ID=64c9cca8fc2744558199fd8f7ed4ab21',
    'Referer': 'https://www.lagou.com/jobs/list_?city=%E5%85%A8%E5%9B%BD&cl=false&fromSearch=true&labelWords=&suginput=',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.102 Safari/537.36'
}

# keyword arguments are required by pymysql 1.x; utf8mb4 keeps Chinese text intact
db = pymysql.connect(host='192.168.94.150', user='root', password='123456',
                     database='lagou', charset='utf8mb4')
def add_mysql(id, companyId, companyFullName, positionId, positionName, workYear,
              education, salary, jobNature, city, companySize, companyShortName,
              firstType, secondType, thirdType, createTime, formatCreateTime,
              financeStage, industryField, positionAdvantage, companyLogo):
    try:
        cursor = db.cursor()
        # parameterized insert: pymysql escapes and quotes every value, which the
        # original '%s:%s:...' string interpolation (colons, no quoting) could not do
        sql = ('insert into data(id,companyId,companyFullName,positionId,positionName,'
               'workYear,education,salary,jobNature,city,companySize,companyShortName,'
               'firstType,secondType,thirdType,createTime,formatCreateTime,financeStage,'
               'industryField,positionAdvantage,companyLogo) '
               'values (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)')
        cursor.execute(sql, (id, companyId, companyFullName, positionId, positionName,
                             workYear, education, salary, jobNature, city, companySize,
                             companyShortName, firstType, secondType, thirdType,
                             createTime, formatCreateTime, financeStage, industryField,
                             positionAdvantage, companyLogo))
        print(cursor.lastrowid)
        db.commit()
    except Exception as e:
        print(e)
        db.rollback()


def get_message():
    global count
    keywords = ['hadoop', '大数据开发', 'java工程师', 'ETL', '数据分析', 'BI工程师']
    for keyword in keywords:  # 'kd' takes a single search keyword per request
        for i in range(1, 31):
            print(keyword + ' page ' + str(i))
            time.sleep(random.randint(5, 10))  # throttle to dodge the anti-crawler limit
            data = {
                'first': 'true',
                'pn': i,
                'kd': keyword,
                'city': '全国'
            }
            response = requests.post(url=url, data=data, headers=headers)
            response.encoding = 'utf-8'
            results = json.loads(response.text)
            job_message = results['content']['positionResult']['result']
            for job in job_message:
                count += 1
                # pull the fields in the exact order add_mysql() expects them
                fields = ['companyId', 'companyFullName', 'positionId', 'positionName',
                          'workYear', 'education', 'salary', 'jobNature', 'city',
                          'companySize', 'companyShortName', 'firstType', 'secondType',
                          'thirdType', 'createTime', 'formatCreateTime', 'financeStage',
                          'industryField', 'positionAdvantage', 'companyLogo']
                values = [job[field] for field in fields]
                print(*values, sep='\n')
                print()
                add_mysql(count, *values)


if __name__ == '__main__':
    get_message()
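As a quick sanity check on what was stored, here is a small sketch, assuming the same connection settings and the hypothetical data table above, that counts the scraped positions per city:

import pymysql

conn = pymysql.connect(host='192.168.94.150', user='root', password='123456',
                       database='lagou', charset='utf8mb4')
with conn.cursor() as cur:
    # group the stored positions by city, most postings first
    cur.execute('SELECT city, COUNT(*) FROM data GROUP BY city ORDER BY COUNT(*) DESC')
    for city, n in cur.fetchall():
        print(city, n)
conn.close()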

 

 

 

 

 

 

 


 
