Scraping Lagou job data with Python, Part 1: saving the data to a file
import requests
import time
import json
def get_data(url, page, lang_name):
    # Request one page of results from the Lagou Ajax endpoint and collect the
    # fields of interest for every position on that page.
    header = {
        'Content-Language': 'zh-CN',
        'Content-Type': 'application/json;charset=UTF-8',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.102 Safari/537.36',
        'Referer': 'https://www.lagou.com/jobs/list_Hadoop?px=default&city=%E5%85%A8%E5%9B%BD'}
    cookies = {
        'Cookie': '_ga=GA1.2.1864083170.1542538584; user_trace_token=20181118185622-96224099-eb20-11e8-a648-525400f775ce; LGUID=20181118185622-96224871-eb20-11e8-a648-525400f775ce; sensorsdata2015jssdkcross=%7B%22distinct_id%22%3A%221672676ed41274-09ce2d0583d289-3f674604-2073600-1672676ed42739%22%2C%22%24device_id%22%3A%221672676ed41274-09ce2d0583d289-3f674604-2073600-1672676ed42739%22%7D; showExpriedIndex=1; showExpriedCompanyHome=1; showExpriedMyPublish=1; index_location_city=%E6%B7%B1%E5%9C%B3; WEBTJ-ID=20181207230154-1678930781a414-046c47d4f0aadf-3f674604-2073600-1678930781b154; _gid=GA1.2.538058677.1544194914; Hm_lvt_4233e74dff0ae5bd0a3d81c6ccf756e6=1543851889,1543853652,1544080965,1544194915; LGSID=20181207230155-098306d9-fa31-11e8-8ce7-5254005c3644; PRE_UTM=m_cf_cpt_baidu_pc; PRE_HOST=www.baidu.com; PRE_SITE=https%3A%2F%2Fwww.baidu.com%2Fs%3Fwd%3D%25E6%258B%2589%25E9%2592%25A9%25E7%25BD%2591%26rsv_spt%3D1%26rsv_iqid%3D0xa0da035100015b9c%26issp%3D1%26f%3D8%26rsv_bp%3D0%26rsv_idx%3D2%26ie%3Dutf-8%26tn%3Dbaiduhome_pg%26rsv_enter%3D1%26rsv_sug3%3D6%26rsv_sug1%3D5%26rsv_sug7%3D100; PRE_LAND=https%3A%2F%2Fwww.lagou.com%2Flp%2Fhtml%2Fcommon.html%3Futm_source%3Dm_cf_cpt_baidu_pc; JSESSIONID=ABAAABAAAGGABCB09F792E3C88B0DF4712407BBD57C7D1A; _putrc=3CC2AD8D77BBA3E1; login=true; unick=%E6%9D%A8%E8%83%9C; hasDeliver=390; gate_login_token=c200afceae9db4a3f7c79f4414fb4daccc0054486437ba5b; TG-TRACK-CODE=index_navigation; SEARCH_ID=9d6b719d888940838e12dfaa67393bf8; _gat=1; Hm_lpvt_4233e74dff0ae5bd0a3d81c6ccf756e6=1544195025; LGRID=20181207230345-4b26a2a0-fa31-11e8-8ce7-5254005c3644'}
    data = {'first': 'true', 'pn': page, 'kd': lang_name, 'city': '全国'}
    response = requests.post(url, data=data, headers=header, cookies=cookies).json()
    list_con = response['content']['positionResult']['result']
    info_list = []
    for i in list_con:
        info = []
        info.append(i['companyId'])
        info.append(i['companyFullName'])
        info.append(i['companyShortName'])
        info.append(i['companySize'])
        info.append(i['createTime'])
        info.append(i['district'])
        info.append(i['education'])
        info.append(i['financeStage'])
        info.append(i['firstType'])
        info.append(i['industryField'])
        info.append(i['industryLables'])  # spelled this way in the API response
        info.append(i['jobNature'])
        info.append(i['linestaion'])      # likewise, the API's own spelling
        info.append(i['positionName'])
        info.append(i['salary'])
        info.append(i['secondType'])
        info.append(i['workYear'])
        info.append(i['positionAdvantage'])
        info_list.append(info)
    # time.sleep(2)
    return info_list
def main():
    lang_names = ['hadoop', '大数据开发', 'java工程师', 'ETL', '数据分析', 'BI工程师']
    url = 'https://www.lagou.com/jobs/positionAjax.json?px=default&needAddtionalResult=false'
    info_result = []
    # The endpoint expects a single search keyword per request, so crawl
    # pages 1-9 for each keyword in turn.
    for lang_name in lang_names:
        page = 1
        while page < 10:
            info = get_data(url, page, lang_name)
            info_result = info_result + info
            page += 1
    # Write the results to lagou.txt, one position per line
    with open('lagou.txt', 'w+', encoding='utf-8') as f:
        for row in info_result:
            f.write(str(row) + '\n')


if __name__ == '__main__':
    main()
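Writing each row with str(row) produces Python-list text that is awkward to parse back later. If a structured file is preferred, the same info_result list can also be written as CSV with the standard csv module. The sketch below is only an illustration: the COLUMNS list and the save_csv helper are names introduced here, with the column order matching the order in which get_data appends the fields.

import csv

# Column names follow the append order used in get_data; the header row is an
# addition for readability, not something the Lagou API returns.
COLUMNS = ['companyId', 'companyFullName', 'companyShortName', 'companySize', 'createTime',
           'district', 'education', 'financeStage', 'firstType', 'industryField',
           'industryLables', 'jobNature', 'linestaion', 'positionName', 'salary',
           'secondType', 'workYear', 'positionAdvantage']

def save_csv(info_result, path='lagou.csv'):
    # newline='' avoids blank lines on Windows; utf-8-sig lets Excel open the Chinese text directly
    with open(path, 'w', newline='', encoding='utf-8-sig') as f:
        writer = csv.writer(f)
        writer.writerow(COLUMNS)
        writer.writerows(info_result)

To use it, call save_csv(info_result) in place of the lagou.txt block in main().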
Scraping Lagou job data with Python, Part 2: saving the data to a MySQL database
import requests
import random
import pymysql
import time
import json

count = 0
url = 'https://www.lagou.com/jobs/positionAjax.json?px=default&needAddtionalResult=false'
headers = {
    'Accept': 'application/json, text/javascript, */*; q=0.01',
    'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
    'Origin': 'https://www.lagou.com',
    'Cookie': '_ga=GA1.2.1864083170.1542538584; user_trace_token=20181118185622-96224099-eb20-11e8-a648-525400f775ce; LGUID=20181118185622-96224871-eb20-11e8-a648-525400f775ce; sensorsdata2015jssdkcross=%7B%22distinct_id%22%3A%221672676ed41274-09ce2d0583d289-3f674604-2073600-1672676ed42739%22%2C%22%24device_id%22%3A%221672676ed41274-09ce2d0583d289-3f674604-2073600-1672676ed42739%22%7D; index_location_city=%E6%B7%B1%E5%9C%B3; LG_LOGIN_USER_ID=03ba7c800122a93fc9a30c04c636a39968228a2acdbe0721; JSESSIONID=ABAAABAAAGFABEF38A87E81C9125A8B1F61050C1399F77A; Hm_lvt_4233e74dff0ae5bd0a3d81c6ccf756e6=1545899342,1546053937,1546054187,1546671741; _gat=1; _gid=GA1.2.1058771725.1546671741; LGSID=20190105150212-d3be7ffe-10b7-11e9-b0eb-5254005c3644; PRE_UTM=; PRE_HOST=; PRE_SITE=; PRE_LAND=https%3A%2F%2Fwww.lagou.com%2F; TG-TRACK-CODE=index_navigation; LGRID=20190105150303-f1a7cbc3-10b7-11e9-b0eb-5254005c3644; Hm_lpvt_4233e74dff0ae5bd0a3d81c6ccf756e6=1546671791; SEARCH_ID=64c9cca8fc2744558199fd8f7ed4ab21',
    'Referer': 'https://www.lagou.com/jobs/list_?city=%E5%85%A8%E5%9B%BD&cl=false&fromSearch=true&labelWords=&suginput=',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.102 Safari/537.36'
}
keywords = ['hadoop', '大数据开发', 'java工程师', 'ETL', '数据分析', 'BI工程师']
# charset utf8mb4 so Chinese text is stored without mojibake
db = pymysql.connect(host="192.168.94.150", user="root", password="123456",
                     database="lagou", charset="utf8mb4")


def add_mysql(id, companyId, companyFullName, positionId, positionName, workYear, education,
              salary, jobNature, city, companySize, companyShortName, firstType, secondType,
              thirdType, createTime, formatCreateTime, financeStage, industryField,
              positionAdvantage, companyLogo):
    # Insert one position record; the parameterised query lets pymysql quote and
    # escape every value instead of splicing them into the statement by hand.
    try:
        cursor = db.cursor()
        sql = ('insert into data(id,companyId,companyFullName,positionId,positionName,workYear,'
               'education,salary,jobNature,city,companySize,companyShortName,firstType,secondType,'
               'thirdType,createTime,formatCreateTime,financeStage,industryField,positionAdvantage,'
               'companyLogo) values (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)')
        print(sql)
        cursor.execute(sql, (id, companyId, companyFullName, positionId, positionName, workYear,
                             education, salary, jobNature, city, companySize, companyShortName,
                             firstType, secondType, thirdType, createTime, formatCreateTime,
                             financeStage, industryField, positionAdvantage, companyLogo))
        print(cursor.lastrowid)
        db.commit()
    except Exception as e:
        print(e)
        db.rollback()


def get_message():
    global count
    # The endpoint accepts one search keyword per request, so crawl the first
    # 30 result pages for each keyword, pausing between requests to stay under
    # the site's anti-crawler limits.
    for kd in keywords:
        for i in range(1, 31):
            print('Keyword ' + kd + ', page ' + str(i))
            time.sleep(random.randint(5, 10))
            data = {
                'first': 'true',
                'pn': i,
                'kd': kd,
                'city': '全国'
            }
            response = requests.post(url=url, data=data, headers=headers)
            response.encoding = 'utf-8'
            results = json.loads(response.text)
            job_message = results['content']['positionResult']['result']
            for job in job_message:
                count += 1
                companyId = job['companyId']
                companyFullName = job['companyFullName']
                positionId = job['positionId']
                positionName = job['positionName']
                workYear = job['workYear']
                education = job['education']
                salary = job['salary']
                jobNature = job['jobNature']
                city = job['city']
                companySize = job['companySize']
                companyShortName = job['companyShortName']
                firstType = job['firstType']
                secondType = job['secondType']
                thirdType = job['thirdType']
                createTime = job['createTime']
                formatCreateTime = job['formatCreateTime']
                financeStage = job['financeStage']
                industryField = job['industryField']
                positionAdvantage = job['positionAdvantage']
                companyLogo = job['companyLogo']
                print(companyId)
                print(companyFullName)
                print(positionId)
                print(positionName)
                print(workYear)
                print(education)
                print(salary)
                print(jobNature)
                print(city)
                print(companySize)
                print(companyShortName)
                print(firstType)
                print(secondType)
                print(thirdType)
                print(createTime)
                print(formatCreateTime)
                print(financeStage)
                print(industryField)
                print(positionAdvantage + '\n\n')
                print(companyLogo)
                add_mysql(count, companyId, companyFullName, positionId, positionName, workYear,
                          education, salary, jobNature, city, companySize, companyShortName,
                          firstType, secondType, thirdType, createTime, formatCreateTime,
                          financeStage, industryField, positionAdvantage, companyLogo)


if __name__ == '__main__':
    get_message()
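The script above assumes a database named lagou with a table called data, but the table definition itself is never shown. The snippet below is only a plausible sketch of that schema, with every column type guessed from the kind of value stored in it (VARCHAR/TEXT for the free-form strings); adjust it to whatever the real table looks like.

import pymysql

# Hypothetical DDL: the column types are assumptions, not the author's schema.
CREATE_SQL = """
CREATE TABLE IF NOT EXISTS data (
    id INT PRIMARY KEY,
    companyId INT,
    companyFullName VARCHAR(255),
    positionId INT,
    positionName VARCHAR(255),
    workYear VARCHAR(64),
    education VARCHAR(64),
    salary VARCHAR(64),
    jobNature VARCHAR(64),
    city VARCHAR(64),
    companySize VARCHAR(64),
    companyShortName VARCHAR(255),
    firstType VARCHAR(255),
    secondType VARCHAR(255),
    thirdType VARCHAR(255),
    createTime VARCHAR(64),
    formatCreateTime VARCHAR(64),
    financeStage VARCHAR(64),
    industryField VARCHAR(255),
    positionAdvantage TEXT,
    companyLogo VARCHAR(512)
) DEFAULT CHARSET = utf8mb4
"""

db = pymysql.connect(host="192.168.94.150", user="root", password="123456",
                     database="lagou", charset="utf8mb4")
with db.cursor() as cursor:
    cursor.execute(CREATE_SQL)
db.commit()
db.close()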