![178bc26d6a28e9f177010e9150d849f2.jpg](https://upload-images.jianshu.io/upload_images/9136378-068a8b1de5a0204f.jpg?imageMogr2/auto-orient/strip%7CimageView2/2/w/1240)
# This post only covers the main line of thinking and the points to watch out for. If anything is unclear, ask in the comments and work through the code given below.
1. A bit of inspection shows that Lagou loads its job data via AJAX. For any given position keyword, Lagou only ever shows a user 30 pages, and searches from different browsers see different data. The server actually holds more than 30 pages per keyword, though; like Baidu, it picks 30 pages to display according to the user's search and its own rules. By looping the request, we can pull down every page of job data the server has (a minimal sketch of the request follows the POST analysis below).
![image.png](https://upload-images.jianshu.io/upload_images/9136378-e5f7f26d796b3a8b.png?imageMogr2/auto-orient/strip%7CimageView2/2/w/1240)
2. Next, analyze the POST request; a minimal sketch of it follows the screenshot.
![image.png](https://upload-images.jianshu.io/upload_images/9136378-5bbacf3a4902ed34.png?imageMogr2/auto-orient/strip%7CimageView2/2/w/1240)
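Before the full spider, here is a minimal sketch of that POST. The endpoint URL and the form fields (`first`, `pn`, `kd`) come from the captured request above; the header set is trimmed down by me, and in practice Lagou also checks the session Cookie, so a request without one may get an anti-crawler refusal instead of data.

```python
import requests

url = "https://www.lagou.com/jobs/positionAjax.json?needAddtionalResult=false"
# Trimmed-down headers; the full spider below sends the complete captured set.
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.84 Safari/537.36",
    "Referer": "https://www.lagou.com/jobs/list_Python?labelWords=&fromSearch=true&suginput=",
}
# The form body carries the paging state: 'first' flags the first page,
# 'pn' is the page number, 'kd' is the search keyword.
data = {"first": "true", "pn": 1, "kd": "Python"}
resp = requests.post(url, headers=headers, data=data)
# Total number of matching jobs: far more than 30 pages' worth.
print(resp.json()["content"]["positionResult"]["totalCount"])
```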
3. All that is left is to construct the request, fetch the data, and write the parsing rules (the response layout is sketched after the screenshot; the full spider is at the end).
![image.png](https://upload-images.jianshu.io/upload_images/9136378-8955835c987e1f55.png?imageMogr2/auto-orient/strip%7CimageView2/2/w/1240)
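For reference, the job list in the returned JSON sits under `content -> positionResult -> result`, one entry per posting. A minimal sketch of the parse, reusing `resp` from the sketch above (the field names are the ones the full spider below extracts):

```python
items = resp.json()
for info in items["content"]["positionResult"]["result"]:
    # One dict per job posting; the spider below saves more of its fields.
    print(info["positionName"], info["salary"], info["companyFullName"])
```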
4. To find out when to stop, construct a request far past the last page (e.g. pn==1000), look at what comes back, and pick a field that identifies the last page; then an if check lets the program exit once the keyword's data has been fully crawled. Such a probe shows that past the last page the response's pageNo field comes back as 0; see the sketch below and the full code.
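A minimal probe along those lines, reusing `url` and `headers` from the sketch above (pn=1000 is just an arbitrary page number far past the end):

```python
data = {"first": "false", "pn": 1000, "kd": "Python"}
resp = requests.post(url, headers=headers, data=data)
# Past the last real page the request still succeeds, but the response's
# pageNo drops to 0 and the result list is empty; that is the exit signal.
print(resp.json()["content"]["pageNo"])  # expected: 0
```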
5. Read through the code; feel free to leave a comment about anything unclear.
import csv
import json

import requests
from retrying import retry

# CSV column names; they double as the keys of each parsed job dict.
headers = ['position', 'publish_time', 'work_experience', 'education',
           'salary', 'company_perks', 'position_tags', 'company_name']


class LagouSpider:
    def __init__(self):
        # Headers copied from the captured browser request. The Cookie ties
        # the requests to a real session; without it Lagou tends to answer
        # with an anti-crawler message instead of job data.
        self.header = {'Host': 'www.lagou.com',
                       'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.84 Safari/537.36',
                       'Accept': 'application/json, text/javascript, */*; q=0.01',
                       'Accept-Language': 'zh-CN,en-US;q=0.7,en;q=0.3',
                       'Accept-Encoding': 'gzip, deflate, br',
                       'Referer': 'https://www.lagou.com/jobs/list_Python?labelWords=&fromSearch=true&suginput=',
                       'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
                       'X-Requested-With': 'XMLHttpRequest',
                       'X-Anit-Forge-Token': 'None',
                       'X-Anit-Forge-Code': '0',
                       'Cookie': '_ga=GA1.2.1015646365.1538989463; user_trace_token=20181008170425-277ca381-cad9-11e8-bb68-5254005c3644; LGUID=20181008170425-277caaa8-cad9-11e8-bb68-5254005c3644; index_location_city=%E5%85%A8%E5%9B%BD; JSESSIONID=ABAAABAAAGGABCB4878F933065B5D43F6FB0748D8A0E39E; LGSID=20181018091439-2f6bfe7d-d273-11e8-bda4-5254005c3644; PRE_UTM=; PRE_HOST=www.google.com; PRE_SITE=https%3A%2F%2Fwww.google.com%2F; PRE_LAND=https%3A%2F%2Fwww.lagou.com%2F; _gid=GA1.2.465338117.1539825277; Hm_lvt_4233e74dff0ae5bd0a3d81c6ccf756e6=1538989463,1539825277,1539825286,1539825338; TG-TRACK-CODE=index_search; _gat=1; LGRID=20181018094311-2bef568a-d277-11e8-8058-525400f775ce; Hm_lpvt_4233e74dff0ae5bd0a3d81c6ccf756e6=1539826990; SEARCH_ID=44a9a700439e406a80372da370820d72',
                       'Connection': 'keep-alive',
                       'Pragma': 'no-cache',
                       'Cache-Control': 'no-cache'}
        self.start_url = "https://www.lagou.com/jobs/positionAjax.json?needAddtionalResult=false"

    @retry(stop_max_attempt_number=3)
    def get_post(self, url, data):
        # POST one page; retry up to 3 times on network or HTTP errors.
        response = requests.post(url, headers=self.header, data=data)
        response.raise_for_status()
        return response.text

    def get_data(self, kd, pn):
        # Form body of the AJAX POST: kd is the keyword, pn the page number.
        data = {
            "first": "false",
            "pn": pn,
            "kd": kd
        }
        return data

    def json2dict(self, infos):
        # Parse the raw response body; returns None on malformed JSON.
        try:
            items = json.loads(infos)
            return items
        except json.JSONDecodeError as e:
            print(e)

    def get_info(self, items):
        # The job list is nested under content -> positionResult -> result;
        # yield one flat dict per posting.
        try:
            for info in items["content"]["positionResult"]["result"]:
                item = {}
                item['position'] = info['positionName']
                item['publish_time'] = info['createTime']
                item['work_experience'] = info['workYear']
                item['education'] = info['education']
                item['salary'] = info['salary']
                # These two lists can be empty; store a placeholder instead.
                item['company_perks'] = info['companyLabelList'] if info['companyLabelList'] else 'N/A'
                # 'positionLables' is Lagou's own (misspelled) field name.
                item['position_tags'] = info['positionLables'] if info['positionLables'] else 'N/A'
                item['company_name'] = info['companyFullName']
                yield item
        except KeyError as e:
            print(e)

    def run(self, key):
        num = 1
        while True:
            try:
                data = self.get_data(kd=key, pn=num)
                infos = self.get_post(self.start_url, data=data)
                items = self.json2dict(infos)
                page = items["content"]["pageNo"]
                print("*" * 100)
                print("Crawling page %d" % page)
                print("*" * 100)
                try:
                    # Append this page's rows; utf-8-sig keeps the non-ASCII
                    # job data readable in Excel.
                    with open(key + '_lagou_jobs.csv', 'a', newline='', encoding='utf-8-sig') as f:
                        writer = csv.DictWriter(f, headers)
                        for i in self.get_info(items):
                            writer.writerow(i)
                except PermissionError as e:
                    print(e)
                num += 1
                # Past the last page Lagou reports pageNo == 0: all done.
                if page == 0:
                    print("Crawling finished")
                    break
            except TypeError as m:
                # items was None (bad JSON); report it and retry this page.
                print(m)


if __name__ == '__main__':
    key = input("Enter the keyword of the jobs to crawl: ")
    # Create the CSV and write the header row once, then start crawling.
    with open(key + '_lagou_jobs.csv', 'w', newline='', encoding='utf-8-sig') as f:
        writer = csv.DictWriter(f, headers)
        writer.writeheader()
    spider = LagouSpider()
    spider.run(key)