python文件打开方式详解——a、a+、r+、w+区别
Python模块学习 - openpyxl
1.信息获取,所需工具:拉勾网、Python3。 原来课程地址:python拉勾网爬虫
反爬:伪造浏览器请求
'User-Agent':'Mozilla/5.0 (iPhone; CPU iPhone OS 10_3 like Mac OS X) AppleWebKit/602.1.50
(KHTML, like Gecko) CriOS/56.0.2924.75 Mobile/14E5239e Safari/602.1'
本质是http请求,json也是请求,找到正确url(小心post,get请求)
多观察,对获取数据检查是否符合要求。
编码问题: 小心写入方式(追加,覆盖)
with open ('lagou.json','ab+') as fp:
fp.write(line.encode('utf-8'))
哈哈,发现大家挺喜欢的:上代码
getdata.py
#导入需要的库
import requests #请求网页
from bs4 import BeautifulSoup #解析
import json
import time
import numpy as np
import pandas as pd
import openpyxl
# Request headers captured from a real, logged-in browser session.
# The Cookie value carries the author's Lagou session; Referer/Origin and
# the X-Anit-Forge-* fields mimic the site's own AJAX search request so the
# POST below passes its anti-crawling checks.
# NOTE(review): the Cookie is session-bound and will expire — requests made
# with a stale cookie presumably return an anti-crawl page; verify before use.
headers={
'Cookie':'_ga=GA1.2.2046537735.1519346482; user_trace_token=20180223084123-462157d6-1832-11e8-8df7-525400f775ce; LGUID=20180223084123-46215dc1-1832-11e8-8df7-525400f775ce; LG_LOGIN_USER_ID=a74eb645299f49ec2b1f0f98d8f27071b23ad1b8c3e4a22f; showExpriedIndex=1; showExpriedCompanyHome=1; showExpriedMyPublish=1; index_location_city=%E4%B8%8A%E6%B5%B7; WEBTJ-ID=20180417084813-162d112cf2b4d9-0300e63ea642e6-4545092c-2073600-162d112cf2c306; Hm_lvt_4233e74dff0ae5bd0a3d81c6ccf756e6=1523407416,1523407491,1523926094; _gid=GA1.2.1279679635.1523926094; LGSID=20180417084813-0203736a-41d9-11e8-b8a4-5254005c3644; PRE_UTM=m_cf_cpt_sogou_pc; PRE_HOST=www.sogou.com; PRE_SITE=https%3A%2F%2Fwww.sogou.com%2Fsogou%3Fquery%3D%25C0%25AD%25B9%25B4%26_asf%3Dwww.sogou.com%26_ast%3D1523926090%26w%3D01019900%26p%3D40040100%26pid%3Dsogou-site-c02d0450cdd75ce7%26sut%3D1050%26sst0%3D1523926089994%26lkt%3D0%252C0%252C0; PRE_LAND=https%3A%2F%2Fwww.lagou.com%2Flp%2Fhtml%2Fcommon.html%3Futm_source%3Dm_cf_cpt_sogou_pc; _putrc=6EA73BBBB51DEF7E; JSESSIONID=ABAAABAAADEAAFI11A5943D5AD8FA9FDAB240DFAD660213; login=true; unick=%E5%86%AF%E7%AB%B9%E5%90%9B; hasDeliver=134; gate_login_token=cc1c59fd3fc91706eb01534899470d38000e54a63b0db428; TG-TRACK-CODE=index_search; LGRID=20180417084823-0861f794-41d9-11e8-88dc-525400f775ce; Hm_lpvt_4233e74dff0ae5bd0a3d81c6ccf756e6=1523926106; SEARCH_ID=6be8a0349e5e4993a98387489af5cc6c',
'Host':'www.lagou.com',
'Referer':'https://www.lagou.com/jobs/list_%E6%95%B0%E6%8D%AE%E5%88%86%E6%9E%90?labelWords=&fromSearch=true&suginput=',
'Origin':'https://www.lagou.com',
'X-Anit-Forge-Code':'0',
'X-Anit-Forge-Token':'None',
'X-Requested-With':'XMLHttpRequest',
'User-Agent':'Mozilla/5.0 (iPhone; CPU iPhone OS 10_3 like Mac OS X) AppleWebKit/602.1.50 (KHTML, like Gecko) CriOS/56.0.2924.75 Mobile/14E5239e Safari/602.1'
} # forged browser request headers
positions=[] # module-level accumulator: one dict per scraped position
def main():
    """Crawl Lagou's position-search AJAX endpoint and persist the results.

    For each result page, POSTs the search form for the keyword '数据分析',
    appends the page's position list to 'lagou.json' (one JSON document per
    line — JSON Lines format) and to the module-level ``positions`` list,
    then sleeps between requests to throttle the crawl.

    Side effects: network POSTs, appends to 'lagou.json', prints each page,
    mutates the module-level ``positions`` list.
    """
    for x in range(1, 3):  # pages to crawl (here: pages 1 and 2)
        # POST form parameters: 'pn' is the page number, 'kd' the keyword.
        data = {
            'first': 'true',
            'pn': x,
            'kd': '数据分析'
        }
        result = requests.post(
            'https://www.lagou.com/jobs/positionAjax.json?px=default&city=%E6%88%90%E9%83%BD&needAddtionalResult=false',
            headers=headers, data=data)  # search request for this page
        # NOTE(review): assumes the response is the normal JSON payload;
        # anti-crawl responses lack these keys and would raise KeyError.
        json_result = result.json()
        page_position = json_result['content']['positionResult']['result']
        print(page_position)
        # ensure_ascii=False keeps the Chinese text human-readable on disk.
        line = json.dumps(page_position, ensure_ascii=False)
        # BUG FIX: the original used mode 'ab+' (the '+' read capability was
        # never used) and wrote documents back-to-back, so the resulting file
        # was not parseable JSON. Appending a newline after each document
        # makes the file valid JSON Lines.
        with open('lagou.json', 'ab') as fp:
            fp.write(line.encode('utf-8'))
            fp.write(b'\n')
        positions.extend(page_position)  # accumulate every page in one list
        time.sleep(3)  # throttle: pause 3s between page requests


if __name__ == '__main__':  # script entry point
    main()
get_detail.py
#导入需要的库
import requests #请求网页
from bs4 import BeautifulSoup #解析
import json
import time
import numpy as np
import pandas as pd
import openpyxl
# Request headers captured from a real, logged-in browser session.
# The Cookie value carries the author's Lagou session; Referer/Origin and
# the X-Anit-Forge-* fields mimic the site's own AJAX search request so the
# POST in main() passes its anti-crawling checks.
# NOTE(review): the Cookie is session-bound and will expire — requests made
# with a stale cookie presumably return an anti-crawl page; verify before use.
headers={
'Cookie':'_ga=GA1.2.2046537735.1519346482; user_trace_token=20180223084123-462157d6-1832-11e8-8df7-525400f775ce; LGUID=20180223084123-46215dc1-1832-11e8-8df7-525400f775ce; LG_LOGIN_USER_ID=a74eb645299f49ec2b1f0f98d8f27071b23ad1b8c3e4a22f; showExpriedIndex=1; showExpriedCompanyHome=1; showExpriedMyPublish=1; index_location_city=%E4%B8%8A%E6%B5%B7; WEBTJ-ID=20180417084813-162d112cf2b4d9-0300e63ea642e6-4545092c-2073600-162d112cf2c306; Hm_lvt_4233e74dff0ae5bd0a3d81c6ccf756e6=1523407416,1523407491,1523926094; _gid=GA1.2.1279679635.1523926094; LGSID=20180417084813-0203736a-41d9-11e8-b8a4-5254005c3644; PRE_UTM=m_cf_cpt_sogou_pc; PRE_HOST=www.sogou.com; PRE_SITE=https%3A%2F%2Fwww.sogou.com%2Fsogou%3Fquery%3D%25C0%25AD%25B9%25B4%26_asf%3Dwww.sogou.com%26_ast%3D1523926090%26w%3D01019900%26p%3D40040100%26pid%3Dsogou-site-c02d0450cdd75ce7%26sut%3D1050%26sst0%3D1523926089994%26lkt%3D0%252C0%252C0; PRE_LAND=https%3A%2F%2Fwww.lagou.com%2Flp%2Fhtml%2Fcommon.html%3Futm_source%3Dm_cf_cpt_sogou_pc; _putrc=6EA73BBBB51DEF7E; JSESSIONID=ABAAABAAADEAAFI11A5943D5AD8FA9FDAB240DFAD660213; login=true; unick=%E5%86%AF%E7%AB%B9%E5%90%9B; hasDeliver=134; gate_login_token=cc1c59fd3fc91706eb01534899470d38000e54a63b0db428; TG-TRACK-CODE=index_search; LGRID=20180417084823-0861f794-41d9-11e8-88dc-525400f775ce; Hm_lpvt_4233e74dff0ae5bd0a3d81c6ccf756e6=1523926106; SEARCH_ID=6be8a0349e5e4993a98387489af5cc6c',
'Host':'www.lagou.com',
'Referer':'https://www.lagou.com/jobs/list_%E6%95%B0%E6%8D%AE%E5%88%86%E6%9E%90?labelWords=&fromSearch=true&suginput=',
'Origin':'https://www.lagou.com',
'X-Anit-Forge-Code':'0',
'X-Anit-Forge-Token':'None',
'X-Requested-With':'XMLHttpRequest',
'User-Agent':'Mozilla/5.0 (iPhone; CPU iPhone OS 10_3 like Mac OS X) AppleWebKit/602.1.50 (KHTML, like Gecko) CriOS/56.0.2924.75 Mobile/14E5239e Safari/602.1'
} # forged browser request headers
positions=[] # module-level accumulator: one dict per scraped position
def page_detail(id):
    """Fetch one job-detail page and print its advantage and description text.

    id -- the Lagou positionId used to build the detail-page URL.

    Side effects: one GET request, prints the scraped sections (or None
    when a section is missing), sleeps 1s to throttle the crawl.
    """
    url = 'https://www.lagou.com/jobs/%s.html' % id
    # Detail pages are plain HTML, so only browser-like headers are needed
    # (no AJAX/anti-forge fields).
    headers = {
        'Host': 'www.lagou.com',
        'Referer': 'https://www.lagou.com/jobs/list_%E6%95%B0%E6%8D%AE%E5%88%86%E6%9E%90?px=default&city=%E6%88%90%E9%83%BD',
        'Upgrade-Insecure-Requests': '1',
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36'
    }
    req = requests.get(url, headers=headers)
    soup = BeautifulSoup(req.content, 'lxml')
    googs = soup.select('#job_detail > dd.job-advantage > p')
    woekerjob = soup.select('#job_detail > dd.job_bt > div')
    # BUG FIX: the original tested `googs[0]==''`, which compares a bs4 Tag
    # to a string (always False) and raises IndexError when the selector
    # matched nothing (e.g. an anti-crawl page). Test the lists for
    # emptiness before indexing instead.
    if not googs:
        print(None)
    else:
        print(googs[0].get_text())
    if not woekerjob:
        print(None)
    else:
        print(woekerjob[0].get_text())
    time.sleep(1)  # throttle: pause between detail-page requests
def main():
    """Crawl the search endpoint, then fetch each position's detail page.

    For each result page, POSTs the search form for the keyword '数据分析',
    fetches the detail page of every position via page_detail(), extends
    the module-level ``positions`` list with the raw position dicts, and
    sleeps between pages to throttle the crawl.

    Side effects: network requests, prints, mutates module-level ``positions``.
    """
    for x in range(1, 2):  # pages to crawl (here: page 1 only)
        # POST form parameters: 'pn' is the page number, 'kd' the keyword.
        data = {
            'first': 'true',
            'pn': x,
            'kd': '数据分析'
        }
        result = requests.post(
            'https://www.lagou.com/jobs/positionAjax.json?px=default&city=%E6%88%90%E9%83%BD&needAddtionalResult=false',
            headers=headers, data=data)  # search request for this page
        # NOTE(review): assumes the response is the normal JSON payload;
        # anti-crawl responses lack these keys and would raise KeyError.
        json_result = result.json()
        page_position = json_result['content']['positionResult']['result']
        # BUG FIX: the original also built a `position_dict` per position
        # that was never used anywhere — dead code, removed.
        for position in page_position:
            position_number = position["positionId"]
            print(position_number)
            page_detail(position_number)  # fetch and print the detail page
        positions.extend(page_position)  # accumulate every page in one list
        time.sleep(3)  # throttle: pause 3s between page requests


if __name__ == '__main__':  # script entry point
    main()
其中 get_detail 里面在获取的数据为空时处理得不好,总是报 IndexError: list index out of range,这个问题等我接着学习之后再解决。
总结:发现自己对 Python 的字典不熟悉,容易和 json 搞混;再就是 if a==b 容易写成 a=b,以后还得多加练习。