import requests
from lxml import etree
def get_html(url, timeout=10):
    """Download a page and return its source decoded as GBK.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The decoded page source, or ``None`` when the request fails, the
        status code is not 200, or the body cannot be decoded as GBK.
    """
    header = {
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:67.0) Gecko/20100101 Firefox/67.0',
    }
    try:
        # requests has no default timeout; without one a stalled connection
        # would block the caller indefinitely.
        response = requests.get(url, headers=header, timeout=timeout)
    except requests.RequestException as e:
        print("请求出现错误,错误类型:%s" % e)
        return None

    if response.status_code != 200:
        print("响应状态码错误!")
        return None

    try:
        return response.content.decode("gbk")
    except UnicodeDecodeError as e:
        print("请求出现错误,错误类型:%s" % e)
        return None
# (suffix, multiplier, divisor) used to turn a salary label into yuan per month.
# The day/hour factors presumably assume 23 working days per month and
# 8 working hours per day -- confirm with the data owner.
_SALARY_UNITS = (
    ('万/月', 10000, 1),
    ('千/月', 1000, 1),
    ('万/年', 10000, 12),
    ('元/天', 23, 1),
    ('元/小时', 8 * 23, 1),
)


def _parse_salary(text):
    """Convert a salary label such as '1-1.5万/月' into monthly (low, high).

    Returns ``(0, 0)`` for an empty label and ``('异常值', '异常值')`` (after
    printing the label) when the unit or the numbers are not recognised.
    """
    label = text.strip()
    if not label:
        return 0, 0
    for suffix, scale, divisor in _SALARY_UNITS:
        if not label.endswith(suffix):
            continue
        bounds = label[:-len(suffix)].split('-')
        if len(bounds) > 2:
            break
        try:
            amounts = [float(bound) * scale / divisor for bound in bounds]
        except ValueError:
            # Non-numeric bound, e.g. an open-ended label with extra text.
            break
        # A single value (e.g. '200元/天') is used as both ends of the range.
        return amounts[0], amounts[-1]
    print("出现异常值:%s" % text)
    return '异常值', '异常值'


def _first_text(row, path):
    """Return the first text node *path* selects under *row*, or ''."""
    found = row.xpath(path)
    return found[0] if found else ''


def parse_html(html):
    """Extract job listings from a search-result page.

    Each listing is read from its own ``div[@class="el"]`` row, so a listing
    that lacks one field cannot shift the columns of the listings after it.

    Args:
        html: Page source; ``None`` or an empty value yields no listings.

    Returns:
        A list of ``[title, company, city, low, high, posted]`` rows, all
        strings. ``low``/``high`` are monthly salaries in yuan, ``'0'`` when
        no salary is given, or ``'异常值'`` when the salary cannot be parsed.
    """
    if not html or not html.strip():
        return []
    tree = etree.HTML(html)
    if tree is None:
        # Defensive: nothing parseable was found in the document.
        return []

    jobs = []
    for row in tree.xpath('/html/body/div[2]/div[4]/div[@class="el"]'):
        titles = row.xpath('./p/span/a/text()')
        if not titles:
            # Rows without a job-title link are not listings.
            continue
        company = _first_text(row, './span[1]/a/text()')
        location = _first_text(row, './span[2]/text()')
        # string() flattens nested markup and gives '' when the span is absent.
        low, high = _parse_salary(row.xpath('string(./span[3])'))
        posted = _first_text(row, './span[4]/text()')
        jobs.append([
            titles[0].strip(),
            company,
            location.split("-")[0],  # keep only the city part of 'city-district'
            str(low),
            str(high),
            posted,
        ])
    return jobs
def save_to_csv(tlist, path="51job数据分析师.csv"):
    """Append rows to a CSV file, quoting fields where necessary.

    Args:
        tlist: Iterable of rows, each a sequence of strings.
        path: Destination file; it is opened in append mode.
    """
    import csv  # local import so this function does not depend on file-level imports

    # csv.writer quotes fields containing commas, quotes or newlines, which a
    # plain ','.join() would silently turn into extra or broken columns.
    # lineterminator='\n' together with the default text-mode newline handling
    # produces the same line endings as writing '...\n' by hand.
    with open(path, 'a', encoding='utf-8-sig') as f:
        csv.writer(f, lineterminator='\n').writerows(tlist)
def main(page_count=100):
    """Crawl the 51job search results for data-analyst jobs into a CSV file.

    Rewrites "51job数据分析师.csv" with a header line, then fetches result
    pages 1..page_count, parses each one and appends its rows to the file.

    Args:
        page_count: Number of result pages to visit.
    """
    with open("51job数据分析师.csv", 'w', encoding='utf-8-sig') as f:
        f.write('职位名称,公司名称,公司地点,最低薪酬,最高薪酬,发布时间\n')

    for i in range(1, page_count + 1):
        # '&degreefrom' is spelled out in full: the '&deg' prefix had been
        # turned into a degree sign, which dropped the parameter separator.
        start_url = (
            'https://search.51job.com/list/000000,000000,0000,00,9,99,'
            '%25E6%2595%25B0%25E6%258D%25AE%25E5%2588%2586%25E6%259E%2590%25E5%25B8%2588,2,{}.html'
            '?lang=c&stype=1&postchannel=0000&workyear=99&cotype=99&degreefrom=99'
            '&jobterm=99&companysize=99&lonlat=0%2C0&radius=-1&ord_field=0'
            '&confirmdate=9&fromType=&dibiaoid=0&address=&line=&specialarea=00'
            '&from=&welfare='
        ).format(i)
        print("正在访问%s页" % i)
        # Step 1: download the page source.
        html = get_html(start_url)
        if html is None:
            # The fetch failed (already reported by get_html); skip this page
            # rather than handing None to the parser.
            continue
        # Step 2: parse the listings.
        info = parse_html(html)
        # Step 3: append them to the CSV file.
        save_to_csv(info)
if __name__ == '__main__':
    # Run the crawl only when executed as a script; per the original note it
    # collects roughly 5000 data-analyst job postings in total.
    main()