51job's (前程无忧) anti-crawling measures are mild, which makes it a good beginner project; the full source code is attached. The techniques involved are requests.get to send the requests, lxml.etree to parse the page, and Selenium, which is used mainly to read the total page count so that the job keyword and page range can be chosen freely. Selenium does require some setup; you can refer to the blog post linked below, and if that doesn't solve it, leave a comment. I hope this example helps you.
抓取花卉图片_AI-阿强的博客-CSDN博客
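Before running the full script, it is worth checking that Selenium is configured correctly. Here is a minimal self-check sketch in the Selenium 4 style; the chromedriver path is a placeholder, so download a driver that matches your local Chrome version and point Service at it (or put it on your PATH and drop the argument):

from selenium import webdriver
from selenium.webdriver.chrome.service import Service

service = Service('C:/tools/chromedriver.exe')  # hypothetical path; use your own
driver = webdriver.Chrome(service=service)
driver.get('https://search.51job.com/')
print(driver.title)  # if a title prints, the driver is wired up correctly
driver.quit()

If this snippet opens a Chrome window and prints the page title, the crawler below should be able to drive the browser as well.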
import requests
from lxml import etree
import json
import time
import pandas as pd
from selenium import webdriver
from selenium.webdriver.common.by import By
# Company names
company_list = []
# Job titles
job_list = []
# Work locations
workarea_list = []
# Benefits
jobwelf_list = []
# Salaries
providesalary_list = []
# Posting dates
updatedate_list = []
# Links
href_list = []
# Education and experience requirements
attribute_list = []
def number_pages():
    # Launch a browser
    driver = webdriver.Chrome()
    driver.get(
        'https://search.51job.com/list/030200,000000,0000,00,9,99,{},2,1.html?lang=c&postchannel=0000&workyear=99&cotype=99'.format(work_name))
    driver.maximize_window()
    # Read the total page count shown in the pager
    pagecount = driver.find_element(
        By.XPATH,
        '//div[@class="j_page"]/div[@class="p_box"]/div[@class="p_wp"]/div[@class="p_in"]/span[@class="td"]').text
    print('Job listings: ' + pagecount)
    driver.quit()
def response(page_num):
    # url = 'https://search.51job.com/list/040000,000000,0000,00,9,99,{},2,{}.html?lang=c&postchannel=0000&workyear=99&cotype=99'.format(work_name, page_num)
    url = 'https://search.51job.com/list/030200,000000,0000,00,9,99,{},2,{}.html?lang=c&postchannel=0000&workyear=99&cotype=99'.format(work_name, page_num)
    header = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.77 Safari/537.36 Edg/91.0.864.41'
    }
    rp = requests.get(url=url, headers=header).text
    # print(rp)
    print('-' * 100)
    html = etree.HTML(rp)
    # 51job embeds the search results as a JSON blob assigned to
    # window.__SEARCH_RESULT__ inside the second <script> tag
    xp = html.xpath('/html/body/script[2]/text()')[0]
    jsondata = json.loads(str(xp).replace('window.__SEARCH_RESULT__ = ', ''))
    work_data = jsondata['engine_search_result']
    # print(work_data[1])
    return work_data
# Append the scraped data to the lists, substituting 'null' for empty fields
def data_extraction(work):
    for data in work:
        if data['job_href'] == '':
            href_list.append('null')
        else:
            href_list.append(data['job_href'])
        if data['company_name'] == '':
            company_list.append('null')
        else:
            company_list.append(data['company_name'])
        if data['job_name'] == '':
            job_list.append('null')
        else:
            job_list.append(data['job_name'])  # job title
        if data['workarea_text'] == '':
            workarea_list.append('null')
        else:
            workarea_list.append(data['workarea_text'])
        if data['jobwelf'] == '':
            jobwelf_list.append('null')
        else:
            jobwelf_list.append(data['jobwelf'])
        if data['providesalary_text'] == '':
            providesalary_list.append('null')
        else:
            providesalary_list.append(data['providesalary_text'])
        if data['updatedate'] == '':
            updatedate_list.append('null')
        else:
            updatedate_list.append(data['updatedate'])
        if data['attribute_text'] == '':
            attribute_list.append('null')
        else:
            attribute_list.append(data['attribute_text'])
def crawl_loop():
    for page_num in range(st, end):
        # Fetch and extract the data for this page
        work = response(page_num)
        data_extraction(work)
        # Save everything collected so far
        keep_data()
        print('Finished crawling page {}'.format(page_num))
        time.sleep(3)
def keep_data():
    # Build the table
    df = pd.DataFrame()
    df['Company'] = company_list
    df['Job title'] = job_list
    df['Benefits'] = jobwelf_list
    df['Location'] = workarea_list
    df['Experience & education'] = attribute_list
    df['Salary'] = providesalary_list
    df['Posted'] = updatedate_list
    df['Link'] = href_list
    df.to_excel('Guangzhou_{}_job_listings.xlsx'.format(work_name))
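# Note: df.to_excel() needs an Excel writer engine installed, e.g. openpyxl
# (pip install openpyxl); without one, pandas raises an ImportError.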
if __name__ == '__main__':
    print("———————— Welcome to A-Qiang's crawler ————————")
    work_name = input('Enter a job keyword: ')
    print('—————— Opening the job-listing page ——————')
    time.sleep(2)
    number_pages()
    st = int(input('Enter the first page to crawl: '))
    end = int(input('Enter the last page to crawl: ')) + 1
    # Enter the crawl loop
    crawl_loop()
    print("———————— Crawl finished, thanks for using A-Qiang's crawler ————————")
    print('———————— Closing automatically in 30 seconds ————————')
    time.sleep(30)
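A closing side note: Selenium is only used here to read the page count, and the embedded JSON may already carry that number, which would remove the browser dependency entirely. Below is a minimal sketch under that assumption; the total_page field name is an assumption on my part, so verify it against a live response before relying on it:

import requests
import json
from lxml import etree

def number_pages_no_browser(work_name):
    # Hypothetical Selenium-free variant of number_pages(): parse the same
    # window.__SEARCH_RESULT__ blob that response() uses and read the page
    # count from it. 'total_page' is an assumed field name; verify it exists.
    url = 'https://search.51job.com/list/030200,000000,0000,00,9,99,{},2,1.html?lang=c&postchannel=0000&workyear=99&cotype=99'.format(work_name)
    header = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.77 Safari/537.36'}
    rp = requests.get(url=url, headers=header).text
    xp = etree.HTML(rp).xpath('/html/body/script[2]/text()')[0]
    jsondata = json.loads(str(xp).replace('window.__SEARCH_RESULT__ = ', ''))
    return jsondata.get('total_page', 'unknown')

If the field is there, calling number_pages_no_browser(work_name) gives you the same number without launching Chrome at all.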