The methods used to scrape the 51job site were all covered in the previous three articles; refer to them if you need a refresher:
"Scraping a Job Site with Python, Day 1" — the Selenium browser-automation framework
"Scraping a Job Site with Python, Day 2" — the requests and BeautifulSoup libraries
"Scraping a Job Site with Python, Day 3" — anti-scraping measures and the summary table
Without further ado, here is the full code!
from selenium import webdriver
from bs4 import BeautifulSoup
import csv
import time
import re

# Fetch and parse the page source, then pull the target text out by tag
def getsave_infomation(browser, csv_writer):
    soup = BeautifulSoup(browser.page_source, 'html.parser')
    items = soup.find_all(class_='j_joblist')
    # print(items)  # page source
    for tag in items:
        name = tag.find_all('span', class_='jname at')
        date = tag.find_all('span', class_='time')
        area = tag.find_all('span', class_='d at')
        salary = tag.find_all('span', class_='sal')
        for i, j, k, l in zip(name, date, area, salary):
            name1 = i.get_text()
            date1 = j.get_text()
            area1 = k.get_text()
            arealist = chai(area1)
            n = len(arealist)
            area2, experience, grade, num = get_area(arealist, n)
            salary1 = l.get_text()
            # Write one row into the CSV file
            row = [name1, date1, salaryCleaning(salary1), area2, experience, grade, num]
            csv_writer.writerow(row)
            print(name1, date1, salaryCleaning(salary1), area2, experience, grade, num)

# Open and close the browser, saving the scraped data into a CSV file
def get_page():
    browser = webdriver.Chrome()
    with open('C++岗位分析.csv', 'w', newline='') as file:
        csv_writer = csv.writer(file)
        header = ['岗位', '发布时间', '薪水(K/月)', '工作地区', '工作经验', '学历要求', '招聘人数']
        csv_writer.writerow(header)
        for k in range(1, 50):
            url = 'https://search.51job.com/list/020000,000000,0000,00,9,99,C%252B%252B,2,{}.html?lang=c&postchannel=0000&workyear=99&cotype=99&degreefrom=99&jobterm=99&companysize=99&ord_field=0&dibiaoid=0&line=&welfare='.format(k)
            time.sleep(2)
            browser.get(url)
            getsave_infomation(browser, csv_writer)
    browser.quit()

# Split the area string with a regular expression
def chai(s):
    if '|' in s:
        newloc = re.compile('[| ]+')
        return newloc.split(s)
    return [s]

# Fill the matching columns according to how many fields the string split into
def get_area(arealist, k):
    area2 = ''
    experience = ''
    grade = ''
    num = ''
    if k == 3 and any('经验' in x for x in arealist):
        area2, experience, num = arealist[0], arealist[1], arealist[2]
    elif k == 3 and any(g in x for x in arealist for g in ('大专', '本科', '硕士')):
        area2, grade, num = arealist[0], arealist[1], arealist[2]
    elif k == 4:
        area2, experience, grade, num = arealist
    return area2, experience, grade, num

# Clean the salary field, converting everything to thousands of RMB per month
def salaryCleaning(salary):
    avgSalary = 0
    numbers = re.findall(r'(\d*\.?\d+)', salary)
    if '-' in salary and len(numbers) >= 2:
        minSalary, maxSalary = float(numbers[0]), float(numbers[1])
        if '万' in salary and '年' in salary:
            minSalary = minSalary / 12 * 10
            maxSalary = maxSalary / 12 * 10
        elif '万' in salary and '月' in salary:
            minSalary = minSalary * 10
            maxSalary = maxSalary * 10
        avgSalary = (minSalary + maxSalary) / 2
    elif numbers:
        minSalary = float(numbers[0])
        if '万' in salary and '年' in salary:
            minSalary = minSalary / 12 * 10
        elif '万' in salary and '月' in salary:
            minSalary = minSalary * 10
        elif '元' in salary and '天' in salary:
            minSalary = minSalary / 1000 * 21
        avgSalary = minSalary
    else:
        avgSalary = ''  # no number found (e.g. salary negotiable)
    return avgSalary

def main():
    get_page()

if __name__ == '__main__':
    main()
from selenium import webdriver  # import the webdriver from Selenium
from bs4 import BeautifulSoup   # import BeautifulSoup from bs4
import csv   # save the results in CSV format
import time  # for the sleep() delay between requests
import re    # the regular-expression module
If we want the data from every page of the site, we have to jump between pages, yet everything we learned so far fetched data from a single page without ever changing its URL. So how do we proceed? In fact, page URLs follow a pattern, and once we spot the pattern the difficulty resolves itself.
Open the 51job search page and watch what changes in the URL each time you move to the next page. Page link: 51job求职网
You will find that only one parameter in the URL changes from page to page. And it is not just this site: on most websites the links of adjacent pages differ by a single parameter, because pagination always works the same way, with the page number and the number of items per page passed in as parameters. Comparing the link parameters of a few different pages is enough to work out the rule.
That being so, all we need is to drop the page number into the URL with format() and open each page in turn with a for loop.
Code ↓
browser = webdriver.Chrome()  # open a Chrome browser
for k in range(1, 50):
    url = 'https://search.51job.com/list/020000,000000,0000,00,9,99,C%252B%252B,2,{}.html?lang=c&postchannel=0000&workyear=99&cotype=99&degreefrom=99&jobterm=99&companysize=99&ord_field=0&dibiaoid=0&line=&welfare='.format(k)
    time.sleep(2)  # sleep for 2 s
    browser.get(url)  # open the page
browser.quit()  # close the browser
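If you want to convince yourself that only the page number changes, here is a minimal sketch that just builds and prints the first three URLs side by side, without touching the network:

# Minimal check: generate the first three URLs and compare them by eye
base = ('https://search.51job.com/list/020000,000000,0000,00,9,99,C%252B%252B,2,{}.html'
        '?lang=c&postchannel=0000&workyear=99&cotype=99&degreefrom=99'
        '&jobterm=99&companysize=99&ord_field=0&dibiaoid=0&line=&welfare=')
for k in range(1, 4):
    print(base.format(k))  # only the {} slot (the page number) differs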
Code ↓
def get_page():
    browser = webdriver.Chrome()  # open a Chrome browser
    for k in range(1, 50):
        url = 'https://search.51job.com/list/020000,000000,0000,00,9,99,C%252B%252B,2,{}.html?lang=c&postchannel=0000&workyear=99&cotype=99&degreefrom=99&jobterm=99&companysize=99&ord_field=0&dibiaoid=0&line=&welfare='.format(k)
        time.sleep(2)  # sleep for 2 s
        browser.get(url)  # open the page
        getsave_infomation(browser, csv_writer)  # call getsave_infomation() once per loop
        # (csv_writer is created when the CSV file is opened; see the full listing above)
    browser.quit()  # close the browser
In the getsave_infomation() call, the first argument (browser) is the browser object we instantiated, through which we drive the browser; the second argument (csv_writer) is the writer for the CSV file we created.
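Since the snippet above does not show where csv_writer comes from, here is the corresponding setup in isolation, mirroring the full listing at the top (same file name and header):

import csv

# Open the output file and create the writer that getsave_infomation() receives
with open('C++岗位分析.csv', 'w', newline='') as file:
    csv_writer = csv.writer(file)
    header = ['岗位', '发布时间', '薪水(K/月)', '工作地区', '工作经验', '学历要求', '招聘人数']
    csv_writer.writerow(header)  # the header becomes the first row of the CSV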
Taking a search for C++ positions as our example: once on the first page, right-click the C/C++研发工程师 and C++软件工程师 postings in turn and choose "Inspect".
Screenshot of the page and its source ↓
We find that the job title, posting date, salary, and work location all sit under the div tag with class="j_joblist",
while the title, date, salary, and location live in span tags with class='jname at', class='time', class='sal', and class='d at' respectively.
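If you want to sanity-check those selectors before running the crawler, here is a minimal self-contained sketch; the HTML fragment is made up, and only the tag and class names are taken from the real page:

from bs4 import BeautifulSoup

# A made-up fragment mimicking the structure of the 51job listing page
html = '''
<div class="j_joblist">
  <span class="jname at">C++软件工程师</span>
  <span class="time">06-18</span>
  <span class="sal">1-1.5万/月</span>
  <span class="d at">上海-浦东新区 | 3-4年经验 | 本科 | 招2人</span>
</div>
'''
soup = BeautifulSoup(html, 'html.parser')
for item in soup.find_all(class_='j_joblist'):
    print(item.find_all('span', class_='jname at')[0].get_text())  # C++软件工程师
    print(item.find_all('span', class_='sal')[0].get_text())       # 1-1.5万/月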
Code ↓
def getsave_infomation(browser, csv_writer):
    soup = BeautifulSoup(browser.page_source, 'html.parser')
    items = soup.find_all(class_='j_joblist')
    # print(items)  # page source
    for tag in items:
        name = tag.find_all('span', class_='jname at')
        date = tag.find_all('span', class_='time')
        area = tag.find_all('span', class_='d at')
        salary = tag.find_all('span', class_='sal')
With the find_all() calls above we have collected the four fields we need into the name, date, area, and salary lists. Next we need one more for loop that walks the four lists in parallel, pulls out the matching values, and writes them into the CSV file (a toy sketch of how zip() does this follows the code below).
Code ↓
def getsave_infomation(browser, csv_writer):
    soup = BeautifulSoup(browser.page_source, 'html.parser')
    items = soup.find_all(class_='j_joblist')
    # print(items)  # page source
    for tag in items:
        name = tag.find_all('span', class_='jname at')
        date = tag.find_all('span', class_='time')
        area = tag.find_all('span', class_='d at')
        salary = tag.find_all('span', class_='sal')
        for i, j, k, l in zip(name, date, area, salary):
            name1 = i.get_text()
            date1 = j.get_text()
            area1 = k.get_text()
            salary1 = l.get_text()
            # Write one row into the CSV file
            row = [name1, date1, salary1, area1]
            csv_writer.writerow(row)
            print(name1, date1, salary1, area1)
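As promised, a toy sketch of the zip() step with made-up values; zip() yields one tuple per position, stopping at the shortest list, so each job's four fields stay aligned:

name = ['C++软件工程师', 'C/C++研发工程师']
date = ['06-18', '06-17']
area = ['上海-浦东新区 | 3-4年经验 | 本科 | 招2人', '上海 | 5-7年经验 | 本科 | 招1人']
salary = ['1-1.5万/月', '1.5-2万/月']
for i, j, k, l in zip(name, date, area, salary):
    print(i, j, k, l)  # one aligned row per job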
Screenshot of the results ↓
At this point most of the scraping work is done; all that remains is cleaning the salary and location data and splitting it into the format we need.
I won't go through this part in detail; it uses regular expressions, type conversion, and so on, which you can read up on if they are unfamiliar.
Data-cleaning code ↓
# Split the area string with a regular expression
def chai(s):
    if '|' in s:
        newloc = re.compile('[| ]+')
        return newloc.split(s)
    return [s]
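For example (input values made up, but in the shape the page actually serves):

print(chai('上海-浦东新区 | 3-4年经验 | 本科 | 招2人'))
# ['上海-浦东新区', '3-4年经验', '本科', '招2人']
print(chai('异地招聘'))
# ['异地招聘'] -- no '|' separator, so the whole string is kept as one field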
# Fill the matching columns according to how many fields the string split into
def get_area(arealist, k):
    area2 = ''
    experience = ''
    grade = ''
    num = ''
    if k == 3 and any('经验' in x for x in arealist):
        area2, experience, num = arealist[0], arealist[1], arealist[2]
    elif k == 3 and any(g in x for x in arealist for g in ('大专', '本科', '硕士')):
        area2, grade, num = arealist[0], arealist[1], arealist[2]
    elif k == 4:
        area2, experience, grade, num = arealist
    return area2, experience, grade, num
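A quick check with made-up input: a three-field split containing 经验 has no degree column, so grade stays empty, while a four-field split fills every column:

print(get_area(['上海', '3-4年经验', '招2人'], 3))
# ('上海', '3-4年经验', '', '招2人')
print(get_area(['上海-浦东新区', '3-4年经验', '本科', '招2人'], 4))
# ('上海-浦东新区', '3-4年经验', '本科', '招2人')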
# Clean the salary field, converting everything to thousands of RMB per month
def salaryCleaning(salary):
    avgSalary = 0
    numbers = re.findall(r'(\d*\.?\d+)', salary)
    if '-' in salary and len(numbers) >= 2:
        minSalary, maxSalary = float(numbers[0]), float(numbers[1])
        if '万' in salary and '年' in salary:
            minSalary = minSalary / 12 * 10
            maxSalary = maxSalary / 12 * 10
        elif '万' in salary and '月' in salary:
            minSalary = minSalary * 10
            maxSalary = maxSalary * 10
        avgSalary = (minSalary + maxSalary) / 2
    elif numbers:
        minSalary = float(numbers[0])
        if '万' in salary and '年' in salary:
            minSalary = minSalary / 12 * 10
        elif '万' in salary and '月' in salary:
            minSalary = minSalary * 10
        elif '元' in salary and '天' in salary:
            minSalary = minSalary / 1000 * 21
        avgSalary = minSalary
    else:
        avgSalary = ''  # no number found (e.g. salary negotiable)
    return avgSalary
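To verify the unit conversion with made-up values: '1-1.5万/月' is 10 to 15 K per month, so the average is 12.5; '24-30万/年' converts to 24/12×10 = 20 K and 30/12×10 = 25 K per month, averaging 22.5:

print(salaryCleaning('1-1.5万/月'))   # (10 + 15) / 2 = 12.5
print(salaryCleaning('24-30万/年'))  # (20.0 + 25.0) / 2 = 22.5
print(salaryCleaning('150元/天'))    # 150 / 1000 * 21 = 3.15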
And with that, scraping 51job is complete!
These four articles are my first, so there are bound to be rough edges. If you spot a mistake, please point it out in the comments and I will fix it promptly. I plan to publish one more Python article, also a worked example, so keep an eye out if you are interested.
If these four articles helped you, a like or a bookmark would be appreciated.
That wraps up this share. Thank you all, and see you next time!