声明:此博客爬取的数据只为学习爬虫使用,绝非广告
文件目录
├── Zhipin_spider # 文件夹
│ ├── spider_main.py # 调度器。是爬虫的入口,管理各个类
│ ├── html_downloader.py # 下载器,负责网页内容的下载
│ ├── html_parser.py # 解析器,负责解析数据,获取有价值的数据
│ └── html_outputer.py # 输出,将获取到的数据输出
│
使用依赖
- Requests:请求数据
- BeautifulSoup:解析数据
- xlsxwriter:将数据保存到 Excel 文件中
框架流程:调度器 –> 网页下载器 –> 网页解析器 –> 输出
用 Requests 框架请求数据,设置 header 模拟浏览器访问,用 try...except... 捕捉异常。
网页下载器核心代码
# -*- coding: utf-8 -*-
# @Time : 2018/1/7 上午11:46
# @Author : Mazy
# @File : html_downloader.py
# @Software: PyCharm
import requests
import ssl
class HtmlDownloader(object):
    """Downloads job-listing result pages from Boss Zhipin (zhipin.com)."""

    # Fetch one result page identified by base URL + page number + keyword.
    def get_page(self, baseUrl, page_num, keyword, timeout=10):
        """Return the page HTML as text, or None if the request fails.

        :param baseUrl: base URL of the job-detail endpoint
        :param page_num: 1-based result-page index
        :param keyword: search keyword (job title etc.)
        :param timeout: seconds before the request is aborted; new parameter
            with a default, so existing callers are unaffected
        """
        try:
            # city 101010100 is the zhipin.com region code used by this spider
            param = {"query": keyword, "city": "101010100", "page": page_num}
            # Request headers that mimic a desktop browser visit
            header = {
                'User-Agent': r'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko)'
                              r'Chrome/45.0.2454.85 Safari/537.36 115Browser/6.0.3',
                'Referer': r'http://www.zhipin.com/job_detail/',
                'Connection': 'keep-alive'
            }
            # Bug fix: a timeout keeps the crawl from hanging forever on a
            # dead connection (requests has no default timeout).
            result = requests.get(baseUrl, params=param, headers=header, timeout=timeout)
            return result.text
        except Exception as err:
            # Best-effort crawl: log the error and signal failure with None
            # instead of aborting the whole run.
            print(err)
            print("Boss直聘爬取失败")
            return None
用 BeautifulSoup 提取数据。
网页解析器核心代码
# -*- coding: utf-8 -*-
# @Time : 2018/1/7 上午11:47
# @Author : Mazy
# @File : html_parser.py
# @Software: PyCharm
from bs4 import BeautifulSoup
class HtmlParser(object):
    """Parses Boss Zhipin search-result HTML into per-company records."""

    def parse(self, html_content):
        """Parse one result page into a list of records.

        :param html_content: raw HTML text, or None when the download failed
        :return: list of [company, job, salary, requirement, description]
            rows; [] when the expected job list is absent from the page;
            None when html_content is None (matches the original contract).
        """
        if html_content is None:
            return None
        soup = BeautifulSoup(html_content, 'html.parser')
        # Listings live in <div class="job-list"><ul><li>...</li></ul></div>
        job_list = soup.find('div', class_='job-list')
        if job_list is None:
            # Robustness fix: a layout change or anti-crawler page used to
            # raise AttributeError here; return an empty page instead.
            return []
        return [self.get_one_company(item) for item in job_list.select('ul > li')]

    def get_one_company(self, soup):
        """Extract one company/job record from a single <li> element."""
        company_soup = soup.find('div', class_="info-company")
        # company name and short description
        com_name = company_soup.find('a').text
        com_desc = company_soup.find('p').text
        primary_soup = soup.find('div', class_="info-primary")
        # Markup: <h3 class="name"><a>job title<span>salary</span></a></h3>;
        # hoist the <a> lookup instead of repeating it three times.
        title_link = primary_soup.find('h3', class_="name").a
        # salary
        salary = title_link.span.text
        # job name = full link text with the salary span stripped out
        job = title_link.text.replace(salary, "")
        # company requirement (degree / experience line)
        job_require = primary_soup.find('p').text
        return [com_name, job, salary, job_require, com_desc]
使用 xlwt 的 Workbook 将数据保存到 Excel,
或使用 xlsxwriter 框架将数据保存到 Excel。
输出核心代码
# -*- coding: utf-8 -*-
# @Time : 2018/1/7 上午11:47
# @Author : Mazy
# @File : html_outputer.py
# @Software: PyCharm
from xlwt import Workbook, Worksheet
import xlsxwriter
class HtmlOutputer(object):
    """Writes the scraped rows to an Excel file on the desktop."""

    # Save the data to Excel with xlwt (.xls format).
    def save_to_excel(self, results, tag_name, file_name):
        """Write a header row (tag_name) followed by one row per result.

        :param results: list of row lists, each aligned with tag_name
        :param tag_name: column header strings
        :param file_name: output file name without extension
        """
        book = Workbook(encoding="utf-8")
        sheet = book.add_sheet('sheet')
        # Row 0: headers.  enumerate replaces tag_name.index(...), which was
        # O(n) per column and wrong when two headers share the same text.
        for col, title in enumerate(tag_name):
            sheet.write(0, col, title)
        # Rows 1..n: one row per scraped record
        for row, record in enumerate(results, start=1):
            for col in range(len(tag_name)):
                sheet.write(row, col, str(record[col]))
        book.save(r'/Users/bai/Desktop/%s.xls' % file_name)

    # Save the data to Excel with xlsxwriter.
    def save_to_excel_other_way(self, results, tag_names, file_name):
        """Same layout as save_to_excel, produced with xlsxwriter.

        Bug fix: the old loop ran i over 1..len(results) and wrote
        results[i-1] only for i >= 2, so results[0] was silently dropped
        from the output.  Every record is now written.
        """
        # NOTE(review): xlsxwriter emits xlsx content; the ".xls" extension is
        # kept for backward compatibility, but ".xlsx" would be correct.
        book = xlsxwriter.Workbook(r'/Users/bai/Desktop/%s.xls' % file_name)
        sheet = book.add_worksheet()
        # Row 0: headers; rows 1..n: one row per record, none skipped.
        sheet.write_row(0, 0, tag_names)
        for row, record in enumerate(results, start=1):
            sheet.write_row(row, 0, record)
        book.close()
调度器核心代码
# -*- coding: utf-8 -*-
# @Time : 2018/1/7 上午11:47
# @Author : Mazy
# @File : spider_main.py
# @Software: PyCharm
from Zhipin_spider import html_downloader, html_parser, html_outputer
import time
import random
class BOSS_Main(object):
    """Scheduler: wires downloader -> parser -> outputer together."""

    def __init__(self):
        self.downloader = html_downloader.HtmlDownloader()
        self.parser = html_parser.HtmlParser()
        self.outputer = html_outputer.HtmlOutputer()

    def start(self, keyword, baseURL, page_count):
        """Crawl pages 1..page_count for keyword and save the rows to Excel.

        :param keyword: search keyword entered by the user
        :param baseURL: job-detail endpoint of zhipin.com
        :param page_count: number of result pages to fetch
        """
        all_coms = []
        for i in range(1, page_count + 1):
            # Random delay so the request rate looks less bot-like
            time.sleep(random.uniform(1, 5))
            # Download the page HTML (None when the request failed)
            content = self.downloader.get_page(baseURL, i, keyword)
            # Parse the listing rows out of the HTML
            com_results = self.parser.parse(content)
            # Bug fix: parse() returns None after a failed download, and the
            # old code crashed on len(None); skip such pages instead.
            if not com_results:
                continue
            print("正在抓取第 %d 页数据, 有 %d 条数据" % (i, len(com_results)))
            # Collect this page's rows
            all_coms.extend(com_results)
        print(len(all_coms))
        print(all_coms)
        tag_name = ['公司名称', '职位名称', '工资', '所需学历', '公司介绍']
        # Alternative writer: self.outputer.save_to_excel(all_coms, tag_name, "test")
        # Write everything to Excel
        self.outputer.save_to_excel_other_way(all_coms, tag_name, "boss1")
# 程序的入口
if __name__ == "__main__":
    # Entry point: collect the search parameters interactively, then crawl.
    search_word = input('请输入抓取的关键词:\n')
    total_pages = input("请输入抓取总页数:\n")
    start_url = "http://www.zhipin.com/job_detail/"
    spider = BOSS_Main()
    spider.start(search_word, start_url, int(total_pages))
Zhipin_spider