1.分析爬虫要采集的url地址,分析采集的数据字段
url地址
"http://sou.zhaopin.com/jobs/searchresult.ashx?jl=%E5%8C%97%E4%BA%AC&kw=%E7%88%AC%E8%99%AB&sm=0&sg=cab76822e6044ff4b4b1a907661851f9&p=1",
"http://sou.zhaopin.com/jobs/searchresult.ashx?jl=%E5%8C%97%E4%BA%AC&kw=%E7%88%AC%E8%99%AB&sm=0&sg=cab76822e6044ff4b4b1a907661851f9&p=2",
"http://sou.zhaopin.com/jobs/searchresult.ashx?jl=%E5%8C%97%E4%BA%AC&kw=%E7%88%AC%E8%99%AB&sm=0&sg=cab76822e6044ff4b4b1a907661851f9&p=3",
"http://sou.zhaopin.com/jobs/searchresult.ashx?jl=%E5%8C%97%E4%BA%AC&kw=%E7%88%AC%E8%99%AB&sm=0&sg=cab76822e6044ff4b4b1a907661851f9&p=4",
"http://sou.zhaopin.com/jobs/searchresult.ashx?jl=%E5%8C%97%E4%BA%AC&kw=%E7%88%AC%E8%99%AB&sm=0&sg=cab76822e6044ff4b4b1a907661851f9&p=5",
"http://sou.zhaopin.com/jobs/searchresult.ashx?jl=%E5%8C%97%E4%BA%AC&kw=%E7%88%AC%E8%99%AB&sm=0&sg=cab76822e6044ff4b4b1a907661851f9&p=6",
"http://sou.zhaopin.com/jobs/searchresult.ashx?jl=%E5%8C%97%E4%BA%AC&kw=%E7%88%AC%E8%99%AB&sm=0&sg=cab76822e6044ff4b4b1a907661851f9&p=7",
"http://sou.zhaopin.com/jobs/searchresult.ashx?jl=%E5%8C%97%E4%BA%AC&kw=%E7%88%AC%E8%99%AB&sm=0&sg=cab76822e6044ff4b4b1a907661851f9&p=8",
数据字段
招聘岗位:job
发布公司:company
薪水待遇:salary
2.定义采集的字段封装的Item类型,在items.py模块中,定义Item类
通过scrapy项目中的items.py模块定义封装采集数据字段的类型
为了能使用scrapy提供的各种内置功能,让定义的类型继承自scrapy.Item类型;类型中的字段属性通过scrapy.Field()进行定义!
items.py
# -*- coding: utf-8 -*-
# Define here the models for your scraped items
#
# See documentation in:
# https://doc.scrapy.org/en/latest/topics/items.html
import scrapy
class MyspiderItem(scrapy.Item):
    """Empty item type; it declares no fields.

    Fields would be added here in the form ``name = scrapy.Field()``.
    """
class ZhilianItem(scrapy.Item):
    """Item that wraps one job posting scraped from the Zhilian recruitment site."""

    # Title of the advertised position.
    job = scrapy.Field()

    # Company that published the posting.
    company = scrapy.Field()

    # Salary offered for the position.
    salary = scrapy.Field()
class NeihanItem(scrapy.Item):
    """Item that wraps data scraped from the Neihan Duanzi site."""

    # The single scraped field of this item type.
    content = scrapy.Field()
3.在spiders/zhilianspider.py中开发爬虫程序,采集初步数据
zhilianspider.py
# coding:utf-8
# 引入需要的scrapy模块
import scrapy
from .. import items
class ZhilianSpider(scrapy.Spider):
    """Spider that collects job postings from Zhilian search-result pages.

    Every URL in ``start_urls`` is downloaded and its response is passed to
    :meth:`parse`, which converts each result row into a ``ZhilianItem``.
    """

    # Name used to launch the spider: ``scrapy crawl zlspider``.
    name = "zlspider"

    # Restrict the crawl to this domain.
    allowed_domains = ['zhaopin.com']

    # Result pages 1-8 of one search; the URLs differ only in the ``p``
    # (page number) query parameter, so they are generated from one template.
    start_urls = tuple(
        "http://sou.zhaopin.com/jobs/searchresult.ashx"
        "?jl=%E5%8C%97%E4%BA%AC&kw=%E7%88%AC%E8%99%AB&sm=0"
        "&sg=cab76822e6044ff4b4b1a907661851f9&p={}".format(page)
        for page in range(1, 9)
    )

    def parse(self, response):
        """Convert each result row of a downloaded page into a ``ZhilianItem``.

        Rows that lack any of the three fields are skipped. Indexing the
        extracted list with ``[0]`` would raise ``IndexError`` on such a row
        and abort processing of every remaining row on the page.

        The yielded items can also be exported directly to a file, e.g.
        ``scrapy crawl zlspider -o job.csv``.

        :param response: the downloaded search-result page
        :return: generator of ``ZhilianItem`` objects, which Scrapy hands to
            the item pipelines
        """
        rows = response.xpath(
            "//div[@id='newlist_list_content_table']/table[position()>1]//tr[1]"
        )
        for row in rows:
            # NOTE(review): ``text()`` returns only the first text node, so a
            # title containing nested markup may be truncated -- confirm
            # against the live page.
            job = row.xpath("td[@class='zwmc']/div/a/text()").extract_first()
            company = row.xpath("td[@class='gsmc']//a/text()").extract_first()
            salary = row.xpath("td[@class='zwyx']/text()").extract_first()

            if job is None or company is None or salary is None:
                self.logger.debug("Skipping incomplete job row on %s", response.url)
                continue

            # Wrap the values in an item and pass it on to the pipelines.
            item = items.ZhilianItem()
            item['job'] = job
            item['company'] = company
            item['salary'] = salary
            yield item
4.核心:在pipelines.py模块中,定义处理Item数据的pipelines,将数据存储到数据库中给其他项目做数据准备
pipelines.py
# -*- coding: utf-8 -*-
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html
# 导入数据库引擎对象
from sqlalchemy import create_engine
# 导入会话构建对象
from sqlalchemy.orm import sessionmaker
# 替换mysqldb模块
import pymysql
pymysql.install_as_MySQLdb()
class MyspiderPipeline(object):
    """Pass-through pipeline that forwards every item unchanged."""

    def process_item(self, item, spider):
        """Hand ``item`` back without modifying it.

        :param item: the item produced by a spider
        :param spider: the spider that produced the item (unused)
        :return: the same ``item`` object
        """
        return item
class ZhilianPipeline(object):
    """Pipeline that stores scraped Zhilian job items in the ``job`` table."""

    def __init__(self):
        """Open the database session and prepare the reusable INSERT statement."""
        # Local import: the file's import block does not bring ``text`` in.
        from sqlalchemy import text

        self.engine = create_engine("mysql://root:root@localhost/hw_0116_spider?charset=utf8")
        Session = sessionmaker(bind=self.engine)
        self.session = Session()
        # Bound parameters (:job, ...) let the driver do the quoting, so
        # scraped text containing quotes can neither break nor inject into
        # the statement.
        self._insert_job = text(
            "insert into job(job,company,salary) values(:job, :company, :salary)"
        )

    def open_spider(self, spider):
        """Hook called when the spider is opened; nothing to initialise here.

        :param spider: the spider being opened
        :return: None
        """

    def close_spider(self, spider):
        """Release the database resources when the spider is closed.

        :param spider: the spider being closed
        :return: None
        """
        self.session.close()
        # Also close the pooled connections held by the engine.
        self.engine.dispose()

    def process_item(self, item, spider):
        """Insert one scraped item into the ``job`` table.

        :param item: mapping with the keys ``job``, ``company`` and ``salary``
        :param spider: the spider that produced the item (unused)
        :return: the same ``item``, so later pipelines still receive it
        :raises Exception: any database error is re-raised after the
            transaction has been rolled back
        """
        params = {
            "job": item['job'],
            "company": item['company'],
            "salary": item['salary'],
        }
        try:
            self.session.execute(self._insert_job, params)
            self.session.commit()
        except Exception:
            # Roll back so the session stays usable for the next item.
            self.session.rollback()
            raise
        return item
在settings.py文件中进行如下操作
解开如下代码的注释(67行左右),并做出如下改动
# Only the Zhilian pipeline is enabled; the default
# 'myspider.pipelines.MyspiderPipeline' entry stays disabled.
# The integer sets the order in which enabled pipelines run (lower runs first).
ITEM_PIPELINES = {'myspider.pipelines.ZhilianPipeline': 300}
5.运行
至此,简单爬虫项目已经完成,在当前项目的根目录cmd命令窗口输入如下命令:
scrapy crawl zlspider
注意:命令中的zlspider就是在zhilianspider.py文件中定义的 name = "zlspider"
保存到文件:通过如下命令可将数据保存到指定的CSV文件中(CSV文件可以用Excel打开)
scrapy crawl zlspider -o job.csv
支持的文件格式有'xml', 'jsonlines', 'jl', 'json', 'csv', 'pickle', 'marshal'
6.扩展深度爬虫(给一个初始页面,按页码自动爬取所有页面中的数据)
# coding:utf-8
"""
智联招聘 2
"""
# 引入需要的模块
import scrapy
from ..items import ZhilianItem
class ZhilianSpider(scrapy.Spider):
    """Spider that starts from one Zhilian result page and follows its pagination links."""

    # Name used to launch the spider: ``scrapy crawl zl2``.
    name = 'zl2'

    # Restrict the crawl to this domain.
    allowed_domains = ['zhaopin.com']

    # Single entry page; further pages are discovered in parse_response.
    start_urls = [
        "http://sou.zhaopin.com/jobs/searchresult.ashx?jl=%E5%8C%97%E4%BA%AC%2b%E4%B8%8A%E6%B5%B7&kw=django&isadv=0&sg=f13fbf9d2c2b4208bdc4bafd8ddda7dc&p=1"
    ]

    def parse(self, response):
        """Request the start URL again with :meth:`parse_response` as callback.

        :param response: response for the start URL (its body is not read here)
        :return: generator yielding one request for the start URL
        """
        # NOTE(review): this downloads the first page a second time;
        # presumably intended so the start URL is handled by the same
        # callback as the paginated requests -- confirm before simplifying.
        url = response.urljoin(self.start_urls[0])
        yield scrapy.Request(url, callback=self.parse_response)

    def parse_response(self, response):
        """Yield the job items of one result page, then requests for linked pages.

        Rows that lack any of the three fields are skipped. Indexing the
        extracted list with ``[0]`` would raise ``IndexError`` on such a row
        and stop this generator before the pagination requests below are
        yielded.

        :param response: a downloaded search-result page
        :return: generator of ``ZhilianItem`` objects followed by
            ``scrapy.Request`` objects for the pagination links
        """
        # One row per job posting.
        rows = response.xpath(
            "//div[@id='newlist_list_content_table']/table[position()>1]/tr[1]"
        )
        for row in rows:
            # string(.) concatenates all descendant text of the selected node.
            job = row.xpath("td[@class='zwmc']/div/a").xpath("string(.)").extract_first()
            company = row.xpath("td[@class='gsmc']/a").xpath("string(.)").extract_first()
            salary = row.xpath("td[@class='zwyx']").xpath("string(.)").extract_first()

            if job is None or company is None or salary is None:
                self.logger.debug("Skipping incomplete job row on %s", response.url)
                continue

            # Wrap the values in an item and pass it on to the pipelines.
            item = ZhilianItem()
            item['job'] = job
            item['company'] = company
            item['salary'] = salary
            yield item

        # Schedule every page linked from the pagination bar; hrefs are
        # resolved against the current page URL.
        page_links = response.xpath("//div[@class='pagesDown']/ul/li/a/@href").extract()
        for href in page_links:
            yield scrapy.Request(response.urljoin(href), callback=self.parse_response)