Disclaimer: this article was written purely for learning how to build crawlers. Please do not use it commercially or to attack websites maliciously. The author reserves the right of final interpretation.
Because the listings on Zhilian Zhaopin are loaded dynamically, the site is relatively hard for beginners to scrape; the key is locating the exact request URL behind the page.
Note: the first three fields (position, company name, salary) come from the same API request, while the address field is taken from the detail page.
Result: data saved to MySQL
Result: data saved to a JSON file
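Since the whole point is finding the right request URL, it can help to verify it in a quick standalone script before writing the spider. A minimal sketch, assuming the requests package is installed; the URL and parameters mirror the spider below:

# Quick standalone check of the listing API (assumes the `requests` package is
# installed); the URL and parameters mirror the spider below.
import requests
from urllib.parse import urlencode

params = {"start": 0, "pageSize": 90, "cityId": 538, "kw": "python", "kt": 3}
url = "https://fe-api.zhaopin.com/c/i/sou?" + urlencode(params)
headers = {"User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 "
                         "(KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36"}
resp = requests.get(url, headers=headers)
# If the URL is right, the job list sits under data -> results in the JSON body
print(resp.json()["data"]["results"][0]["jobName"])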
The spider file
# -*- coding: utf-8 -*-
import scrapy
import json
from zlzp.items import ZlzpItem
# urlencode builds a GET query string such as a=1&b=2&c=3
from urllib.parse import urlencode


class ZlspiderSpider(scrapy.Spider):
    name = 'zlspider'
    # allowed_domains = ['sou.zhaopin.com']
    start_urls = ['https://fe-api.zhaopin.com/c/i/sou?']

    def start_requests(self):
        # page through the listing API in steps of 90 (the page size)
        for i in range(0, 1001, 90):
            data = {
                "start": i,
                "pageSize": 90,
                "cityId": 538,
                "kw": "python",
                "kt": 3
            }
            for url in self.start_urls:
                new_url = url + urlencode(data)
                yield scrapy.Request(url=new_url, callback=self.parse)

    def parse(self, response):
        data = json.loads(response.text)
        results = data["data"]["results"]
        for i in results:
            # one item instance per job posting
            item = ZlzpItem()
            item["position"] = i["jobName"]
            item["name"] = i["company"]["name"]
            item["salary"] = i["salary"]
            # follow the link to the detail page
            positionURL = i["positionURL"]
            yield scrapy.Request(url=positionURL, callback=self.detail, meta={"item": item})

    # detail page: extract the address, then hand the item to the pipelines
    def detail(self, response):
        item = response.meta["item"]
        address = response.xpath('//div[@class="job-address"]/div/span/text()').get()
        item["address"] = address
        yield item
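For reference, the parse and detail callbacks above only work if the API response has roughly the following shape. This is a sketch reconstructed from the fields the spider actually reads; the real payload contains many more keys:

# Shape of the API response assumed by parse()/detail() above (reconstructed
# from the fields the spider reads; the real payload has many more keys).
example_response = {
    "data": {
        "results": [
            {
                "jobName": "Python开发工程师",
                "company": {"name": "某科技有限公司"},
                "salary": "10K-20K",
                "positionURL": "https://jobs.zhaopin.com/....htm",
            },
        ]
    }
}

Once the items, settings and pipelines below are in place, the project is run with scrapy crawl zlspider.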
The items.py file
import scrapy


class ZlzpItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    position = scrapy.Field()
    name = scrapy.Field()
    salary = scrapy.Field()
    address = scrapy.Field()
The settings.py file
USER_AGENT = "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36"

# Obey robots.txt rules
ROBOTSTXT_OBEY = False

# Enable the item pipelines (the lower number runs first)
ITEM_PIPELINES = {
    'zlzp.pipelines.ZlzpPipeline': 300,
    'zlzp.pipelines.ZlzpJsonPipeline': 301,
}
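Not required for the example to work, but if the site starts rejecting requests it can help to slow the crawl down. A sketch using Scrapy's standard throttling settings; the values are only a suggestion:

# Optional: throttle the crawl so the site is less likely to block it.
# These are standard Scrapy settings; the values here are just a suggestion.
DOWNLOAD_DELAY = 1                    # wait 1 second between requests
CONCURRENT_REQUESTS_PER_DOMAIN = 4    # limit parallel requests to the same domain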
The pipelines.py pipeline file
# -*- coding: utf-8 -*-
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html
import pymysql
import json
import codecs
import os


# save items to the database
class ZlzpPipeline(object):
    conn = None
    mycursor = None

    # called when the spider is opened: connect to the database
    def open_spider(self, spider):
        print("Connecting to the database...")
        self.conn = pymysql.connect(host='localhost', user='root', password='root123',
                                    db="scrapy1", port=3306)
        # get a cursor
        self.mycursor = self.conn.cursor()

    # called for every item the spider yields
    def process_item(self, item, spider):
        position = item["position"]
        name = item["name"]
        salary = item["salary"]
        address = item["address"]
        # insert the record; parameterized SQL so quotes in the data cannot break the statement
        sql = "insert into zhilian values (null, %s, %s, %s, %s)"
        self.mycursor.execute(sql, (position, name, salary, address))
        # commit so the executed SQL actually takes effect
        self.conn.commit()
        print("Inserting data for %s..." % name)
        return item

    # called when the spider is closed
    def close_spider(self, spider):
        print("Closing the database connection...")
        # close the cursor
        self.mycursor.close()
        # close the connection
        self.conn.close()
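A note on the database side: the pipeline above assumes a zhilian table already exists in the scrapy1 database, with an auto-increment id followed by four string columns. A minimal one-off sketch of how such a table could be created; the column names and types are assumptions, only the column count matters for the insert above:

# One-off helper to create the table the MySQL pipeline expects.
# Column names and types are assumptions; the insert above only relies on an
# auto-increment id followed by four string columns.
import pymysql

conn = pymysql.connect(host='localhost', user='root', password='root123',
                       db="scrapy1", port=3306)
with conn.cursor() as cursor:
    cursor.execute("""
        CREATE TABLE IF NOT EXISTS zhilian (
            id INT AUTO_INCREMENT PRIMARY KEY,
            position VARCHAR(255),
            name VARCHAR(255),
            salary VARCHAR(100),
            address VARCHAR(255)
        ) DEFAULT CHARSET=utf8mb4
    """)
conn.commit()
conn.close()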
# save items to a json file
class ZlzpJsonPipeline(object):
    f = None

    # called when the spider is opened
    def open_spider(self, spider):
        # codecs.open is used so the negative seek in close_spider works;
        # it is essentially an encoding/decoding wrapper around the file
        self.f = codecs.open("zlzp.json", "w", encoding="utf-8")
        # write the opening of the JSON object
        self.f.write('{"list": [')

    # called for every item the spider yields
    def process_item(self, item, spider):
        # to store the item as JSON, first turn it into a plain dict
        res = dict(item)
        # json.dumps escapes non-ASCII characters by default;
        # pass ensure_ascii=False so the Chinese text stays readable
        line = json.dumps(res, ensure_ascii=False)
        # write the record followed by a comma and a newline
        self.f.write(line + "," + "\n")
        return item

    # called when the spider is closed
    def close_spider(self, spider):
        # move to the end of the file, then back 2 characters
        self.f.seek(-2, os.SEEK_END)
        # drop everything after that point (the trailing comma and the newline)
        self.f.truncate()
        # close the JSON list and the surrounding object
        self.f.write("]}")
        # close the file
        self.f.close()
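The manual comma handling and truncation above can also be avoided by letting Scrapy serialize the items. A sketch of an equivalent pipeline built on the built-in JsonItemExporter; this is not part of the original code, just an alternative:

# Alternative JSON pipeline using Scrapy's built-in exporter instead of the
# manual comma/truncate bookkeeping above (a sketch, not part of the original).
from scrapy.exporters import JsonItemExporter


class ZlzpJsonExporterPipeline(object):
    def open_spider(self, spider):
        # the exporter writes bytes, so the file is opened in binary mode
        self.file = open("zlzp_export.json", "wb")
        self.exporter = JsonItemExporter(self.file, encoding="utf-8")
        self.exporter.start_exporting()

    def process_item(self, item, spider):
        self.exporter.export_item(item)
        return item

    def close_spider(self, spider):
        self.exporter.finish_exporting()
        self.file.close()

If used, this pipeline would also need its own entry in ITEM_PIPELINES.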