1. Create the project tencentSpider
scrapy startproject tencentSpider
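For orientation, scrapy startproject generates a layout along these lines (the exact files vary slightly across Scrapy versions):
tencentSpider/
    scrapy.cfg              # deploy configuration
    tencentSpider/
        __init__.py
        items.py            # field definitions (step 3)
        pipelines.py        # item pipelines (step 4)
        settings.py         # project settings (step 5)
        spiders/            # spiders live here (step 6)
            __init__.py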
2. Enter the project directory and create the crawl spider
scrapy genspider -t crawl tencent hr.tencent.com
3. Edit items.py and write the item class declaring the fields to extract
# -*- coding: utf-8 -*-

# Define here the models for your scraped items
#
# See documentation in:
# https://doc.scrapy.org/en/latest/topics/items.html

import scrapy


class TencentspiderItem(scrapy.Item):
    # define the fields for your item here like:
    # job title
    positionname = scrapy.Field()
    # link to the job detail page
    positionlink = scrapy.Field()
    # job category
    positionType = scrapy.Field()
    # number of openings
    peopleNum = scrapy.Field()
    # work location
    worklocation = scrapy.Field()
    # publish date
    publishTime = scrapy.Field()
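A scrapy.Item behaves like a dict restricted to the declared fields, which is worth a quick check in a Python shell; a minimal sketch (the values are made up for illustration):
from tencentSpider.items import TencentspiderItem

item = TencentspiderItem()
item['positionname'] = 'Backend Engineer'  # illustrative value
print(dict(item))                          # {'positionname': 'Backend Engineer'}
# item['salary'] = 100 would raise KeyError: only declared fields are accepted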
4. Write the pipeline file that processes the scraped data
# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html

import json


class TencentspiderPipeline(object):
    def __init__(self):
        # Create and open the JSON file; without encoding='utf-8'
        # the Chinese text comes out garbled when read back
        self.filename = open("tencent.json", "w", encoding='utf-8')

    # process_item is required: it handles every item the spider yields
    def process_item(self, item, spider):
        text = json.dumps(dict(item), ensure_ascii=False) + "\n"
        self.filename.write(text)
        return item

    # close_spider is optional: it is called when the spider finishes
    def close_spider(self, spider):
        self.filename.close()
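Opening the file in __init__ works, but Scrapy pipelines also support an open_spider hook, which keeps the file's lifetime symmetric with close_spider; a sketch of that variant:
import json


class TencentspiderPipeline(object):
    def open_spider(self, spider):
        # called once when the spider starts
        self.filename = open("tencent.json", "w", encoding='utf-8')

    def process_item(self, item, spider):
        # one JSON object per line, keeping the Chinese text readable
        self.filename.write(json.dumps(dict(item), ensure_ascii=False) + "\n")
        return item

    def close_spider(self, spider):
        # called once when the spider finishes
        self.filename.close()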
5. Modify the settings file (settings.py) in two places
# Default request headers
DEFAULT_REQUEST_HEADERS = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36",
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    # 'Accept-Language': 'en',
}
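As an aside, when only the user agent needs changing, Scrapy's simpler USER_AGENT setting is enough; DEFAULT_REQUEST_HEADERS is for setting several headers at once:
USER_AGENT = "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36"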
# Item pipelines: where the downloaded data gets processed
ITEM_PIPELINES = {
    'tencentSpider.pipelines.TencentspiderPipeline': 300,
}
If the scraped data scrolls past on screen too fast and you want to keep a local record, you can also add:
# File name the log messages are saved to
LOG_FILE = "tencentlog.log"
# Log level: messages at or above this level are saved
# (the order is DEBUG < INFO < WARNING < ERROR < CRITICAL, so "DEBUG" keeps everything)
LOG_LEVEL = "DEBUG"
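These two logging settings can also be supplied per run on the command line with -s instead of editing settings.py:
scrapy crawl tencent -s LOG_FILE=tencentlog.log -s LOG_LEVEL=INFO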
6. Write the spider
# -*- coding: utf-8 -*-
import scrapy
# LinkExtractor extracts the links that match the given rules
from scrapy.linkextractors import LinkExtractor
# CrawlSpider and Rule drive the rule-based crawl
from scrapy.spiders import CrawlSpider, Rule
from ..items import TencentspiderItem


class TencentSpider(CrawlSpider):
    name = 'tencent'
    allowed_domains = ['hr.tencent.com']
    start_urls = ['https://hr.tencent.com/position.php?start=0#a']

    # Extraction rule for links inside each response:
    # returns the links whose URL matches the pattern
    pagelink = LinkExtractor(allow=r"start=\d+")

    # Request every extracted link in turn, keep following
    # new pages, and run the given callback on each response
    rules = [
        Rule(pagelink, callback='parseTencent', follow=True),
    ]

    # callback
    def parseTencent(self, response):
        for each in response.xpath("//tr[@class='even'] | //tr[@class='odd']"):
            item = TencentspiderItem()
            # job title
            item['positionname'] = each.xpath("./td[1]/a/text()").extract_first()
            # link to the detail page
            item['positionlink'] = each.xpath("./td[1]/a/@href").extract_first()
            # job category
            item['positionType'] = each.xpath("./td[2]/text()").extract_first()
            # number of openings
            item['peopleNum'] = each.xpath("./td[3]/text()").extract_first()
            # work location
            item['worklocation'] = each.xpath("./td[4]/text()").extract_first()
            # publish date
            item['publishTime'] = each.xpath("./td[5]/text()").extract_first()
            yield item
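Before running the full crawl, the XPath expressions above can be tested interactively with the Scrapy shell; a quick sketch (the page structure may have changed since this tutorial was written):
scrapy shell "https://hr.tencent.com/position.php?start=0#a"
>>> rows = response.xpath("//tr[@class='even'] | //tr[@class='odd']")
>>> rows[0].xpath("./td[1]/a/text()").extract_first()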
7. Run the spider
scrapy crawl tencent
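As an alternative to the JSON pipeline, Scrapy's built-in feed exports can write the output directly via -o (tencent.jl gives one JSON object per line, like the pipeline above; tencent.json gives a single JSON array):
scrapy crawl tencent -o tencent.jl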
8. A tencent.json file appears; it contains the scraped data, one JSON object per line.