How to run two or more Scrapy spiders at the same time
Create a separate run file for each spider, then execute each file. The spiders must share the same item (data) model.
run1.py
# -*- coding:utf-8 -*-
from scrapy import cmdline
# Either form works; the argument list is ['scrapy', 'crawl', '<spider name>']
# cmdline.execute(['scrapy', 'crawl', '爬虫1'])
cmdline.execute('scrapy crawl 爬虫1'.split(' '))
run2.py
# -*- coding:utf-8 -*-
from scrapy import cmdline
# Either form works; the argument list is ['scrapy', 'crawl', '<spider name>']
# cmdline.execute(['scrapy', 'crawl', '爬虫2'])
cmdline.execute('scrapy crawl 爬虫2'.split(' '))
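As an alternative to separate run files, both spiders can also be started from a single script with Scrapy's CrawlerProcess. A minimal sketch, assuming the script (here called run_all.py) lives in the project directory so the project settings can be found:
# run_all.py
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

process = CrawlerProcess(get_project_settings())
process.crawl('爬虫1')   # spider names as given in each spider's name attribute
process.crawl('爬虫2')
process.start()          # blocks until both crawls finish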
Spider files
Data cleaning: remove whitespace, blank lines and special characters from both ends of each value (strip is the usual operation), and also discard invalid data such as records with incomplete fields and duplicate records. A small cleaning sketch follows below.
When locating elements, prefer precise XPath expressions over list indexing, because indexing can raise out-of-range errors.
Use exception handling only when the error cannot be pinned down.
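For illustration, a minimal cleaning sketch in plain Python (the sample values are made up; the default '无地点名称' matches the one used in job.py below):
def clean(value, default=''):
    # strip surrounding whitespace; fall back to a default for empty values
    value = (value or '').strip()
    return value if value else default

raw_rows = [(' Python开发 ', ' 北京 '), (' Python开发 ', ' 北京 '), ('PHP开发', '')]
seen = set()
for name, place in raw_rows:
    row = (clean(name), clean(place, '无地点名称'))
    if row in seen:      # drop duplicate records
        continue
    seen.add(row)
    print(row)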
job.py
# -*- coding: utf-8 -*-
import scrapy
from ..items import JobspiderItem
class JobSpider(scrapy.Spider):
    name = 'job'
    allowed_domains = ['51job.com']
    # Three start URLs, for python, php and html job listings respectively
    start_urls = [
        'http://search.51job.com/list/010000%252C020000%252C030200%252C040000%252C180200,000000,0000,00,9,99,python,2,1.html?lang=c&stype=&postchannel=0000&workyear=99&cotype=99&degreefrom=99&jobterm=99&companysize=99&providesalary=99&lonlat=0%2C0&radius=-1&ord_field=0&confirmdate=9&fromType=&dibiaoid=0&address=&line=&specialarea=00&from=&welfare=',
        'http://search.51job.com/list/010000%252C020000%252C030200%252C040000%252C180200,000000,0000,00,9,99,php,2,1.html?lang=c&stype=&postchannel=0000&workyear=99&cotype=99&degreefrom=99&jobterm=99&companysize=99&providesalary=99&lonlat=0%2C0&radius=-1&ord_field=0&confirmdate=9&fromType=&dibiaoid=0&address=&line=&specialarea=00&from=&welfare=',
        'http://search.51job.com/list/010000%252C020000%252C030200%252C040000%252C180200,000000,0000,00,9,99,html,2,1.html?lang=c&stype=&postchannel=0000&workyear=99&cotype=99&degreefrom=99&jobterm=99&companysize=99&providesalary=99&lonlat=0%2C0&radius=-1&ord_field=0&confirmdate=9&fromType=&dibiaoid=0&address=&line=&specialarea=00&from=&welfare=',
    ]

    def parse(self, response):
        yield scrapy.Request(
            url=response.url,
            callback=self.parse_job_info,
            meta={},
            dont_filter=True
        )

    def parse_next_page(self, response):
        """
        Parse the next page.
        :param response:
        :return:
        """
        next_page = response.xpath("//li[@class='bk'][2]/a/@href").extract_first('')
        if next_page:
            yield scrapy.Request(
                url=next_page,
                callback=self.parse_job_info,
                meta={},
                dont_filter=True
            )

    # Recursion: when a function calls itself, that is called recursion.
    def parse_job_info(self, response):
        """
        Parse the job information.
        :param response:
        :return:
        """
        job_div_list = response.xpath("//div[@id='resultList']/div[@class='el']")
        for job_div in job_div_list:
            job_name = job_div.xpath("p/span/a/@title").extract_first('无工作名称').strip().replace(",", "/")
            job_company_name = job_div.xpath("span[@class='t2']/a/@title").extract_first('无公司名称').strip()
            job_place = job_div.xpath("span[@class='t3']/text()").extract_first('无地点名称').strip()
            job_salary = job_div.xpath("span[@class='t4']/text()").extract_first('面议').strip()
            job_time = job_div.xpath("span[@class='t5']/text()").extract_first('无时间信息').strip()
            job_type = '51job' if '51job.com' in response.url else '其它'
            print(job_type, job_name, job_company_name, job_place, job_salary, job_time)
            # Data cleaning: strip whitespace, blank lines and special characters
            # from both ends of each value (usually with strip), and drop invalid
            # data such as incomplete records and duplicates.
            item = JobspiderItem()
            item['job_name'] = job_name
            item['job_company_name'] = job_company_name
            item['job_place'] = job_place
            item['job_salary'] = job_salary
            item['job_time'] = job_time
            item['job_type'] = job_type
            item['fan_kui_lv'] = "没有反馈率"
            yield item
        yield scrapy.Request(
            url=response.url,
            callback=self.parse_next_page,
            dont_filter=True,
        )
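The XPath expressions above can be checked interactively before running the full crawl by opening one of the start URLs in the Scrapy shell (the placeholder stands for any of the URLs listed in start_urls):
scrapy shell "<one of the start URLs above>"
>>> response.xpath("//div[@id='resultList']/div[@class='el']")
>>> response.xpath("//li[@class='bk'][2]/a/@href").extract_first('')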
zl.py
# -*- coding: utf-8 -*-
import scrapy
from ..items import JobspiderItem
class ZlSpider(scrapy.Spider):
    name = 'zl'
    allowed_domains = ['zhaopin.com']
    start_urls = [
        'http://sou.zhaopin.com/jobs/searchresult.ashx?jl=%E5%8C%97%E4%BA%AC%2B%E4%B8%8A%E6%B5%B7%2B%E5%B9%BF%E5%B7%9E%2B%E6%B7%B1%E5%9C%B3%2B%E6%AD%A6%E6%B1%89&kw=python&sm=0&p=1',
        'http://sou.zhaopin.com/jobs/searchresult.ashx?jl=%E5%8C%97%E4%BA%AC%2B%E4%B8%8A%E6%B5%B7%2B%E5%B9%BF%E5%B7%9E%2B%E6%B7%B1%E5%9C%B3%2B%E6%AD%A6%E6%B1%89&kw=php&sm=0&p=1',
        'http://sou.zhaopin.com/jobs/searchresult.ashx?jl=%E5%8C%97%E4%BA%AC%2B%E4%B8%8A%E6%B5%B7%2B%E5%B9%BF%E5%B7%9E%2B%E6%B7%B1%E5%9C%B3%2B%E6%AD%A6%E6%B1%89&kw=html&sm=0&p=1',
    ]

    def parse(self, response):
        yield scrapy.Request(
            url=response.url,
            callback=self.parse_job_info,
            meta={},
            dont_filter=True,
        )

    def parse_job_info(self, response):
        """
        Parse the job information.
        :param response:
        :return:
        """
        zl_table_list = response.xpath("//div[@id='newlist_list_content_table']/table[@class='newlist']")
        # The first table is generated by the page itself (run the spider or view
        # the page source to see it), so it is skipped here.
        for zl_table in zl_table_list[1:]:
            # zl_td_list = zl_table.xpath("tr[1]/td")
            # Problem: the number of td cells is not always 5, so indexing
            # raises an out-of-range error:
            # td1 = zl_td_list[0]
            # td2 = zl_td_list[1]
            # td3 = zl_td_list[2]
            # td4 = zl_td_list[3]
            # td5 = zl_td_list[4]
            # Prefer precise XPath expressions over list indexing, because indexing
            # can raise out-of-range errors. Use exception handling only when the
            # error cannot be pinned down.
            # //text() selects all text nodes inside a tag.
            # extract() converts the selectors to text; the result is still a list.
            # extract_first("default") converts and returns the first element,
            # or the default value if there is none.
            td1 = zl_table.xpath("tr/td[@class='zwmc']/div/a//text()").extract()
            # map(function, iterable) applies the function to every element of the list
            td1 = map(str.strip, td1)
            job_name = "".join(td1).replace(",", "/")
            fan_kui_lv = zl_table.xpath("tr/td[@class='fk_lv']/span/text()").extract_first("没有反馈率").strip()
            job_company_name = zl_table.xpath("tr/td[@class='gsmc']/a[1]/text()").extract_first("没有公司名称").strip()
            job_salary = zl_table.xpath("tr/td[@class='zwyx']/text()").extract_first("面议").strip()
            job_place = zl_table.xpath("tr/td[@class='gzdd']/text()").extract_first("没有地址").strip()
            print(job_name, fan_kui_lv, job_company_name, job_salary, job_place)
            item = JobspiderItem()
            item['job_name'] = job_name
            item['job_company_name'] = job_company_name
            item['job_place'] = job_place
            item['job_salary'] = job_salary
            item['job_time'] = "没有时间"
            item['job_type'] = "智联招聘"
            item['fan_kui_lv'] = fan_kui_lv
            yield item
        yield scrapy.Request(
            url=response.url,
            callback=self.parse_next_page,
            meta={},
            dont_filter=True,
        )

    def parse_next_page(self, response):
        """
        Parse the next page.
        :param response:
        :return:
        """
        next_page = response.xpath("//a[text()='下一页']/@href").extract_first("")
        if next_page:
            yield scrapy.Request(
                url=next_page,
                callback=self.parse_job_info,
                meta={},
                dont_filter=True,
            )
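The map/join pattern used for the job title collapses text spread over several nodes into one clean string. A quick standalone check in plain Python:
parts = ['  Python ', '\n开发工程师  ']
title = "".join(map(str.strip, parts))   # 'Python开发工程师'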
items.py
# -*- coding: utf-8 -*-
# Define here the models for your scraped items
#
# See documentation in:
# https://doc.scrapy.org/en/latest/topics/items.html
import scrapy
class JobspiderItem(scrapy.Item):
    # define the fields for your item here like:
    job_name = scrapy.Field()
    job_company_name = scrapy.Field()
    job_place = scrapy.Field()
    job_salary = scrapy.Field()
    job_time = scrapy.Field()
    job_type = scrapy.Field()
    fan_kui_lv = scrapy.Field()
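A JobspiderItem is filled in like a dictionary, which is exactly how the spiders above use it; a tiny sketch with a made-up value:
item = JobspiderItem()
item['job_name'] = 'Python开发'
print(dict(item))   # {'job_name': 'Python开发'}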
middlewares.py
Write a class named JobUserMiddleware and add it to this file. Its job is to generate a random user agent for each request. To use it, register it under DOWNLOADER_MIDDLEWARES in settings.py and, at the same time, disable Scrapy's built-in user-agent middleware. The class below is modelled on the code in the python package directory site-packages/scrapy/downloadermiddlewares; if the built-in middleware is not disabled, the system's default user agent is used instead.
# Assumed here: UserAgent comes from the fake_useragent package, since the
# original listing uses it without showing the import.
from fake_useragent import UserAgent

class JobUserMiddleware(object):
    """This middleware allows spiders to override the user_agent"""

    def __init__(self, user_agent='Scrapy', name=''):
        self.user_agent = UserAgent()

    @classmethod
    def from_crawler(cls, crawler):
        # o = cls(crawler.settings['USER_AGENT'], '张三')
        # Arguments passed after cls are forwarded to the constructor's parameters
        o = cls()
        # crawler.signals.connect(o.spider_opened, signal=signals.spider_opened)
        return o

    def spider_opened(self, spider):
        # The commented line reads the user_agent attribute from the spider,
        # falling back to self.user_agent if the spider does not define one
        # self.user_agent = getattr(spider, 'user_agent', self.user_agent)
        pass

    def process_request(self, request, spider):
        if self.user_agent:
            # Attach a randomly chosen user agent to every outgoing request
            request.headers.setdefault(b'User-Agent', self.user_agent.random)
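If the fake_useragent package is not already installed (an assumption, since the original file does not show the import), it can be added with:
pip install fake-useragent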
pipelines.py
# -*- coding: utf-8 -*-
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html
# A pipeline receives the item data returned by the spiders.
class JobspiderPipeline(object):
    def process_item(self, item, spider):
        return item

class ToCsvPipeline(object):
    def process_item(self, item, spider):
        with open("job.csv", "a", encoding="gb18030") as f:
            job_name = item['job_name']
            job_company_name = item['job_company_name']
            job_place = item['job_place']
            job_salary = item['job_salary']
            job_time = item['job_time']
            job_type = item['job_type']
            fan_kui_lv = item['fan_kui_lv']
            job_info = [job_name, job_company_name, job_place, job_salary, job_time, job_type, fan_kui_lv, "\n"]
            f.write(",".join(job_info))
        # Pass the item on to the next pipeline
        return item
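Because the fields are joined with commas by hand, a comma inside a value would break the CSV layout (which is why the spiders replace commas in job names). A hedged alternative sketch using Python's csv module, which quotes such values automatically:
import csv

class ToCsvPipeline(object):
    def process_item(self, item, spider):
        with open("job.csv", "a", encoding="gb18030", newline="") as f:
            writer = csv.writer(f)
            writer.writerow([item['job_name'], item['job_company_name'], item['job_place'],
                             item['job_salary'], item['job_time'], item['job_type'], item['fan_kui_lv']])
        return item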
settings.py
The following settings need to be changed:
# Whether to obey robots.txt; the default is True
ROBOTSTXT_OBEY = False
# Download delay, so requests are not sent fast enough to be detected by the server
DOWNLOAD_DELAY = 0.5
# Disable cookie tracking to lower the chance of being detected
COOKIES_ENABLED = False
# Enable the custom middleware
DOWNLOADER_MIDDLEWARES = {
    'JobSpider.middlewares.JobUserMiddleware': 543,
    # Disable the built-in user-agent middleware; None disables it,
    # and a smaller number means higher priority
    'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware': None,
}
# Enable the item pipeline
ITEM_PIPELINES = {
    'JobSpider.pipelines.ToCsvPipeline': 300,
}
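With everything configured, each spider is started from its own run file; for this project the spider names are job and zl, so the run files would execute scrapy crawl job and scrapy crawl zl respectively:
python run1.py
python run2.py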