Create the project with scrapy startproject <project_name>, then cd into the project directory and create the spider file either by hand or with the command scrapy genspider <spider_name> <domain>.
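For this project the concrete commands would look like this (the project, spider, and domain names are taken from the tree and spider code that follow):

scrapy startproject zhilian
cd zhilian
scrapy genspider zhilianzhaopin zhaopin.com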
The resulting tree looks like this:
│ main.py
│ scrapy.cfg
│ __init__.py
│
├─zhilian
│ │ items.py
│ │ middlewares.py
│ │ MYmiddlewares.py
│ │ pipelines.py
│ │ settings.py
│ │ __init__.py
│ │
│ ├─spiders
│ │ │ zhilianzhaopin.py
│ │ │ __init__.py
│ │ │
│ │ └─__pycache__
│ │ zhilianzhaopin.cpython-36.pyc
│ │ __init__.cpython-36.pyc
│ │
│ └─__pycache__
│ items.cpython-36.pyc
│ MYmiddlewares.cpython-36.pyc
│ pipelines.cpython-36.pyc
│ settings.cpython-36.pyc
│ __init__.cpython-36.pyc
│
└─__pycache__
__init__.cpython-36.pyc
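The tree also shows a main.py at the project root. Its contents are not listed in this post; a plausible sketch (an assumption, not the author's file) is the usual runner that lets you start the crawl from an IDE:

from scrapy.cmdline import execute

# equivalent to running `scrapy crawl zhilian` from the project root
execute(['scrapy', 'crawl', 'zhilian'])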
The spider (zhilian/spiders/zhilianzhaopin.py):

import scrapy
import jsonpath
import json
import requests
from urllib import parse
from zhilian.items import ZhilianItem


class Zhilian(scrapy.Spider):
    name = 'zhilian'
    allowed_domains = ['zhaopin.com']
    start_urls = ['https://www.zhaopin.com/']
    # cityId=489 means nationwide (全国) by default
    base_url = 'https://fe-api.zhaopin.com/c/i/sou?start=%d&pageSize=60&cityId=489&workExperience=-1&education=-1&companyType=-1&employmentType=-1&jobWelfareTag=-1&{}&kt=3'
    # base_url = 'https://fe-api.zhaopin.com/c/i/sou?pageSize=60&cityId=%d&workExperience=-1&education=-1&companyType=-1&employmentType=-1&jobWelfareTag=-1&kw=%s&kt=3'

    custom_settings = {
        'DOWNLOADER_MIDDLEWARES': {
            # 'zhilian.MYmiddlewares.MiddleAgent': 800,
            'zhilian.MYmiddlewares.RandomProxyMysql': 900,
        },
        'COOKIES_ENABLED': False,
        # 'RETRY_TIMES': 2,       # downloader retry count
        # 'DOWNLOAD_TIMEOUT': 3,  # time requests out after 3 seconds
    }

    headers = {
        "Accept": "application/json, text/plain, */*",
        "Accept-Language": "zh-CN,zh;q=0.9",
        "Host": "fe-api.zhaopin.com",
        "Origin": "https://sou.zhaopin.com",
        "Referer": "https://sou.zhaopin.com/?kw=Java%E5%BC%80%E5%8F%91&jl=489&",
        "User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3359.139 Safari/537.36",
        "Cookie": "adfbid2=0; _jzqa=1.779382411042367400.1530518801.1530518801.1530518801.1; _jzqy=1.1530518801.1530518801.1.jzqsr=baidu|jzqct=%E6%99%BA%E8%81%94%E6%8B%9B%E8%81%98.-; sts_deviceid=1645a088bea4aa-007ca90908f653-444a022e-1440000-1645a088beb327; campusOperateJobUserInfo=d9d69bc3-ce60-4a6e-9ecb-113ce0efab90; zg_did=%7B%22did%22%3A%20%221645a09b07c5f7-09e8cc190016e5-444a022e-15f900-1645a09b07d347%22%7D; ZP_OLD_FLAG=false; __xsptplus30=30.3.1530519350.1530519350.1%231%7Cother%7Ccnt%7C121113803%7C%7C%23%23rd9x5AceBDmDAF408t-VLcB57sdbGjOo%23; urlfrom2=121126445; adfcid2=none; LastCity=%E5%85%A8%E5%9B%BD; LastCity%5Fid=489; dywez=95841923.1530598259.5.2.dywecsr=sou.zhaopin.com|dyweccn=(referral)|dywecmd=referral|dywectr=undefined|dywecct=/; __utmz=269921210.1530598259.5.2.utmcsr=sou.zhaopin.com|utmccn=(referral)|utmcmd=referral|utmcct=/; urlfrom=121126445; adfcid=none; adfbid=0; dywec=95841923; sts_sg=1; __utmc=269921210; dywea=95841923.3939452883772670000.1530518798.1530632108.1530634535.8; __utma=269921210.1361494379.1530518800.1530632108.1530634535.8; sts_sid=16460ed82513b7-0cb69607cd02ee-444a022e-1440000-16460ed82523e7; __utmt=1; zg_08c5bcee6e9a4c0594a5d34b79b9622a=%7B%22sid%22%3A%201530635395576%2C%22updated%22%3A%201530635434073%2C%22info%22%3A%201530518941872%2C%22superProperty%22%3A%20%22%7B%7D%22%2C%22platform%22%3A%20%22%7B%7D%22%2C%22utm%22%3A%20%22%7B%7D%22%2C%22referrerDomain%22%3A%20%22%22%2C%22landHref%22%3A%20%22https%3A%2F%2Fxiaoyuan.zhaopin.com%2F%22%7D; zp_src_url=https%3A%2F%2Fjobs.zhaopin.com%2FCZ478053280J00059255513.htm; dyweb=95841923.6.10.1530634535; __utmb=269921210.6.10.1530634535; Hm_lvt_38ba284938d5eddca645bb5e02a02006=1530518800,1530518811,1530519350,1530635448; Hm_lpvt_38ba284938d5eddca645bb5e02a02006=1530635448; ZL_REPORT_GLOBAL={%22sou%22:{%22actionIdFromSou%22:%22c3a9c233-0074-42ea-a976-9c82a8802d14-sou%22%2C%22funczone%22:%22smart_matching%22}}; GUID=52e3369c691f4672a808374ad5d77426; sts_evtseq=9",
    }

    # only crawl the IT job categories
    def parse(self, response):
        keywords = response.xpath(
            '//ol[@class="zp-jobNavigater-list"]/li[1]'
            '//div[@class="zp-jobNavigater-pop-list"]/a/text()').extract()
        for kw in keywords:
            qs = parse.urlencode({'kw': kw})
            # fetch page 0 once to learn the total result count (numFound)
            baseurl = (self.base_url % 0).format(qs)
            data = json.loads(requests.get(baseurl).text)
            total = jsonpath.jsonpath(data, '$..numFound')[0]
            for start in range(0, total, 60):
                fullurl = (self.base_url % start).format(qs)
                print(fullurl)
                yield scrapy.Request(fullurl, callback=self.parse_page,
                                     headers=self.headers)

    def parse_page(self, response):
        data = json.loads(response.text)
        results = jsonpath.jsonpath(data, '$..results[*]')
        for i in results:
            item = ZhilianItem()
            item['spidername'] = 'zhilianzhaopin'
            item['company_job'] = jsonpath.jsonpath(i, '$..jobName')[0]
            item['company'] = jsonpath.jsonpath(i, '$..company.name')[0]
            item['money'] = jsonpath.jsonpath(i, '$..salary')[0]
            item['adress'] = jsonpath.jsonpath(i, '$..city.display')[0]
            item['date_time'] = jsonpath.jsonpath(i, '$..createDate')[0]
            item['tag_list'] = ','.join(jsonpath.jsonpath(i, '$..welfare')[0])
            item['point'] = (jsonpath.jsonpath(i, '$..workingExp.name')[0] + ',' +
                             jsonpath.jsonpath(i, '$..eduLevel.name')[0])
            pos_url = jsonpath.jsonpath(i, '$..positionURL')[0]
            item['url'] = pos_url
            yield scrapy.Request(pos_url, callback=self.parse_detail,
                                 meta={'item': item})

    def parse_detail(self, response):
        item = response.meta['item']
        info = response.css('div.tab-inner-cont p::text').extract()
        item['info'] = ','.join(info).strip().strip(',').strip()
        yield item
A few notes on the analysis:

1. Experimentation shows the data is not in the page source, so the URL that actually carries the data has to be captured with Fiddler. This step calls for careful analysis: hunt down every parameter of the URL (e.g. the page offset and the keyword) until you have a URL that can fetch all of the JSON data. (In the URL above I only turned the start offset and the keyword into variables; the remaining parameters are fixed, so only part of the data is crawled.)
2. Parse the JSON with jsonpath to extract the fields you want, along with the detail-page URL, which is handed to parse_detail (see the jsonpath sketch after this list).
3. Some of the data needed from the detail page is not in the tags found during analysis, for example the salary and the job description; inspect the page source yourself to pin down the exact location and tags, then extract them with CSS selectors or similar. Note: extract whatever you can from the JSON data, since that avoids a lot of work.
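A minimal jsonpath sketch of the extraction logic used above; the sample dict only mimics the assumed shape of the fe-api.zhaopin.com response, it is not real data:

import jsonpath

sample = {'data': {'numFound': 2, 'results': [
    {'jobName': 'Java开发', 'company': {'name': 'ACME'}, 'salary': '10K-15K'},
    {'jobName': 'Python开发', 'company': {'name': 'Example'}, 'salary': '15K-20K'},
]}}

total = jsonpath.jsonpath(sample, '$..numFound')[0]   # -> 2
jobs = jsonpath.jsonpath(sample, '$..results[*]')     # one dict per posting
print(total, jsonpath.jsonpath(jobs[0], '$..jobName')[0])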
To avoid being blocked, the middleware layer does three things:

1. Define your own request headers and write your own middleware;
2. Build a proxy pool and pick a proxy from it at random (a sketch of the assumed proxy table follows this list);
3. Add the related configuration to the settings file.
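RandomProxyMysql below selects rows from a py09_proxy table and uses proxy[1] and proxy[2] as host and port, so the assumed column layout is (id, ip, port). A sketch that creates such a table with the same connection settings as the pipeline (the schema is a guess, not the author's DDL):

import pymysql

conn = pymysql.connect(host='127.0.0.1', user='root', password='123456',
                       db='temp', charset='utf8')
with conn.cursor() as cursor:
    # SELECT * must yield (id, ip, port) for proxy[1]/proxy[2] to work
    cursor.execute("""
        CREATE TABLE IF NOT EXISTS py09_proxy (
            id   INT AUTO_INCREMENT PRIMARY KEY,
            ip   VARCHAR(64) NOT NULL,
            port VARCHAR(8)  NOT NULL
        )
    """)
conn.commit()
conn.close()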
The custom downloader middleware (zhilian/MYmiddlewares.py):

# -*- coding: utf-8 -*-
# Define here the models for your spider middleware
#
# See documentation in:
# https://doc.scrapy.org/en/latest/topics/spider-middleware.html
import random

import pymysql
from fake_useragent import UserAgent
from scrapy.conf import settings


# defining a middleware is just defining a class
# rotate the request headers at random
class MiddleAgent(object):
    cookies = [
        {"Cookie": "adfbid2=0; _jzqa=1.779382411042367400.1530518801.1530518801.1530518801.1; _jzqy=1.1530518801.1530518801.1.jzqsr=baidu|jzqct=%E6%99%BA%E8%81%94%E6%8B%9B%E8%81%98.-; sts_deviceid=1645a088bea4aa-007ca90908f653-444a022e-1440000-1645a088beb327; campusOperateJobUserInfo=d9d69bc3-ce60-4a6e-9ecb-113ce0efab90; zg_did=%7B%22did%22%3A%20%221645a09b07c5f7-09e8cc190016e5-444a022e-15f900-1645a09b07d347%22%7D; ZP_OLD_FLAG=false; __xsptplus30=30.3.1530519350.1530519350.1%231%7Cother%7Ccnt%7C121113803%7C%7C%23%23rd9x5AceBDmDAF408t-VLcB57sdbGjOo%23; urlfrom2=121126445; adfcid2=none; LastCity=%E5%85%A8%E5%9B%BD; LastCity%5Fid=489; dywez=95841923.1530598259.5.2.dywecsr=sou.zhaopin.com|dyweccn=(referral)|dywecmd=referral|dywectr=undefined|dywecct=/; __utmz=269921210.1530598259.5.2.utmcsr=sou.zhaopin.com|utmccn=(referral)|utmcmd=referral|utmcct=/; urlfrom=121126445; adfcid=none; adfbid=0; dywec=95841923; sts_sg=1; __utmc=269921210; dywea=95841923.3939452883772670000.1530518798.1530632108.1530634535.8; __utma=269921210.1361494379.1530518800.1530632108.1530634535.8; sts_sid=16460ed82513b7-0cb69607cd02ee-444a022e-1440000-16460ed82523e7; __utmt=1; zg_08c5bcee6e9a4c0594a5d34b79b9622a=%7B%22sid%22%3A%201530635395576%2C%22updated%22%3A%201530635434073%2C%22info%22%3A%201530518941872%2C%22superProperty%22%3A%20%22%7B%7D%22%2C%22platform%22%3A%20%22%7B%7D%22%2C%22utm%22%3A%20%22%7B%7D%22%2C%22referrerDomain%22%3A%20%22%22%2C%22landHref%22%3A%20%22https%3A%2F%2Fxiaoyuan.zhaopin.com%2F%22%7D; zp_src_url=https%3A%2F%2Fjobs.zhaopin.com%2FCZ478053280J00059255513.htm; dyweb=95841923.6.10.1530634535; __utmb=269921210.6.10.1530634535; Hm_lvt_38ba284938d5eddca645bb5e02a02006=1530518800,1530518811,1530519350,1530635448; Hm_lpvt_38ba284938d5eddca645bb5e02a02006=1530635448; ZL_REPORT_GLOBAL={%22sou%22:{%22actionIdFromSou%22:%22c3a9c233-0074-42ea-a976-9c82a8802d14-sou%22%2C%22funczone%22:%22smart_matching%22}}; GUID=52e3369c691f4672a808374ad5d77426; sts_evtseq=9"},
        {"Cookie": "adfbid2=0; _jzqa=1.779382411042367400.1530518801.1530518801.1530518801.1; _jzqy=1.1530518801.1530518801.1.jzqsr=baidu|jzqct=%E6%99%BA%E8%81%94%E6%8B%9B%E8%81%98.-; sts_deviceid=1645a088bea4aa-007ca90908f653-444a022e-1440000-1645a088beb327; campusOperateJobUserInfo=d9d69bc3-ce60-4a6e-9ecb-113ce0efab90; zg_did=%7B%22did%22%3A%20%221645a09b07c5f7-09e8cc190016e5-444a022e-15f900-1645a09b07d347%22%7D; ZP_OLD_FLAG=false; __xsptplus30=30.3.1530519350.1530519350.1%231%7Cother%7Ccnt%7C121113803%7C%7C%23%23rd9x5AceBDmDAF408t-VLcB57sdbGjOo%23; urlfrom2=121126445; adfcid2=none; LastCity=%E5%85%A8%E5%9B%BD; LastCity%5Fid=489; dywez=95841923.1530598259.5.2.dywecsr=sou.zhaopin.com|dyweccn=(referral)|dywecmd=referral|dywectr=undefined|dywecct=/; __utmz=269921210.1530598259.5.2.utmcsr=sou.zhaopin.com|utmccn=(referral)|utmcmd=referral|utmcct=/; urlfrom=121126445; adfcid=none; adfbid=0; dywec=95841923; sts_sg=1; __utmc=269921210; dywea=95841923.3939452883772670000.1530518798.1530632108.1530634535.8; __utma=269921210.1361494379.1530518800.1530632108.1530634535.8; sts_sid=16460ed82513b7-0cb69607cd02ee-444a022e-1440000-16460ed82523e7; zg_08c5bcee6e9a4c0594a5d34b79b9622a=%7B%22sid%22%3A%201530635395576%2C%22updated%22%3A%201530635434073%2C%22info%22%3A%201530518941872%2C%22superProperty%22%3A%20%22%7B%7D%22%2C%22platform%22%3A%20%22%7B%7D%22%2C%22utm%22%3A%20%22%7B%7D%22%2C%22referrerDomain%22%3A%20%22%22%2C%22landHref%22%3A%20%22https%3A%2F%2Fxiaoyuan.zhaopin.com%2F%22%7D; zp_src_url=https%3A%2F%2Fjobs.zhaopin.com%2FCZ478053280J00059255513.htm; dyweb=95841923.6.10.1530634535; __utmb=269921210.6.10.1530634535; Hm_lvt_38ba284938d5eddca645bb5e02a02006=1530518800,1530518811,1530519350,1530635448; GUID=0bdb4e3bfb2845bc834bd9ef6b9341af; ZL_REPORT_GLOBAL={%22sou%22:{%22actionIdFromSou%22:%22a6e039d1-dce0-48c9-bf18-47123bd339f8-sou%22%2C%22funczone%22:%22smart_matching%22}}; Hm_lpvt_38ba284938d5eddca645bb5e02a02006=1530635458; sts_evtseq=10"},
        {"Cookie": "adfbid2=0; _jzqa=1.779382411042367400.1530518801.1530518801.1530518801.1; _jzqy=1.1530518801.1530518801.1.jzqsr=baidu|jzqct=%E6%99%BA%E8%81%94%E6%8B%9B%E8%81%98.-; sts_deviceid=1645a088bea4aa-007ca90908f653-444a022e-1440000-1645a088beb327; campusOperateJobUserInfo=d9d69bc3-ce60-4a6e-9ecb-113ce0efab90; zg_did=%7B%22did%22%3A%20%221645a09b07c5f7-09e8cc190016e5-444a022e-15f900-1645a09b07d347%22%7D; ZP_OLD_FLAG=false; __xsptplus30=30.3.1530519350.1530519350.1%231%7Cother%7Ccnt%7C121113803%7C%7C%23%23rd9x5AceBDmDAF408t-VLcB57sdbGjOo%23; urlfrom2=121126445; adfcid2=none; LastCity=%E5%85%A8%E5%9B%BD; LastCity%5Fid=489; dywez=95841923.1530598259.5.2.dywecsr=sou.zhaopin.com|dyweccn=(referral)|dywecmd=referral|dywectr=undefined|dywecct=/; __utmz=269921210.1530598259.5.2.utmcsr=sou.zhaopin.com|utmccn=(referral)|utmcmd=referral|utmcct=/; BLACKSTRIP=yes; urlfrom=121126445; adfcid=none; adfbid=0; dywec=95841923; sts_sg=1; Hm_lvt_80e552e101e24fe607597e5f45c8d2a2=1530519653,1530519725,1530598259,1530632108; __utmc=269921210; dywea=95841923.3939452883772670000.1530518798.1530632108.1530634535.8; __utma=269921210.1361494379.1530518800.1530632108.1530634535.8; sts_sid=16460ed82513b7-0cb69607cd02ee-444a022e-1440000-16460ed82523e7; Hm_lpvt_80e552e101e24fe607597e5f45c8d2a2=1530635319; stayTimeCookie=0; referrerUrl=https%3A//jobs.zhaopin.com/CZ478053280J00059255513.htm; zg_08c5bcee6e9a4c0594a5d34b79b9622a=%7B%22sid%22%3A%201530635395576%2C%22updated%22%3A%201530635434073%2C%22info%22%3A%201530518941872%2C%22superProperty%22%3A%20%22%7B%7D%22%2C%22platform%22%3A%20%22%7B%7D%22%2C%22utm%22%3A%20%22%7B%7D%22%2C%22referrerDomain%22%3A%20%22%22%2C%22landHref%22%3A%20%22https%3A%2F%2Fxiaoyuan.zhaopin.com%2F%22%7D; zp_src_url=https%3A%2F%2Fjobs.zhaopin.com%2FCZ478053280J00059255513.htm; Hm_lvt_38ba284938d5eddca645bb5e02a02006=1530518800,1530518811,1530519350,1530635448; GUID=0bdb4e3bfb2845bc834bd9ef6b9341af; ZL_REPORT_GLOBAL={%22sou%22:{%22actionIdFromSou%22:%22a6e039d1-dce0-48c9-bf18-47123bd339f8-sou%22%2C%22funczone%22:%22smart_matching%22}}; Hm_lpvt_38ba284938d5eddca645bb5e02a02006=1530635458; sts_evtseq=11; dyweb=95841923.7.10.1530634535; __utmt=1; __utmb=269921210.7.10.1530634535"},
    ]

    def __init__(self):
        self.ua = UserAgent()
        self.cookie = random.choice(self.cookies)

    def process_request(self, request, spider):
        request.headers['User-Agent'] = self.ua.random
        request.headers['Cookie'] = self.cookie['Cookie']


# random proxy middleware mixing free and authenticated proxies (database version)
class RandomProxyMysql(object):
    def __init__(self):
        my = settings['MYSQL']
        self.conn = pymysql.connect(my['host'], my['user'], my['password'],
                                    my['db'], charset='utf8')
        self.cursor = self.conn.cursor()

    # runs before a request is sent
    def process_request(self, request, spider):
        # print(settings['PROXIES'])
        # print(random.choice(settings['PROXIES']))
        # fetch a proxy
        proxy = self.random_proxy()
        print(proxy)
        # attach the proxy; the row layout is (id, ip, port)
        request.meta['proxy'] = 'http://%s:%s' % (proxy[1], proxy[2])

    # runs after a response arrives
    def process_response(self, request, response, spider):
        print(response.status)
        return response

    def random_proxy(self):
        sql = 'select * from py09_proxy ORDER BY rand() limit 1'
        self.cursor.execute(sql)
        return self.cursor.fetchone()
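One design note: from scrapy.conf import settings is deprecated in newer Scrapy releases. A minimal sketch of the same settings access through the from_crawler hook (an illustration, not the author's code; the class name RandomProxyMysqlModern is made up):

import pymysql


class RandomProxyMysqlModern(object):
    def __init__(self, settings):
        my = settings['MYSQL']
        self.conn = pymysql.connect(host=my['host'], user=my['user'],
                                    password=my['password'], db=my['db'],
                                    charset='utf8')
        self.cursor = self.conn.cursor()

    @classmethod
    def from_crawler(cls, crawler):
        # Scrapy passes the crawler, which carries the live settings object
        return cls(crawler.settings)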
The pipeline (zhilian/pipelines.py):

# -*- coding: utf-8 -*-
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html
import pymysql


class ZhilianPipeline(object):
    def process_item(self, item, spider):
        return item


# write the data to the database
class MysqlPipeline(object):
    def open_spider(self, spider):
        # open the database connection
        print('connecting to the database')
        self.conn = pymysql.connect('127.0.0.1', 'root', '123456', 'temp',
                                    charset='utf8')
        self.cursor = self.conn.cursor()

    def process_item(self, item, spider):
        # store the data
        try:
            sql, data = item.get_sql()
            self.cursor.execute(sql, data)
            self.conn.commit()
            print('row written')
        except Exception as e:
            print('------', e)
        return item

    def close_spider(self, spider):
        self.cursor.close()
        self.conn.close()
The items (zhilian/items.py; the empty template class generated by Scrapy has been dropped):

# -*- coding: utf-8 -*-
# Define here the models for your scraped items
#
# See documentation in:
# https://doc.scrapy.org/en/latest/topics/items.html
import scrapy


class ZhilianItem(scrapy.Item):
    spidername = scrapy.Field()
    company_job = scrapy.Field()
    company = scrapy.Field()
    money = scrapy.Field()
    adress = scrapy.Field()
    date_time = scrapy.Field()
    tag_list = scrapy.Field()
    point = scrapy.Field()
    url = scrapy.Field()
    info = scrapy.Field()

    def get_sql(self):
        sql = ('insert into py09_zhilian(spidername, company_job, company, money, '
               'adress, date_time, tag_list, point, url, info) '
               'values(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)')
        data = (self['spidername'], self['company_job'], self['company'],
                self['money'], self['adress'], self['date_time'],
                self['tag_list'], self['point'], self['url'], self['info'])
        return sql, data
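get_sql() above implies a py09_zhilian table with those ten columns. A sketch that creates it (the column types are assumptions, not the author's DDL):

import pymysql

conn = pymysql.connect(host='127.0.0.1', user='root', password='123456',
                       db='temp', charset='utf8')
with conn.cursor() as cursor:
    # columns mirror the INSERT statement built by get_sql()
    cursor.execute("""
        CREATE TABLE IF NOT EXISTS py09_zhilian (
            id INT AUTO_INCREMENT PRIMARY KEY,
            spidername VARCHAR(64), company_job VARCHAR(255),
            company VARCHAR(255), money VARCHAR(64), adress VARCHAR(128),
            date_time VARCHAR(64), tag_list VARCHAR(255), point VARCHAR(128),
            url VARCHAR(512), info TEXT
        )
    """)
conn.commit()
conn.close()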
The settings (zhilian/settings.py):

# -*- coding: utf-8 -*-

# Scrapy settings for zhilian project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
#     https://doc.scrapy.org/en/latest/topics/settings.html
#     https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
#     https://doc.scrapy.org/en/latest/topics/spider-middleware.html

BOT_NAME = 'zhilian'

SPIDER_MODULES = ['zhilian.spiders']
NEWSPIDER_MODULE = 'zhilian.spiders'

# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'zhilian (+http://www.yourdomain.com)'

# Obey robots.txt rules
ROBOTSTXT_OBEY = False

# Configure maximum concurrent requests performed by Scrapy (default: 16)
CONCURRENT_REQUESTS = 1

# Configure a delay for requests for the same website (default: 0)
# See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
#DOWNLOAD_DELAY = 3
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16

# Disable cookies (enabled by default)
#COOKIES_ENABLED = False

# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False

# Override the default request headers:
#DEFAULT_REQUEST_HEADERS = {
#   'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
#   'Accept-Language': 'en',
#}

# Enable or disable spider middlewares
# See https://doc.scrapy.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
#    'zhilian.middlewares.ZhilianSpiderMiddleware': 543,
#}

# Enable or disable downloader middlewares
# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
#DOWNLOADER_MIDDLEWARES = {
#    'zhilian.middlewares.ZhilianDownloaderMiddleware': 543,
#}

# Enable or disable extensions
# See https://doc.scrapy.org/en/latest/topics/extensions.html
#EXTENSIONS = {
#    'scrapy.extensions.telnet.TelnetConsole': None,
#}

# Configure item pipelines
# See https://doc.scrapy.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
    'zhilian.pipelines.ZhilianPipeline': 300,
    'zhilian.pipelines.MysqlPipeline': 200,
}

# Enable and configure the AutoThrottle extension (disabled by default)
# See https://doc.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False

# Enable and configure HTTP caching (disabled by default)
# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'

# MySQL connection info read by the RandomProxyMysql middleware
MYSQL = {
    'host': '127.0.0.1',
    'user': 'root',
    'password': '123456',
    'db': 'temp',
}
When crawling a site, don't rush into writing code; the key step is analyzing the page. The coding logic is fairly routine, but page structures and data vary from site to site.