Analysis:
IT桔子 (https://www.itjuzi.com/) is a website that provides company and investment data.
- Viewing requires login, and information beyond page 20 is not accessible.
- To work around the login requirement, carry a cookie with each request (see the sketch below).
Reference: https://blog.csdn.net/sinat_35360663/article/details/78505129
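A minimal sketch of carrying the cookie: copy the Cookie header value from a logged-in browser session and attach it to every request in a small downloader middleware. The class name, cookie string, and priority below are illustrative assumptions, not part of the original project; note that COOKIES_ENABLED is set to False later in settings.py, so the raw Cookie header is sent exactly as given.

# itjuzi/middlewares.py -- sketch only, not from the original post
class LoginCookieMiddleware(object):
    # assumption: paste the Cookie header value copied from a logged-in browser session here
    COOKIE_STRING = 'session=...; other=...'  # placeholder

    def process_request(self, request, spider):
        # attach the raw Cookie header to every outgoing request
        request.headers.setdefault('Cookie', self.COOKIE_STRING)

It would also need an entry in DOWNLOADER_MIDDLEWARES, e.g. 'itjuzi.middlewares.LoginCookieMiddleware': 150.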
First write a Scrapy crawler based on the CrawlSpider class, then modify it into a Scrapy-Redis distributed crawler based on the RedisCrawlSpider class.
1. Basic cmd operations
C:\Users\Administrator\Desktop>scrapy startproject itjuzi
C:\Users\Administrator\Desktop>cd itjuzi
C:\Users\Administrator\Desktop\itjuzi>scrapy genspider -t crawl demo www.itjuzi.com
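These commands produce the standard Scrapy project layout, roughly:

itjuzi/
    scrapy.cfg
    itjuzi/
        __init__.py
        items.py
        middlewares.py
        pipelines.py
        settings.py
        spiders/
            __init__.py
            demo.py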
2. Edit items.py to define the item fields that need to be saved
from scrapy.item import Field, Item

class ItjuziItem(Item):
    info_id = Field()       # company id (the numeric part of the URL)
    company_name = Field()  # company name
    slogan = Field()        # company slogan
    scope = Field()         # category
    # investment records: funding date, round, amount, investors
    tz_info = Field()
    # team members: name, title, introduction
    tm_info = Field()
3. Edit settings.py [since the crawler uses the Scrapy-Redis framework, the Redis-related settings must be configured]
BOT_NAME = 'itjuzi'

SPIDER_MODULES = ['itjuzi.spiders']
NEWSPIDER_MODULE = 'itjuzi.spiders'

# Scheduler: queue requests in Redis so all crawlers share one queue
SCHEDULER = "scrapy_redis.scheduler.Scheduler"
# Duplicate filter: store request fingerprints in Redis so all crawlers share one dupefilter
DUPEFILTER_CLASS = "scrapy_redis.dupefilter.RFPDupeFilter"

ROBOTSTXT_OBEY = False
DOWNLOAD_DELAY = 1.5
# randomize the download delay
RANDOMIZE_DOWNLOAD_DELAY = True
COOKIES_ENABLED = False

DOWNLOADER_MIDDLEWARES = {
    # Collects failed pages and reschedules them once the crawl has finished
    # (failures may be caused by temporary problems such as connection timeouts or HTTP 500 errors)
    'scrapy.downloadermiddlewares.retry.RetryMiddleware': 80,
    # Adds HTTP proxy support; enable it by setting the 'proxy' key in a Request's meta
    'scrapy.downloadermiddlewares.httpproxy.HttpProxyMiddleware': 100,
    # Custom middleware that sets the request User-Agent
    'itjuzi.middlewares.RotateUserAgentMiddleware': 200,
}

ITEM_PIPELINES = {
    'itjuzi.pipelines.ItjuziPipeline': 300,
}

# IP and port of the Redis server on the master
REDIS_HOST = "192.168.199.108"
REDIS_PORT = 6379
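Two optional scrapy_redis settings are often added in this kind of setup (they are not part of the original settings.py):

# optional: keep the Redis request queue and dupefilter when the spider closes,
# so an interrupted crawl can be resumed instead of starting over
SCHEDULER_PERSIST = True
# optional: a single connection URL can replace REDIS_HOST/REDIS_PORT,
# e.g. when the Redis server requires a password
# REDIS_URL = 'redis://:password@192.168.199.108:6379'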
4. Edit middlewares.py [add a custom middleware, RotateUserAgentMiddleware, which sets the User-Agent on each request]
import random

# the old scrapy.contrib.downloadermiddleware path is deprecated; use the current module path
from scrapy.downloadermiddlewares.useragent import UserAgentMiddleware

class RotateUserAgentMiddleware(UserAgentMiddleware):
    def __init__(self, user_agent=''):
        self.user_agent = user_agent
        super(RotateUserAgentMiddleware, self).__init__()

    def process_request(self, request, spider):
        # pick a random User-Agent for each request
        ua = random.choice(self.user_agent_list)
        request.headers.setdefault('User-Agent', ua)

    user_agent_list = [
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1",
        "Mozilla/5.0 (Windows NT 6.0) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.36 Safari/536.5",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
        "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_0) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
        "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24",
        "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24",
        "Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/531.21.8 (KHTML, like Gecko) Version/4.0.4 Safari/531.21.10",
        "Mozilla/5.0 (Windows; U; Windows NT 5.2; en-US) AppleWebKit/533.17.8 (KHTML, like Gecko) Version/5.0.1 Safari/533.17.8",
        "Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/533.19.4 (KHTML, like Gecko) Version/5.0.2 Safari/533.18.5",
        "Mozilla/5.0 (Windows; U; Windows NT 6.1; en-GB; rv:1.9.1.17) Gecko/20110123 (like Firefox/3.x) SeaMonkey/2.0.12",
        "Mozilla/5.0 (Windows NT 5.2; rv:10.0.1) Gecko/20100101 Firefox/10.0.1 SeaMonkey/2.7.1",
        "Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_5_8; en-US) AppleWebKit/532.8 (KHTML, like Gecko) Chrome/4.0.302.2 Safari/532.8",
        "Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_4; en-US) AppleWebKit/534.3 (KHTML, like Gecko) Chrome/6.0.464.0 Safari/534.3",
        "Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_5; en-US) AppleWebKit/534.13 (KHTML, like Gecko) Chrome/9.0.597.15 Safari/534.13",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_2) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/14.0.835.186 Safari/535.1",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_6_8) AppleWebKit/535.2 (KHTML, like Gecko) Chrome/15.0.874.54 Safari/535.2",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_6_8) AppleWebKit/535.7 (KHTML, like Gecko) Chrome/16.0.912.36 Safari/535.7",
        "Mozilla/5.0 (Macintosh; U; Mac OS X Mach-O; en-US; rv:2.0a) Gecko/20040614 Firefox/3.0.0 ",
        "Mozilla/5.0 (Macintosh; U; PPC Mac OS X 10.5; en-US; rv:1.9.0.3) Gecko/2008092414 Firefox/3.0.3",
        "Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10.5; en-US; rv:1.9.1) Gecko/20090624 Firefox/3.5",
        "Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10.6; en-US; rv:1.9.2.14) Gecko/20110218 AlexaToolbar/alxf-2.0 Firefox/3.6.14",
    ]
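A quick way to sanity-check the User-Agent rotation outside of a crawl (a sketch that only assumes Scrapy is installed and the itjuzi package is on the import path):

from scrapy.http import Request
from itjuzi.middlewares import RotateUserAgentMiddleware

mw = RotateUserAgentMiddleware()
req = Request('https://www.itjuzi.com/company')
mw.process_request(req, spider=None)
print(req.headers.get('User-Agent'))  # one of the entries from user_agent_list, as bytes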
5. Edit the spider file demo.py [it was originally generated from the CrawlSpider class; change it to a spider based on the RedisCrawlSpider class]
[Note: the pages are parsed with the BeautifulSoup library]
# -*- coding: utf-8 -*-
import scrapy
from bs4 import BeautifulSoup
from scrapy.linkextractors import LinkExtractor
# from scrapy.spiders import CrawlSpider, Rule
from scrapy.spiders import Rule
from scrapy_redis.spiders import RedisCrawlSpider

from itjuzi.items import ItjuziItem


class DemoSpider(RedisCrawlSpider):
    name = 'demo'
    # the domain is no longer derived automatically; set it explicitly
    allowed_domains = ['www.itjuzi.com']
    # start_urls = ['http://www.itjuzi.com/']
    # the spider reads its start URLs from this Redis key instead
    redis_key = 'demospider:start_urls'

    page_link = LinkExtractor(allow=(r'/company\?page=\d+'))      # links to each listing page
    company_info_link = LinkExtractor(allow=(r'/company/\d+'))    # links to each company detail page
    rules = [
        Rule(page_link),
        Rule(company_info_link, callback='parse_item', follow=False),
    ]

    def parse_item(self, response):
        soup = BeautifulSoup(response.body, 'html.parser')

        # header section: //div[@class="infoheadrow-v2 ugc-block-item"]
        cpy1 = soup.find('div', class_='infoheadrow-v2')
        company_name = cpy1.find(class_='title').b.contents[0].strip().replace('\t', '').replace('\n', '')
        slogan = cpy1.find(class_='info-line').p.get_text()
        scope = cpy1.find(class_='scope c-gray-aset').find_all('a')[0].get_text().strip()

        # main body
        main = soup.find('div', class_='main')

        # investment records: //table[@class="list-round-v2 need2login"]
        # each row holds funding date, round, amount, investors
        tz = main.find('table', 'list-round-v2')
        tz_list = []
        all_tr = tz.find_all('tr')
        for tr in all_tr:
            tz_dict = {}
            all_td = tr.find_all('td')
            tz_dict['tz_time'] = all_td[0].span.get_text().strip()
            tz_dict['tz_round'] = all_td[1].get_text().strip()
            tz_dict['tz_finades'] = all_td[2].get_text().strip()
            tz_dict['tz_capital'] = all_td[3].get_text().strip().replace('\n', ',')
            tz_list.append(tz_dict)

        # team members: name, title, introduction
        tm = main.find('ul', class_='list-prodcase limited-itemnum')
        tm_list = []
        for li in tm.find_all('li'):
            tm_dict = {}
            tm_dict['tm_m_name'] = li.find('span', class_='c').get_text().strip()
            tm_dict['tm_m_title'] = li.find('span', class_='c-gray').get_text().strip()
            tm_dict['tm_m_intro'] = li.find('p', class_='mart10 person-des').get_text().strip()
            tm_list.append(tm_dict)

        item = ItjuziItem()
        item['info_id'] = response.url.split('/')[-1]
        item['company_name'] = company_name
        item['slogan'] = slogan
        item['scope'] = scope
        item['tz_info'] = tz_list
        item['tm_info'] = tm_list
        # the item is returned directly (not yielded), so pipelines.py can be left as the default
        return item
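settings.py registers 'itjuzi.pipelines.ItjuziPipeline', but the post leaves pipelines.py as the scaffolded default, which simply returns each item unchanged. If you want to persist the parsed companies locally, a minimal sketch could look like this (assuming Python 3; the itjuzi_items.jl output file is an illustrative choice, not part of the original project):

# itjuzi/pipelines.py
import json

class ItjuziPipeline(object):
    def open_spider(self, spider):
        # append each scraped company as one JSON line
        self.file = open('itjuzi_items.jl', 'a', encoding='utf-8')

    def process_item(self, item, spider):
        self.file.write(json.dumps(dict(item), ensure_ascii=False) + '\n')
        return item

    def close_spider(self, spider):
        self.file.close()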
6. Run:
(1) Slave side (junior classmate's computer): scrapy runspider demo.py
(2) Master side (my computer): redis-cli > lpush demospider:start_urls http://www.itjuzi.com/company
[At this point the slave machine starts crawling, and the Redis database on my machine fills with data!]
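To confirm the distributed setup from the master, the keys that scrapy_redis creates can be inspected with redis-py (a sketch; by default the scheduler names its keys after the spider, so with name = 'demo' you would expect demo:requests and demo:dupefilter):

# run on the master, assuming the redis Python package is installed
import redis

r = redis.StrictRedis(host='192.168.199.108', port=6379)
print(r.keys('*'))                      # expect keys such as demo:requests and demo:dupefilter
print(r.llen('demospider:start_urls'))  # drops back to 0 once the slave has consumed the pushed start URL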