代码方面,我们使用scrapy框架爬取酒店信息,经过测试发现,使用这种方法不会被封ip和cookie。
思路:
1.得到城市的编号
2.通过编号,进入酒店列表,并且得到酒店总数
3.计算酒店页数,构造得到网址
4.爬取相关的信息
结果截图:
爬取字段:
'id', '酒店名称', '地址', '评分', '入住总数', '类型', '简称', '纬度', '经度', '城市', 'ur'
items.py
import scrapy
class XiechengItem(scrapy.Item):
    """One hotel record produced by the spider.

    Fields are declared in the same order in which the CSV pipeline of
    this project exports them.
    """

    # Hotel identifier.
    id = scrapy.Field()
    # Hotel name.
    name = scrapy.Field()
    # Street address.
    address = scrapy.Field()
    # Rating score.
    score = scrapy.Field()
    # Exported under the CSV column '入住总数'.
    dpcount = scrapy.Field()
    # Exported under the CSV column '类型' (type).
    stardesc = scrapy.Field()
    # Short name of the hotel.
    shortName = scrapy.Field()
    # Latitude.
    lat = scrapy.Field()
    # Longitude.
    lon = scrapy.Field()
    # City the hotel belongs to.
    cityname = scrapy.Field()
    # NOTE(review): presumably the source URL of the record -- confirm
    # against the spider.
    ur = scrapy.Field()
pipelines.py
import csv
class xiechengPipeline:
    """Item pipeline that writes every scraped hotel to a CSV file.

    The file ``携程1.csv`` is opened for writing (truncating any earlier
    content) in the current working directory when the pipeline is
    instantiated, and the header row is written immediately. Each processed
    item then becomes one data row.
    """

    # Item keys in output-column order. Kept side by side with the header
    # labels so the two sequences cannot drift apart.
    _FIELDS = ('id', 'name', 'address', 'score', 'dpcount', 'stardesc',
               'shortName', 'lat', 'lon', 'cityname', 'ur')
    # Column labels, positionally matching _FIELDS ('纬度' is latitude,
    # '经度' is longitude).
    _HEADER = ['id', '酒店名称', '地址', '评分', '入住总数', '类型',
               '简称', '纬度', '经度', '城市', 'ur']

    def __init__(self):
        # newline='' leaves line-ending handling to the csv module.
        self.f = open("携程1.csv", "w", encoding='utf-8', newline='')
        self.writer = csv.writer(self.f)
        self.writer.writerow(self._HEADER)

    def process_item(self, item, spider):
        """Write *item* as one CSV row and return it unchanged.

        Args:
            item: mapping-like object holding every key in ``_FIELDS``.
            spider: the running spider (unused).

        Returns:
            The same ``item`` object, so later pipelines can use it.

        Raises:
            KeyError: if ``item`` lacks one of the expected fields.
        """
        self.writer.writerow([item[key] for key in self._FIELDS])
        return item

    def close_spider(self, spider):
        """Close the CSV file once the spider has finished."""
        self.f.close()
settings.py
# -*- coding: utf-8 -*-
# Scrapy settings for the xiecheng project.
#
# Only the most commonly used settings are listed here. The complete
# reference lives in the Scrapy documentation:
#
#   https://docs.scrapy.org/en/latest/topics/settings.html
#   https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
#   https://docs.scrapy.org/en/latest/topics/spider-middleware.html

BOT_NAME = "xiecheng"

SPIDER_MODULES = ["xiecheng.spiders"]
NEWSPIDER_MODULE = "xiecheng.spiders"

# Optional user-agent string identifying this crawler (left unset).
#USER_AGENT = 'xiecheng (+http://www.yourdomain.com)'

# robots.txt rules are not obeyed.
ROBOTSTXT_OBEY = False

# Maximum concurrent requests performed by Scrapy (default: 16).
#CONCURRENT_REQUESTS = 32

# Delay between requests to the same website (default: 0).
# See https://docs.scrapy.org/en/latest/topics/settings.html#download-delay
# and the autothrottle settings.
#DOWNLOAD_DELAY = 3
# The download delay honors only one of the following:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16

# Cookie handling is switched off (Scrapy enables it by default).
COOKIES_ENABLED = False

# Telnet console toggle (enabled by default).
#TELNETCONSOLE_ENABLED = False
# Request headers sent with every request unless a request overrides them.
#
# NOTE(review): the 'cookie' value is a browser session captured in June
# 2020 and appears to include login tickets (cticket, ticket_ctrip). Treat
# it as a credential: replace it with your own session and keep it out of
# version control.
# NOTE(review): COOKIES_ENABLED is False in this settings file, so this
# header is presumably sent verbatim -- confirm against the Scrapy docs.
DEFAULT_REQUEST_HEADERS = {
    'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
    'accept-language': 'zh-CN,zh;q=0.9',
    'cache-control': 'max-age=0',
    # One cookie pair per literal; adjacent literals are concatenated into a
    # single-line header value (a raw line break inside the value is invalid).
    'cookie': (
        '_abtest_userid=bfd4449b-a4d7-491f-87d7-bc028e90e9e9; '
        'magicid=IAe3ijFpJ8ucsWcD3sNFTUwhizZdbhzjq8xovved1fHBaLSQv4yIN4/TI76Mhhde; '
        'Session=SmartLinkCode=U155950&SmartLinkKeyWord=&SmartLinkQuary=&SmartLinkHost=&SmartLinkLanguage=zh; '
        'MKT_CKID=1592121385283.k37f2.5icu; '
        '_RF1=39.154.6.74; '
        '_RDG=2864b61a49eaec23151b294f55815a5555; '
        '_RSG=maKAUmvhsl0ojPlOA.hLj8; '
        '_RGUID=0bf6af9c-1e29-4930-8b55-afe298bd9b72; '
        '_ga=GA1.2.1663292872.1592121385; '
        '_gid=GA1.2.128767798.1592121385; '
        'MKT_Pagesource=PC; '
        'hoteluuid=A0zFC5pm95q5CHbd; '
        'cticket=1E4DBB9D588113D2C8D7684B1BD0CCDE4BFA05776009B9149D8A3A2323704A40; '
        'AHeadUserInfo=VipGrade=0&VipGradeName=%C6%D5%CD%A8%BB%E1%D4%B1&UserName=&NoReadMessageCount=0; '
        'ticket_ctrip=bJ9RlCHVwlu1ZjyusRi+ypZ7X2r4+yojXN5UTMe2Bf3oN/rnijNRTmoexMeW91Q+G5EEUZ/ecxzWpF5YkCp0zCKtMN8XU54xOtkvtDz4bGxjCoS4eMeiH7ywFZKIgkgTz//dJhr/RO7tvwAu181GfhcDnp+mSDw7FPJXm2eGWyDxSh2W8oS2921LHW6V3UsYafzhKeyxBiXvX6tj6VFxjClEIIvPLu7VmJcOpUqoVb+3Rcl5vkfOiflmljgEsupcC0eh2aI54XdgGgF0ZzCq2+HqgMGrHjBOEF9F/qbqLGs6orvXuOglpg==; '
        'DUID=u=7BF800DA570704733E38CE002C6EA43FF71F3B9C37186641CF0E12A395AFFBFA&v=0; '
        'IsNonUser=u=7BF800DA570704733E38CE002C6EA43FF71F3B9C37186641CF0E12A395AFFBFA&v=0; '
        'UUID=2D45F656B950415A839DC749A3C0AE01; '
        'IsPersonalizedLogin=F; '
        'MKT_CKID_LMT=1592220338801; '
        'HotelDomesticVisitedHotels1=1776800=0,0,4.7,1707,/200g0k000000c9nij0312.jpg,&17017586=0,0,4.6,775,/20071c000001djxuqDCAC.jpg,&16081014=0,0,4.8,2759,/200q1h000001hmku9CB40.jpg,&5247299=0,0,4.8,347,/200f1800000152f1h6381.jpg,; '
        'GUID=09031104211964295094; '
        'MKT_OrderClick=ASID=4897155950CMOTstCdheoCFc1fvAod0xoDSQ8128964947961556345160662196927&AID=4897&CSID=155950&OUID=fpz&CT=1592272594362&CURL=https%3A%2F%2Fwww.ctrip.com%2F%3Fsid%3D155950%26allianceid%3D4897%26ouid%3Dfpz%26keywordid%3D160662196927%26bd_vid%3D8128964947961556345%26ds_rl%3D1284915%26gclid%3DCMOTstCdheoCFc1fvAod0xoDSQ%26gclsrc%3Dds&VAL={"pc_vid":"1592121382532.42e6q3"}; '
        'HotelCityID=1split%E5%8C%97%E4%BA%ACsplitBeijingsplit2020-6-16split2020-06-17split0; '
        'ASP.NET_SessionId=biofjkzdba2kojmpm0uqwrfm; '
        'hoteluuidkeys=Hq4wUtW3qeq1E9QrtY6Ya5YG6EqYkYs7eF8EpBj7TWpY7Yt0jPMI0UYd8jtYaYpbjs7w5LwpPj1YBYSTR1GYg0vO9jbY6YF5vTfYtMyOXjkGvLAeFnYgbjHByAYgYpsvSbvpmYh8wd3jlTeL9i0Ow6Y7YOY3YLsvT5edgYnNiDsY7YzYSYgYFXE3XKQawODi4BRtSjOrMOYN9JmTyBrABYz9WpTv0Xx5Ne0aYkFxn3xbFYnTi4AwZpjQFEzMJmhW4NjDrMzJfOiQXw6mvzlRT1jk3YGkjmrU7yG6iAlwPsRFZENcjpgx4ZxskEFNESpEdBWUZea7wnqEhXjSseTkigTYgZrH0efnel1xpcifGiBsxhoWaBjfAepSwOpKDswkciU7RMAj14eNkEUtyQfvMci76EzOy3kvF5Kk3E3hKc9wDli5fRN0jarBaYp5JzqyDrApjzHeaTjbgKgtjTgw5Ox85xhlxSbxfoEOZEBLEl8WU3ed5wHoEpAjTDeoUi05YXgr4LELpy5pv4biTdEPTyb8vHbKXlWS8ENHjOzeoGxh9jprBOEf0Wb0ecBjH6Yd6j40x6Fx5gx14xPhE1TE8OE3LW65e1BwoGEcZjmOeP5igUYz4rmNedXeFTYtDENowP4WzGihhK3QEOnEA8E4ZWUAemXwBcEgNj7betNi5gYlhrl3eo5eGzEclYHcEBhwTlW5miZYDYZHY1Xi8Ai7miohjMYtYP8wB7EP7EBSJ19j9fyH8jZDjhY6YhFRUqJFkvSQYLgYG1y7NYlswf9WmZRdpygtjSNWFYhYP3YG1WqprbsytUEd7JPkEkbRUBvDBJlLJ65JFBE0mv8zJt8ygYcY9PR5XJBkv3cY41RlSYGnRMmiaFj6aYUpJzPiT4YhYtYX6jMXwHfvBS; '
        'OID_ForOnlineHotel=159212138253242e6q31592272602112102032; '
        'appFloatCnt=11; '
        '_bfa=1.1592121382532.42e6q3.1.1592272591647.1592279932892.9.75.228032; '
        '_bfs=1.2; '
        'Union=OUID=fpz&AllianceID=4897&SID=155950&SourceID=&createtime=1592279940&Expires=1592884739964; '
        '_jzqco=%7C%7C%7C%7C1592220344063%7C1.443668655.1592121385276.1592279935860.1592279939995.1592279935860.1592279939995.undefined.0.0.31.31; '
        '__zpspc=9.9.1592279935.1592279940.2%232%7Cwww.baidu.com%7C%7C%7C%25E6%2590%25BA%25E7%25A8%258B%7C%23; '
        '_gcl_dc=GCL.1592279940.CMOTstCdheoCFc1fvAod0xoDSQ; '
        '_bfi=p1%3D100101991%26p2%3D100101991%26v1%3D75%26v2%3D74'
    ),
    'sec-fetch-dest': 'document',
    'sec-fetch-mode': 'navigate',
    'sec-fetch-site': 'none',
    # Browsers send the structured-header boolean '?1', not a bare '1'.
    'sec-fetch-user': '?1',
    'upgrade-insecure-requests': '1',
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.106 Safari/537.36',
}
# Enable or disable spider middlewares
# See https://docs.scrapy.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
# 'xiecheng.middlewares.XiechengSpiderMiddleware': 543,
#}
# Enable or disable downloader middlewares
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
#DOWNLOADER_MIDDLEWARES = {
# 'xiecheng.middlewares.SeleniumMiddleware': 543,
#}
# Enable or disable extensions
# See https://docs.scrapy.org/en/latest/topics/extensions.html
#EXTENSIONS = {
# 'scrapy.extensions.telnet.TelnetConsole': None,
#}
# Item pipelines enabled for this project, keyed by import path; the
# integer is the pipeline's order value.
# Docs: https://docs.scrapy.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {"xiecheng.pipelines.xiechengPipeline": 300}
# Enable and configure the AutoThrottle extension (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False
# Enable and configure HTTP caching (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
# Browser-like header set without a cookie entry.
# NOTE(review): HEADERS is a project-specific name; presumably it is read
# by the spider -- confirm where it is consumed.
HEADERS = {
    "accept": (
        "text/html,application/xhtml+xml,application/xml;q=0.9,"
        "image/webp,image/apng,*/*;q=0.8,"
        "application/signed-exchange;v=b3;q=0.9"
    ),
    "accept-language": "zh-CN,zh;q=0.9",
    "accept-encoding": "gzip, deflate, br",
    "cache-control": "max-age=0",
    "sec-fetch-dest": "document",
    "sec-fetch-mode": "navigate",
    "sec-fetch-site": "none",
    "sec-fetch-user": "?1",
    "upgrade-insecure-requests": "1",
    "user-agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/83.0.4103.106 Safari/537.36"
    ),
}
需要源码的朋友请在公众号中
**回复:携程源码**