You can refer to walkthroughs shared by other users online. Note that since we are saving the data into a database here, you also need a working MySQL environment and the third-party library pymysql installed for Python.
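As a quick sanity check that the environment is ready, a short script like this can confirm that pymysql reaches the local MySQL server (the credentials and the database name stumis are placeholders, matching the ones used later in the pipeline):

import pymysql

# Open a connection, ask MySQL for its version, and close again
db = pymysql.connect(host='localhost', user='root', password='your_password',
                     database='stumis', charset='utf8mb4')
with db.cursor() as cur:
    cur.execute("SELECT VERSION()")
    print("MySQL version:", cur.fetchone()[0])
db.close()

With the environment in place, the starting point is the spider skeleton that Scrapy generates: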
import scrapy


class QunarSpider(scrapy.Spider):
    name = "Qunar"
    allowed_domains = ["travel.qunar.com"]
    start_urls = ["http://travel.qunar.com/"]

    def parse(self, response):
        pass
Open items.py for editing and define the data fields we want to scrape:
# Define here the models for your scraped items
#
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/items.html

import scrapy


class QunarItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    Title = scrapy.Field()        # travelogue title
    TravelLink = scrapy.Field()   # link to the travelogue
    Date = scrapy.Field()         # trip date
    Days = scrapy.Field()         # number of days
    Photo_Nums = scrapy.Field()   # number of photos
    Free = scrapy.Field()         # cost per person
    People = scrapy.Field()       # number of people
    Places = scrapy.Field()       # places visited
    Views = scrapy.Field()        # view count
    Love = scrapy.Field()         # like count
    Trip = scrapy.Field()         # itinerary
Next, edit the main spider file. The code uses try/except here because the entries on the page are not all structured identically: some fields cannot be located and would raise an error, so incomplete entries are simply skipped.
import scrapy
from qunar.items import QunarItem


class QunarSpider(scrapy.Spider):
    name = "Qunar"
    allowed_domains = ["travel.qunar.com"]
    start_urls = ["http://travel.qunar.com/"]

    def start_requests(self):
        # Build the URL of every listing page we want to crawl
        for i in range(1, 101):
            url = 'https://travel.qunar.com/travelbook/list.htm?page={}&order=hot_heat'.format(i)
            print("Crawling:", url)
            yield scrapy.Request(url=url, callback=self.parse)

    def parse(self, response):
        # Locate every travelogue entry on the current listing page
        # print(response.status)
        text = response.xpath('//li[@class="list_item "]')
        # print("Entries found:", text)
        for list_item in text:
            # A fresh item for each entry
            item = QunarItem()
            try:
                item['Title'] = list_item.xpath('./h2[@class="tit"]/a/text()').extract()[0]
                item['TravelLink'] = list_item.xpath('./h2[@class="tit"]/a/@href').extract()[0]
                item['Places'] = list_item.xpath('.//p[@class="places"]/text()').extract()[0]
                item['Date'] = list_item.xpath('./p[@class="user_info"]/span[@class="intro"]/span[@class="date"]/text()').extract()[0]
                item['Days'] = list_item.xpath('./p[@class="user_info"]/span[@class="intro"]/span[@class="days"]/text()').extract()[0]
                item['Photo_Nums'] = list_item.xpath('./p[@class="user_info"]/span[@class="intro"]/span[@class="photo_nums"]/text()').extract()[0]
                item['Free'] = list_item.xpath('./p[@class="user_info"]/span[@class="intro"]/span[@class="fee"]/text()').extract()[0]
                item['People'] = list_item.xpath('./p[@class="user_info"]/span[@class="intro"]/span[@class="people"]/text()').extract()[0]
                item['Views'] = list_item.xpath('./p[@class="user_info"]/span[@class="nums"]/span[@class="icon_view"]/span/text()').extract()[0]
                item['Trip'] = list_item.xpath('./p[@class="user_info"]/span[@class="intro"]/span[@class="trip"]/text()').extract()[0]
                item['Love'] = list_item.xpath('./p[@class="user_info"]/span[@class="nums"]/span[@class="icon_love"]/span/text()').extract()[0]
            except IndexError:
                # A field is missing on this entry; skip the incomplete data
                print("Incomplete entry, skipping")
                continue
            yield item
To save the items the spider returns into the database, we enable the item pipeline by adding it to the ITEM_PIPELINES setting in settings.py:
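A minimal sketch of that setting, assuming the project module is named qunar (as the import path qunar.items above suggests); the priority value 300 is just an illustrative choice:

ITEM_PIPELINES = {
    "qunar.pipelines.QunarMysqlPipeline": 300,
}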
The smaller the priority number, the earlier that pipeline runs. Here I create the data table directly from Python, write the rows into it, and commit them; remember to fill in your own database connection password. The code is as follows:
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html

# useful for handling different item types with a single interface
from itemadapter import ItemAdapter
import pymysql


class QunarPipeline:
    def process_item(self, item, spider):
        return item


class QunarMysqlPipeline:
    def process_item(self, item, spider):
        # Insert one row per item and commit right away
        sql_insert = "insert into qunar values(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)"
        self.cur.execute(sql_insert, [item['Title'], item['TravelLink'], item['Date'], item['Days'],
                                      item['Photo_Nums'], item['Free'], item['People'], item['Places'],
                                      item['Views'], item['Trip']])
        self.db.commit()
        print("++++++++")  # simple progress marker
        return item

    def open_spider(self, spider):
        # Connect to MySQL (fill in your own password) and create the target table;
        # "if not exists" keeps a second run from failing on the create statement
        self.db = pymysql.connect(host='localhost', user='root', password='your_password',
                                  database='stumis', charset='utf8mb4')
        self.cur = self.db.cursor()
        create_table = ("create table if not exists qunar(标题 varchar(255),旅游链接 varchar(255),"
                        "日期 varchar(255),天数 varchar(255),照片数量 varchar(255),人均消费 varchar(255),"
                        "人数 varchar(255),地点 varchar(255),阅览人数 varchar(255),Trip varchar(255))")
        self.cur.execute(create_table)

    def close_spider(self, spider):
        # Close the cursor before the connection
        self.cur.close()
        self.db.close()
        print("Done saving")
Note that in settings.py we also need to change ROBOTSTXT_OBEY from its default of True to False, and then add a headers setting with a user-agent:
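A sketch of the corresponding settings.py lines; the User-Agent string below is only an example and can be swapped for your own browser's:

# settings.py
ROBOTSTXT_OBEY = False

DEFAULT_REQUEST_HEADERS = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
                  "(KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
}

With these settings in place, the crawl can be started from the project directory with scrapy crawl Qunar.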
A few small bugs may come up during the crawl; we can work through them together.