【Today's Tip】
vi navigation
vi l.py +5    jump straight to line 5, e.g. the line an error traceback points at
vi l.py +     jump straight to the last line
1. Project initialization
2. Extracting the data
   2.1 How it works
   2.2 Data extraction
   2.3 Custom spider
3. Storing the data
   3.1 Modify settings.py
   3.2 Data storage
4. Results
5. Author's note
Create the project
scrapy startproject weather
Create the Spider
scrapy genspider CqtianqiSpider tianqi.com
'''
The name CqtianqiSpider would later have to be used as scrapy crawl CqtianqiSpider.
Since that is rather long, change the spider's name attribute to CQtianqi,
so the crawl command becomes: scrapy crawl CQtianqi
'''
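After editing the name attribute, you can confirm that Scrapy now registers the spider as CQtianqi by listing the project's spiders from the project directory:
scrapy list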
This time the goal is to extract the 7-day weather forecast for Chongqing and Yanhu District. The relevant page source is shown in the screenshot above, which marks the parts of the page this crawler needs to locate.
Next, define the fields to be stored:
date = the date
week = day of the week
img = weather icon for the day
wind = wind conditions for the day
weather = weather for the day
high_temperature = daily high temperature
low_temperature = daily low temperature
Modify items.py
import scrapy


class WeatherItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    collection = 'weather'

    date = scrapy.Field()
    week = scrapy.Field()
    img = scrapy.Field()
    high_temperature = scrapy.Field()
    low_temperature = scrapy.Field()
    weather = scrapy.Field()
    wind = scrapy.Field()
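Note that collection is a plain class attribute rather than a scrapy.Field, so it never becomes part of the item's data; MongoPipeline later reads it via item.collection to choose the MongoDB collection. A quick sketch of the difference (the date value is just a made-up example):

item = WeatherItem()
item['date'] = ['2018-08-21']   # Field values use dict-style access and end up in dict(item)
print(item.collection)          # plain class attribute: 'weather'
print(dict(item))               # {'date': ['2018-08-21']}; collection is not included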
CQtianqi.py
# -*- coding: utf-8 -*-
import scrapy
from weather.items import WeatherItem


class CqtianqiSpider(scrapy.Spider):
    name = 'CQtianqi'
    allowed_domains = ['tianqi.com']
    start_urls = []
    citys = ['chongqing', 'yanhuqu']
    for city in citys:
        start_urls.append('http://' + 'www.tianqi.com/' + city + '/')

    def parse(self, response):
        '''
        date = the date
        week = day of the week
        img = weather icon for the day
        wind = wind conditions for the day
        weather = weather for the day
        high_temperature = daily high temperature
        low_temperature = daily low temperature
        :param response:
        :return:
        '''
        # oneweek = response.xpath('//div[@class="day7"]')
        item = WeatherItem()
        date = response.xpath('//div[@class="day7"]//ul[@class="week"]//li//b/text()').extract()
        week = response.xpath('//div[@class="day7"]//ul[@class="week"]//li//span/text()').extract()
        base_url = 'http:'
        img = response.xpath('//div[@class="day7"]//ul[@class="week"]//li//img/@src').extract()
        # the img src values are presumably protocol-relative ('//...'), hence the 'http:' prefix
        imgs = []
        for i in range(7):
            img_i = img[i]
            img_url = base_url + img_i
            imgs.append(img_url)
        print(date)
        print(week)
        print(imgs)
        weather = response.xpath('//div[@class="day7"]//ul[@class="txt txt2"]//li/text()').extract()
        print(weather)
        high_temperature = response.xpath('//div[@class="day7"]//div[@class="zxt_shuju"]/ul//li/span/text()').extract()
        low_temperature = response.xpath('//div[@class="day7"]//div[@class="zxt_shuju"]/ul//li/b/text()').extract()
        print(high_temperature)
        print(low_temperature)
        wind = response.xpath('//div[@class="day7"]//ul[@class="txt"][1]//li/text()').extract()
        print(wind)
        item['date'] = date
        item['week'] = week
        item['img'] = imgs
        item['weather'] = weather
        item['wind'] = wind
        item['high_temperature'] = high_temperature
        item['low_temperature'] = low_temperature
        yield item
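Before running the full spider, the XPath expressions above can be sanity-checked interactively with scrapy shell; the exact return values depend on the live page, but each query should come back as a list of strings:

scrapy shell 'http://www.tianqi.com/chongqing/'
>>> response.xpath('//div[@class="day7"]//ul[@class="week"]//li//b/text()').extract()
>>> response.xpath('//div[@class="day7"]//ul[@class="week"]//li//img/@src').extract()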
Modify settings.py
# Add these two lines
MONGO_URI = 'localhost'
MONGO_DB = 'test'
# Modify the following entries
ITEM_PIPELINES = {
    'weather.pipelines.WeatherPipeline': 300,
    'weather.pipelines.W2json': 301,
    'weather.pipelines.MongoPipeline': 302,
    'weather.pipelines.W2mysql': 303,
}
ROBOTSTXT_OBEY = False
USER_AGENT = 'Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Mobile Safari/537.36'
Modify pipelines.py
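The snippets below assume that pipelines.py begins with the imports they rely on, plus a pathdir base path used by the txt and json pipelines. The original post does not show this header, so the pathdir definition here is an assumption; any base directory containing a data folder will do:

import json
import os

import pymongo
import pymysql

# assumption: use the directory containing pipelines.py as the base path;
# the data subfolder must exist for the txt/json pipelines below
pathdir = os.path.dirname(os.path.abspath(__file__))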
Store to MongoDB
class MongoPipeline(object):
    def __init__(self, mongo_uri, mongo_db):
        self.mongo_uri = mongo_uri
        self.mongo_db = mongo_db

    @classmethod
    def from_crawler(cls, crawler):
        # read MONGO_URI / MONGO_DB from settings.py
        return cls(
            mongo_uri=crawler.settings.get('MONGO_URI'),
            mongo_db=crawler.settings.get('MONGO_DB')
        )

    def open_spider(self, spider):
        self.client = pymongo.MongoClient(self.mongo_uri)
        self.db = self.client[self.mongo_db]

    def process_item(self, item, spider):
        # item.collection is the class attribute defined in WeatherItem ('weather');
        # insert_one replaces the deprecated insert()
        self.db[item.collection].insert_one(dict(item))
        return item

    def close_spider(self, spider):
        self.client.close()
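Once a crawl has finished, the inserted documents can be checked directly with pymongo; a minimal sketch, assuming the MONGO_URI/MONGO_DB values from settings.py and the weather collection name from WeatherItem:

import pymongo

client = pymongo.MongoClient('localhost')
db = client['test']
for doc in db['weather'].find():
    # one document per city, each field holding a list of 7 values
    print(doc['date'], doc['high_temperature'], doc['low_temperature'])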
Store to MySQL
class W2mysql(object):
    def process_item(self, item, spider):
        '''
        Save the scraped data to MySQL
        '''
        connection = pymysql.connect(host='localhost', user='root', password='xxx', db='scrapydb',
                                     charset='utf8mb4')
        try:
            with connection.cursor() as cursor:
                # one row per forecast day
                for i in range(7):
                    sql = "insert into `weather`(`date`,`week`,`high_temperature`,`low_temperature`,`weather`,`wind`,`img`)values(%s,%s,%s,%s,%s,%s,%s)"
                    cursor.execute(sql, (
                        item['date'][i], item['week'][i], item['high_temperature'][i], item['low_temperature'][i],
                        item['weather'][i],
                        item['wind'][i], item['img'][i]))
            connection.commit()
        # except pymysql.err.IntegrityError as e:
        #     print('duplicate data, do not insert it again!')
        finally:
            connection.close()
        return item
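W2mysql assumes a weather table already exists in the scrapydb database. The post does not show its schema, so the following create-table sketch is an assumption, with one text column per inserted field:

import pymysql

connection = pymysql.connect(host='localhost', user='root', password='xxx', db='scrapydb',
                             charset='utf8mb4')
try:
    with connection.cursor() as cursor:
        cursor.execute("""
            CREATE TABLE IF NOT EXISTS `weather` (
                `date` VARCHAR(32),
                `week` VARCHAR(32),
                `high_temperature` VARCHAR(32),
                `low_temperature` VARCHAR(32),
                `weather` VARCHAR(64),
                `wind` VARCHAR(64),
                `img` VARCHAR(255)
            ) DEFAULT CHARSET=utf8mb4
        """)
    connection.commit()
finally:
    connection.close()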
Store to txt
class WeatherPipeline(object):
    def process_item(self, item, spider):
        # store the data in weather.txt under the data directory
        filename = pathdir + '\\data\\weather.txt'
        # open the file in append mode and write out each field
        with open(filename, 'a', encoding='utf8') as f:
            for i in range(7):
                f.write('Date: ' + item['date'][i] + '\n')
                f.write('Week: ' + item['week'][i] + '\n')
                f.write('High temperature: ' + item['high_temperature'][i] + '\n')
                f.write('Low temperature: ' + item['low_temperature'][i] + '\n')
                f.write('Weather: ' + item['weather'][i] + '\n')
                f.write('Wind: ' + item['wind'][i] + '\n')
                f.write('-------------------------------------' + '\n')
        return item
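The '\\data\\weather.txt' path above is Windows-specific and assumes the data folder already exists. A more portable variant, purely as a sketch, builds the path with os.path.join and creates the directory on demand:

import os

datadir = os.path.join(pathdir, 'data')
os.makedirs(datadir, exist_ok=True)              # create data/ if it does not exist yet
filename = os.path.join(datadir, 'weather.txt')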
Store to json
class W2json(object):
    def process_item(self, item, spider):
        '''
        Save the scraped data to a json file
        so it can easily be reused later
        '''
        filename = pathdir + '\\data\\weather.json'
        # open the json file and append the item as one json.dumps line;
        # ensure_ascii=False is required, otherwise non-ASCII text is stored
        # as escape sequences such as "\xe15"
        with open(filename, 'a', encoding='utf8') as f:
            line = json.dumps(dict(item), ensure_ascii=False) + '\n'
            f.write(line)
        return item
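Since each item is dumped as one JSON object per line, the file can be read back line by line later; a small sketch:

import json

with open(pathdir + '\\data\\weather.json', encoding='utf8') as f:
    for line in f:
        record = json.loads(line)
        # record['date'], record['weather'], ... are the 7-day lists for one city
        print(record['date'])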
Run
Run the command from the weather project root, not from the inner weather package directory!
scrapy crawl CQtianqi
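As an aside, Scrapy can also dump the yielded items to a file on its own through its feed export, independently of the custom pipelines; the file name here is just an example:
scrapy crawl CQtianqi -o weather_feed.json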
Data stored to txt
Only part of the data is shown here; in the actual file everything appears twice, once per city.
Data stored to json
This is not duplication; the file stores the data for the two regions.
Data stored to MongoDB
This is not duplication; the collection stores the data for the two regions.
Data stored to MySQL
This is not duplication; the table stores the data for the two regions.
Terminal output
Finally, if you find this public account helpful, please support it and share it. Thank you! For more content, follow the crawler series on this account! Click "Read the original article" to view the source code, and don't forget to give it a star!
Only today did I realize that all my wandering has been a way of drawing closer to you.
-- The Bridges of Madison County (《廊桥遗梦》)