苍穹之下的PM2.5数据采集——scrapy.defaultspider

本文以最简单的形式使用Scrapy,抓取PM2.5数据,然后将数据存储进MongoDB或TXT文件
下面是非常简单的代码,试着自己写写,如果写不出,建议还需要回去扎实自己的基本功

spider code

# -*- coding: utf-8 -*-
import scrapy
from pm25.items import Pm25Item
import re

class InfospSpider(scrapy.Spider):
    """Spider that scrapes the daily PM2.5 city ranking from pm25.com."""
    name = "infosp"
    allowed_domains = ["pm25.com"]
    start_urls = ['http://www.pm25.com/rank/1day.html', ]
    # Per-spider pipeline switch: enable/disable a pipeline by commenting
    # its entry in or out.
    custom_settings = {
        'ITEM_PIPELINES': {
            'pm25.pipelines.MongodbPipeline': 30,
            # 'pm25.pipelines.TxtPipeline': 50,
        }
    }

    # Compiled once at class level; raw string avoids the invalid-escape
    # DeprecationWarning for "\d".
    _re_time = re.compile(r"\d+-\d+-\d+")

    def parse(self, response):
        """Parse the ranking page and yield one Pm25Item per city row."""
        # Page header span containing the ranking date, e.g. "2015-03-01".
        date_text = response.xpath(
            "/html/body/div[4]/div/div/div[2]/span").extract()[0]
        date = self._re_time.findall(date_text)[0]

        # Each <li> under this <ul> is one city row of the ranking table.
        rows = response.xpath("/html/body/div[5]/div/div[3]/ul[2]/li")
        for row in rows:
            try:
                rank = row.xpath("span[1]/text()").extract()[0]
                quality = row.xpath("span/em/text()").extract()[0]
                city = row.xpath("a/text()").extract()[0]
                province = row.xpath("span[3]/text()").extract()[0]
                aqi = row.xpath("span[4]/text()").extract()[0]
                pm25 = row.xpath("span[5]/text()").extract()[0]
            except IndexError:
                # Malformed row: skip it. The original printed names that
                # may be unbound (NameError) and then yielded stale values.
                continue

            # A fresh item per row — reusing one mutable item across
            # iterations lets later rows overwrite earlier ones before the
            # pipelines see them.
            item = Pm25Item()
            item['date'] = date
            item['rank'] = rank
            item['quality'] = quality
            # NOTE(review): the original crossed these (city->province,
            # province->city). The <a> element holds the city name, so the
            # straight mapping below looks correct — confirm against the
            # live page before relying on stored data.
            item['province'] = province
            item['city'] = city
            item['aqi'] = aqi
            item['pm25'] = pm25
            yield item

items code

# -*- coding: utf-8 -*-
import scrapy
class Pm25Item(scrapy.Item):
    """Container for one row of the pm25.com daily city ranking."""
    date = scrapy.Field()      # ranking date, "YYYY-MM-DD"
    rank = scrapy.Field()      # position in the ranking
    quality = scrapy.Field()   # air-quality grade text
    province = scrapy.Field()  # province the city belongs to
    city = scrapy.Field()      # city name
    aqi = scrapy.Field()       # air quality index value
    pm25 = scrapy.Field()      # PM2.5 concentration
    # (redundant trailing `pass` removed — the class body is non-empty)

pipelines code

import time
from scrapy.conf import settings
import pymongo

class TxtPipeline(object):
    """Append each item as one comma-separated line to a per-day txt file."""

    # Output column order of the CSV-style line.
    FIELDS = ("date", "rank", "quality", "province", "city", "aqi", "pm25")

    def process_item(self, item, spider):
        """Write *item* to <yymmdd>.txt in the working directory.

        Returns the item unchanged so later pipelines still receive it.
        """
        fname = time.strftime("%y%m%d", time.localtime()) + ".txt"
        line = ",".join(item[key] for key in self.FIELDS)
        # The with-block closes the file; the original also called
        # f.close() redundantly inside it.
        with open(fname, "a", encoding="utf-8") as f:
            f.write(line + "\n")
        return item

class MongodbPipeline(object):
    """Insert each scraped item as a document into a MongoDB collection.

    Connection parameters (MONGODB_SERVER / MONGODB_PORT / MONGODB_DB /
    MONGODB_COLLECTION) are read from the project settings.
    """

    def __init__(self):
        # NOTE(review): `from scrapy.conf import settings` is deprecated and
        # removed in modern Scrapy — migrate to a `from_crawler()`
        # classmethod receiving `crawler.settings` when upgrading.
        client = pymongo.MongoClient(settings["MONGODB_SERVER"],
                                     settings["MONGODB_PORT"])
        db = client[settings["MONGODB_DB"]]
        self.coll = db[settings["MONGODB_COLLECTION"]]

    def process_item(self, item, spider):
        """Store *item* and return it for any later pipelines."""
        # Collection.insert() was removed in PyMongo 4; insert_one() is the
        # supported single-document API.
        self.coll.insert_one(dict(item))
        return item
苍穹之下的PM2.5数据采集——scrapy.defultspider_第1张图片

你可能感兴趣的:(苍穹之下的PM2.5数据采集——scrapy.defultspider)