【爬虫】2.6 实践项目——爬取天气预报数据

1. 项目简介


在中国天气网(http://www.weather.com.cn)中输入一个城市的名称,例如输入深圳,那么会转到地址http://www.weather.com.cn/weather1d/101280601.shtml的网页显示深圳的天气预报,其中101280601是深圳的代码,每个城市或者地区都有一个代码。如下图:

【爬虫】2.6 实践项目——爬取天气预报数据_第1张图片
【爬虫】2.6 实践项目——爬取天气预报数据_第2张图片

在上图中可以看到,深圳今天,7天,8-15天等的天气数据,这里爬取7天的天气预报数据。

2. HTML 代码分析


分析这段代码:

【爬虫】2.6 实践项目——爬取天气预报数据_第3张图片

7天的天气预报实际上在一个 <ul class="t clearfix"> 元素中,每天是一个 <li> 元素,7天的结构差不多是一样的(注意:今天可能只有一个温度,而不是最高温度与最低温度两个值)

    3. 爬取天气预报数据


    from bs4 import BeautifulSoup
    from bs4.dammit import UnicodeDammit  # BS内置库,用于推测文档编码
    import urllib.request  # 发起请求,获取响应
    
    # 7-day forecast page for Shenzhen (city code 101280601).
    url = "http://www.weather.com.cn/weather/101280601.shtml"

    try:
        headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
                          "(KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36 Edg/109.0.1518.78"}
        req = urllib.request.Request(url, headers=headers)  # request with a browser-like User-Agent
        # The context manager closes the HTTP response, and the timeout keeps a
        # stalled server from hanging the script forever.
        with urllib.request.urlopen(req, timeout=10) as resp:
            data = resp.read()  # raw response body (bytes)
        dammit = UnicodeDammit(data, ["utf-8", "gbk"])  # candidate encodings to try
        data = dammit.unicode_markup  # decoded text
        soup = BeautifulSoup(data, "lxml")
        # Each <li> under <ul class="t clearfix"> holds one day of the forecast.
        lis = soup.select("ul[class='t clearfix'] li")
        for li in lis:
            try:
                date = li.select('h1')[0].text
                weather = li.select('p[class="wea"]')[0].text
                low = li.select('p[class="tem"] i')[0].text
                # The high temperature (<span>) may be missing, e.g. for today.
                # Check for it per row instead of assuming that only the first
                # row lacks it, so a published high is never dropped.
                highs = li.select('p[class="tem"] span')
                temp = highs[0].text + "/" + low if highs else low
                print(date, weather, temp)
                # Sample output:
                # 22日(今天) 晴 14℃
                # 23日(明天) 晴 23℃/14℃
                # 24日(后天) 晴转多云 25℃/13℃
                # 25日(周六) 多云 21℃/13℃
                # 26日(周日) 多云转晴 22℃/12℃
                # 27日(周一) 晴 21℃/12℃
                # 28日(周二) 晴 24℃/14℃
            except Exception as err:
                # Best effort: report the malformed row and continue with the next day.
                print(err)
    except Exception as err:
        # Top-level boundary of the script: report network/parse failures.
        print(err)
    

    4. 爬取与存储天气预报数据


    获取北京、上海、广州、深圳等城市的代码,爬取这些城市的天气预报数据,并存储到 SQLite 数据库 weathers.db 中,存储的数据表 weathers 是:

    create table weathers (wCity varchar(16),wDate varchar(16),wWeather varchar(64),wTemp varchar(32),constraint pk_weather primary key (wCity,wDate))

    编写程序依次爬取各个城市的天气预报数据存储在数据库中,程序如下:

    from bs4 import BeautifulSoup
    from bs4.dammit import UnicodeDammit
    import urllib.request
    import sqlite3
    
    
    # 天气数据库
    class WeatherDB:
        """Small wrapper around the SQLite file ``weathers.db``.

        The single table ``weathers`` stores one row per (city, date) pair;
        that pair is the primary key, so duplicates are rejected.
        """

        def __init__(self):
            # Both stay None until openDB() is called.
            self.cursor = None
            self.con = None

        def openDB(self) -> None:
            """Open ``weathers.db``, ensure the table exists and empty it.

            Raises:
                sqlite3.Error: if the database cannot be opened or prepared.
            """
            self.con = sqlite3.connect("weathers.db")
            self.cursor = self.con.cursor()
            # "if not exists" replaces the previous catch-all except that used
            # the "table already exists" error as control flow (and printed it
            # on every rerun while hiding genuine failures).
            self.cursor.execute(
                "create table if not exists weathers (wCity varchar(16),"
                "wDate varchar(16),"
                "wWeather varchar(64),"
                "wTemp varchar(32),"
                "constraint pk_weather primary key (wCity,wDate))")
            # Every run starts from an empty table.
            self.cursor.execute("delete from weathers")

        def closeDB(self) -> None:
            """Commit pending changes and close the connection.

            Does nothing when the database is not open, so calling it twice
            (or without openDB()) is harmless.
            """
            if self.con is None:
                return
            self.con.commit()
            self.con.close()
            self.con = None
            self.cursor = None

        def insert(self, city, date, weather, temp) -> None:
            """Insert one forecast row; database errors are printed, not raised."""
            if self.cursor is None:
                print("database is not open")
                return
            try:
                self.cursor.execute("insert into weathers (wCity,wDate,wWeather,wTemp) values (?,?,?,?)",
                                    (city, date, weather, temp))
            except sqlite3.Error as err:
                # Best effort, e.g. a duplicate (city, date) key: report and go on.
                print(err)

        def show(self) -> None:
            """Print every stored row as a fixed-width table."""
            self.cursor.execute("select * from weathers")
            rows = self.cursor.fetchall()
            print("%-16s%-16s%-32s%-16s" % ("city", "date", "weather", "temp"))
            for row in rows:
                print("%-16s%-16s%-32s%-16s" % (row[0], row[1], row[2], row[3]))
    
    
    # 天气预报
    class WeatherForecast:
        """Crawl 7-day forecasts from weather.com.cn and store them via WeatherDB."""

        def __init__(self):
            self.headers = {
                "User-Agent": "Mozilla/5.0 (Windows; U; Windows NT 6.0 x64; en-US; rv:1.9pre) "
                              "Gecko/2008072421 Minefield/3.0.2pre"}
            # City name -> city code used in the forecast page URL.
            self.cityCode = {"北京": "101010100", "上海": "101020100", "广州": "101280101", "深圳": "101280601"}
            # Set by process(); while it is None, forecastCity() only prints.
            self.db = None

        def forecastCity(self, city):
            """Fetch, print and (when a database is attached) store one city's forecast.

            Unknown cities and network/parse errors are reported with print();
            nothing is raised and nothing is returned.
            """
            if city not in self.cityCode:
                print(city + " 找不到代码")
                return
            url = "http://www.weather.com.cn/weather/" + self.cityCode[city] + ".shtml"
            try:
                req = urllib.request.Request(url, headers=self.headers)
                # Close the response deterministically and bound the wait time.
                with urllib.request.urlopen(req, timeout=10) as resp:
                    data = resp.read()
                dammit = UnicodeDammit(data, ["utf-8", "gbk"])  # candidate encodings to try
                data = dammit.unicode_markup
                soup = BeautifulSoup(data, "lxml")
                # Each <li> under <ul class="t clearfix"> holds one day of the forecast.
                lis = soup.select("ul[class='t clearfix'] li")
                for li in lis:
                    try:
                        date = li.select('h1')[0].text
                        weather = li.select('p[class="wea"]')[0].text
                        low = li.select('p[class="tem"] i')[0].text
                        # The high temperature (<span>) may be missing, e.g. for
                        # today. Check per row instead of assuming that only the
                        # first row lacks it.
                        highs = li.select('p[class="tem"] span')
                        temp = highs[0].text + "/" + low if highs else low
                        print(city, date, weather, temp)
                        if self.db is not None:
                            self.db.insert(city, date, weather, temp)
                    except Exception as err:
                        # Best effort: report the malformed row and continue.
                        print(err)
            except Exception as err:
                # Boundary for one city: a failed download must not stop the others.
                print(err)

        def process(self, cities):
            """Crawl every city in ``cities`` into a freshly emptied database."""
            self.db = WeatherDB()
            self.db.openDB()
            try:
                for city in cities:
                    self.forecastCity(city)
            finally:
                # Commit and close even if the loop is interrupted.
                self.db.closeDB()
    
    
    # Script entry: crawl the four configured cities, then report completion.
    ws = WeatherForecast()
    ws.process([
        "北京",
        "上海",
        "广州",
        "深圳",
    ])
    print("completed")

    程序执行结果如下:

    【爬虫】2.6 实践项目——爬取天气预报数据_第4张图片

你可能感兴趣的:(python,爬取天气预报数据)