# Python实现空气质量指数爬取,输出CSV文件

"""
空气质量指数计算方法

定量描述空气质量状况的无量纲指数

版本7.0
智能爬取网页数据
获取所有城市的AQI

bs = BeautifulSoup(
url,
html_parser,      指定解析器
encoding          指定编码格式(确保和网页编码一致)
)

bs.find_all('a')   按照类型查找节点

按照属性查找节点
bs.find_all('a', href='a.html')
bs.find_all('a', href='a.html', string='next page')
bs.find_all('a', class_='a_link')    注意是class_,需要带下划线,因为class是Python关键字
bs.find_all('a', {'class': 'a_link'})    或者这样的写法:用字典 {'属性名': '属性值'}

"""
import requests
import bs4   # 导入beautifulsoup4
from bs4 import BeautifulSoup

def get_city_pm(city_pinyin):
    """Fetch the PM2.5 readings listed on a city's page and their sum.

    The page at ``http://www.pm25x.com/<city_pinyin>`` is downloaded and all
    of its ``<td>`` cells are collected. The cells at positions 3, 8, 13,
    18, 23 and 28 are read as PM2.5 values (this presumably corresponds to
    one column of a five-column table -- TODO confirm against the page).

    Args:
        city_pinyin: Path component appended to the site root.

    Returns:
        A ``(pm, pm_ev)`` tuple. ``pm`` is a list of six stripped cell
        texts; positions the page does not provide hold ``'--'``, the same
        placeholder that is skipped when summing. ``pm_ev`` is the integer
        sum of the numeric entries in ``pm``.
    """
    url = 'http://www.pm25x.com/' + city_pinyin
    r = requests.get(url, timeout=30)
    soup = BeautifulSoup(r.text, 'lxml')
    cells = soup.find_all('td')

    # Many cities list fewer measurements than six, so a missing cell is
    # replaced by the '--' placeholder instead of raising IndexError.
    pm = [
        cells[idx].text.strip() if idx < len(cells) else '--'
        for idx in range(3, 29, 5)
    ]

    # Only purely numeric cells contribute to the total; '--' and any other
    # non-numeric text are skipped rather than crashing int().
    pm_ev = sum(int(value) for value in pm if value.isdecimal())

    return pm, pm_ev

def get_city_aqi(city_pinyin):
    """Return the AQI value and AQI level text shown on a city's page.

    Args:
        city_pinyin: Path component appended to ``http://www.pm25x.com/``.

    Returns:
        A ``(aqi, aqilevel)`` tuple of stripped strings taken from the first
        ``div.aqivalue`` and the first ``div.aqileveltext`` elements.

    Raises:
        IndexError: If either element is absent from the page.
    """
    response = requests.get('http://www.pm25x.com/' + city_pinyin, timeout=30)
    page = BeautifulSoup(response.text, 'lxml')  # parse with the lxml parser

    # find_all always returns a list, even for a single match, so the first
    # element has to be picked explicitly.
    value_nodes = page.find_all('div', {'class': "aqivalue"})
    level_nodes = page.find_all('div', {'class': "aqileveltext"})

    aqi = value_nodes[0].text.strip()
    aqilevel = level_nodes[0].text.strip()
    return aqi, aqilevel


def get_all_cities():
    """Collect every city listed on the site's front page.

    Returns:
        A list of ``(city_name, city_pinyin)`` tuples, one per ``<a>`` link
        inside the first ``dl.citylist`` element. ``city_pinyin`` is the
        link's ``href`` with its first character removed.

    Raises:
        IndexError: If the page contains no ``dl.citylist`` element.
    """
    response = requests.get('http://www.pm25x.com', timeout=30)
    page = BeautifulSoup(response.text, 'lxml')
    city_blocks = page.find_all("dl", {"class": "citylist"})

    # Each link's href carries the city's page name after its first
    # character, so that character is sliced off.
    return [
        (link.text, link['href'][1:])
        for link in city_blocks[0].find_all("a")
    ]
def main():
    """Print the AQI and PM2.5 readings of every city listed on the site."""
    for city_name, city_pinyin in get_all_cities():
        city_aqi = get_city_aqi(city_pinyin)
        city_pm = get_city_pm(city_pinyin)
        # Printing happens inside the loop so that every city is reported,
        # not only the last one.
        print(city_name, city_aqi, city_pm)

# Run main() only when this file is executed as a script, not when imported.
if __name__ == '__main__':
    main()

"""
空气质量指数计算方法

定量描述空气质量状况的无量纲指数

版本8.0
智能爬取网页数据
获取所有城市的AQI
输出csv文件
"""
import requests
import bs4   # 导入beautifulsoup4
from bs4 import BeautifulSoup
import csv
def get_city_pm(city_pinyin):
    """Return the first PM2.5 reading shown on a city's page.

    Args:
        city_pinyin: Path component appended to ``http://www.pm25x.com/``.

    Returns:
        A one-element list holding the stripped text of the fourth ``<td>``
        cell on the page. It is a list so the caller can concatenate it
        onto a CSV row.

    Raises:
        IndexError: If the page has fewer than four ``<td>`` cells.
    """
    response = requests.get('http://www.pm25x.com/' + city_pinyin, timeout=30)
    cells = BeautifulSoup(response.text, 'lxml').find_all('td')

    # Many cities do not list many measurements, so only the first reading
    # (cell index 3) is taken.
    return [cells[3].text.strip()]

def get_city_aqi(city_pinyin):
    """Return the AQI value shown on a city's page.

    Args:
        city_pinyin: Path component appended to ``http://www.pm25x.com/``.

    Returns:
        The stripped text of the first ``div.aqivalue`` element.

    Raises:
        IndexError: If the page contains no ``div.aqivalue`` element.
    """
    response = requests.get('http://www.pm25x.com/' + city_pinyin, timeout=30)
    page = BeautifulSoup(response.text, 'lxml')  # parse with the lxml parser

    # find_all always returns a list, even for a single match, so the first
    # element has to be picked explicitly.
    value_nodes = page.find_all('div', {'class': "aqivalue"})
    return value_nodes[0].text.strip()


def get_all_cities():
    """Collect every city listed on the site's front page.

    Returns:
        A list of ``(city_name, city_pinyin)`` tuples, one per ``<a>`` link
        inside the first ``dl.citylist`` element. ``city_pinyin`` is the
        link's ``href`` with its first character removed.

    Raises:
        IndexError: If the page contains no ``dl.citylist`` element.
    """
    response = requests.get('http://www.pm25x.com', timeout=30)
    page = BeautifulSoup(response.text, 'lxml')
    city_blocks = page.find_all("dl", {"class": "citylist"})

    # Each link's href carries the city's page name after its first
    # character, so that character is sliced off.
    return [
        (link.text, link['href'][1:])
        for link in city_blocks[0].find_all("a")
    ]

def main():
    """Scrape every listed city and write the results to a CSV file.

    The output file ``China_city_csv`` (UTF-8, no extension) gets a header
    row ``City, AQI, PM2.5`` followed by one row per city. A progress line
    is printed after every tenth city has been written.
    """
    city_list = get_all_cities()
    total = len(city_list)

    # First row of the CSV file.
    header = ['City', 'AQI', 'PM2.5']

    # newline='' lets the csv module control line endings itself.
    with open('China_city_csv', 'w', encoding='utf-8', newline='') as f:
        writer = csv.writer(f)
        writer.writerow(header)
        for count, (city_name, city_pinyin) in enumerate(city_list, start=1):
            city_aqi = get_city_aqi(city_pinyin)
            city_pm = get_city_pm(city_pinyin)  # already a list
            writer.writerow([city_name, city_aqi] + city_pm)

            # Report progress only once the row is actually written, so the
            # printed count never runs ahead of the work done.
            if count % 10 == 0:
                print('已处理{}条记录。(共{}条记录)'.format(count, total))

# Run main() only when this file is executed as a script, not when imported.
if __name__ == '__main__':
    main()

"""
空气质量指数计算方法

定量描述空气质量状况的无量纲指数

版本9.0
智能爬取网页数据
获取所有城市的AQI
输出csv文件

Pandas模块学习

"""
import pandas as pd

def main():
    """Load the scraped CSV, print summary views and export two rankings.

    Reads ``China_city_csv`` from the current directory, prints several
    views and AQI statistics, then writes:

    * ``top10_aqi.csv`` -- the 10 rows with the lowest AQI, ascending.
    * ``Bottom10_cities.csv`` -- the 10 rows with the highest AQI,
      descending.
    """
    aqi_data = pd.read_csv('China_city_csv')
    print(aqi_data)   # whole frame; head()/tail() below show only the ends
    print(aqi_data.head(4))
    print(aqi_data.tail(8))

    print("******************************")
    # DataFrame.info() prints its report itself and returns None, so it must
    # not be passed to print() -- doing so appended a stray "None".
    print('基本信息:')
    aqi_data.info()
    print('^^^^^^^^^^^^^^^^^^^^^^^^^')
    print(aqi_data["City"])
    print('((((((((((())))))))))))))))))')
    print(aqi_data[["City", "AQI"]].head(10))
    print("__________________")

    print('AQI最大值:', aqi_data['AQI'].max())
    print("!!!!!!!!!!!!!!!!!!!!!")

    print('AQI最小值:', aqi_data['AQI'].min())
    print('############################')
    print('AQI均值:', aqi_data['AQI'].mean())
    print('%%%%%%%%%%%%%%%%%%%%%%%%%%%%')

    # Top 10: ascending sort, i.e. the rows with the lowest AQI.
    top10_cities = aqi_data.sort_values(by=['AQI']).head(10)
    print(top10_cities)

    # Bottom 10: descending sort, i.e. the rows with the highest AQI.
    # sort_values(by=['AQI']).tail(10) selects the same rows in the
    # opposite order.
    bottom10_cities = aqi_data.sort_values(by=['AQI'], ascending=False).head(10)
    print(bottom10_cities)

    # index=False leaves the row index out of the saved files.
    top10_cities.to_csv('top10_aqi.csv', index=False)
    bottom10_cities.to_csv('Bottom10_cities.csv', index=False)

# Run main() only when this file is executed as a script, not when imported.
if __name__ == '__main__':
    main()

"""
空气质量指数计算方法

定量描述空气质量状况的无量纲指数

版本10.0
智能爬取网页数据
获取所有城市的AQI
输出csv文件

Pandas模块学习
数据过滤

"""
import pandas as pd

import matplotlib.pyplot as plt

# Use the SimHei font for sans-serif text; presumably chosen so the Chinese
# chart title and city names render instead of empty boxes.
plt.rcParams['font.sans-serif'] = ['SimHei']
# Draw minus signs with the ASCII hyphen, which fonts lacking the Unicode
# minus glyph can still display.
plt.rcParams['axes.unicode_minus'] = False

def main():
    """Load the scraped CSV, filter out invalid rows and chart the best 50.

    Reads ``China_city_csv`` from the current directory, keeps only rows
    whose AQI is greater than zero, prints views and statistics of the
    cleaned data, then draws a bar chart of the 50 rows with the lowest AQI.
    The chart is saved as ``top50_aqi_bar.png`` and shown on screen.
    """
    aqi_data = pd.read_csv('China_city_csv')
    print(aqi_data)   # whole frame; head()/tail() below show only the ends
    print(aqi_data.head(4))
    print(aqi_data.tail(8))

    print("******************************")
    # DataFrame.info() prints its report itself and returns None, so it must
    # not be passed to print() -- doing so appended a stray "None".
    print('基本信息:')
    aqi_data.info()

    # Data cleaning: keep only rows with AQI > 0.
    clean_aqi_data = aqi_data[aqi_data['AQI'] > 0]
    print(clean_aqi_data)
    # NOTE(review): per the original author's note the PM2.5 column is read
    # as strings, so it cannot be compared numerically and is not filtered.

    print('^^^^^^^^^^^^^^^^^^^^^^^^^')
    print(clean_aqi_data["City"])
    print('((((((((((())))))))))))))))))')
    print(clean_aqi_data[["City", "AQI"]].head(10))
    print("__________________")

    # Statistics and ranking use the cleaned frame; taking them from the raw
    # frame would let the rows removed above skew the minimum and the chart.
    print('AQI最大值:', clean_aqi_data['AQI'].max())
    print("!!!!!!!!!!!!!!!!!!!!!")

    print('AQI最小值:', clean_aqi_data['AQI'].min())
    print('############################')
    print('AQI均值:', clean_aqi_data['AQI'].mean())
    print('%%%%%%%%%%%%%%%%%%%%%%%%%%%%')

    # Top 50: ascending sort, i.e. the rows with the lowest AQI.
    top50_cities = clean_aqi_data.sort_values(by=['AQI']).head(50)

    # DataFrame.plot is pandas' matplotlib-based plotting method:
    #   kind    chart type: 'line' (default), 'bar', 'barh', 'hist', 'box',
    #           'kde' / 'density', 'area', 'pie'
    #   x, y    columns used for the horizontal and vertical axes
    #   title   chart title
    #   figsize figure size
    top50_cities.plot(kind='bar', x='City', y='AQI', title='AQI水平最好的50个城市',
                      figsize=(20, 10))

    # Save the figure to an image file, then display it.
    plt.savefig('top50_aqi_bar.png')
    plt.show()


# Run main() only when this file is executed as a script, not when imported.
if __name__ == '__main__':
    main()

# 你可能感兴趣的:(学习)