"""
文件名: main02.py
功能: 主程序
中国五大城市PM2.5数据分析
任务:
- 统计每个城市每天的平均PM2.5的数值
- 基于天数对比中国环保部和美国驻华大使馆统计的污染状态
数据集来源:https://www.kaggle.com/uciml/pm25-data-for-five-chinese-cities
"""
import os
import pandas as pd
import numpy as np
import config
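# The `config` module imported above is not included in this listing. The sketch
# below is only an assumption, inferred from how `config` is used further down;
# the paths and the Beijing entry are illustrative placeholders, not the actual values:
#
#   dataset_path = './data'                               # folder with the Kaggle CSV files
#   output_path = './output'                              # folder for the result CSV files
#   common_cols = ['year', 'month', 'day', 'PM_US Post']  # columns shared by all five cities
#   data_config_dict = {
#       # city name -> (CSV filename, list of local monitoring-station suffixes)
#       'beijing': ('BeijingPM20100101_20151231.csv',
#                   ['Dongsi', 'Dongsihuan', 'Nongzhanguan']),
#       # ... entries for chengdu, guangzhou, shanghai and shenyang
#   }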
def preprocess_data(data_df, city_name):
    """
    Preprocess the dataset.

    Args:
    - data_df: DataFrame with the raw data
    - city_name: name of the city

    Returns:
    - cln_data_df: the cleaned dataset
    """
    # Data cleaning: drop rows that contain missing values
    cln_data_df = data_df.dropna()
    # Rebuild the index
    cln_data_df = cln_data_df.reset_index(drop=True)
    # Add a column holding the city name
    cln_data_df['city'] = city_name
    # Report the result
    print('{}: {} rows in total, {} valid rows'.format(city_name, data_df.shape[0], cln_data_df.shape[0]))
    print('First 5 rows of valid data for {}:'.format(city_name))
    print(cln_data_df.head())
    return cln_data_df

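# Note on the dropna() call in preprocess_data(): it discards every row with a
# missing value in any of the selected columns, which is why, for instance, only
# 19380 of the 52584 Beijing rows survive. A tiny sketch with made-up values:
#
#   >>> pd.DataFrame({'PM_A': [10.0, None], 'PM_B': [12.0, 30.0]}).dropna()
#      PM_A  PM_B
#   0  10.0  12.0
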
def get_china_us_pm_df(data_df, suburb_cols):
    """
    Build the DataFrame holding the Chinese and US PM measurements.

    Args:
    - data_df: DataFrame containing the city's PM values
    - suburb_cols: names of the city's local monitoring-station columns

    Returns:
    - proc_data_df: the processed DataFrame
    """
    pm_suburb_cols = ['PM_' + col for col in suburb_cols]
    # Use the mean of the local stations as the MEP measurement for this city
    data_df['PM_China'] = data_df[pm_suburb_cols].mean(axis=1)
    # Keep only the useful columns in a new DataFrame
    proc_data_df = data_df[config.common_cols + ['city', 'PM_China']]
    # Preview the data
    print('Preview of the processed data:')
    print(proc_data_df.head())
    return proc_data_df

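# Note on the mean(axis=1) call in get_china_us_pm_df(): it averages across the
# local-station columns row by row. For example, the first Chengdu row in the run
# log below has PM_Caotangsi = 121.0 and PM_Shahepu = 138.0, so
# PM_China = (121.0 + 138.0) / 2 = 129.5.
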
def add_date_col_to_df(data_df):
    """
    Merge the 'year', 'month' and 'day' columns into a string column 'date'.

    Args:
    - data_df: DataFrame with the data

    Returns:
    - proc_data_df: the processed dataset
    """
    proc_data_df = data_df.copy()
    # Merge 'year', 'month' and 'day' into the string column 'date':
    # convert the data types first
    proc_data_df[['year', 'month', 'day']] = proc_data_df[['year', 'month', 'day']].astype('str')
    # then concatenate the columns
    proc_data_df['date'] = proc_data_df['year'].str.cat([proc_data_df['month'], proc_data_df['day']], sep='-')
    # Drop the original columns
    proc_data_df = proc_data_df.drop(['year', 'month', 'day'], axis=1)
    # Reorder the columns
    proc_data_df = proc_data_df[['date', 'city', 'PM_China', 'PM_US Post']]
    return proc_data_df

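# Note on str.cat in add_date_col_to_df(): it concatenates the string columns
# element-wise, so year '2013', month '1', day '1' become the key '2013-1-1'.
# Months and days are not zero-padded; that is fine for grouping, but the 'date'
# strings do not sort chronologically as text. Converting them with
# pd.to_datetime(proc_data_df['date']) would be an option if a real time axis were needed.
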
def add_polluted_state_col_to_df(day_stats):
    """
    Add pollution-state columns based on the daily PM values.

    Args:
    - day_stats: DataFrame with the daily statistics

    Returns:
    - proc_day_stats: the processed dataset
    """
    proc_day_stats = day_stats.copy()
    # Bin edges and labels for the pollution states
    bins = [-np.inf, 35, 75, 150, np.inf]
    state_labels = ['good', 'light', 'medium', 'heavy']
    proc_day_stats['Polluted State CH'] = pd.cut(proc_day_stats['PM_China'], bins=bins, labels=state_labels)
    proc_day_stats['Polluted State US'] = pd.cut(proc_day_stats['PM_US Post'], bins=bins, labels=state_labels)
    return proc_day_stats

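# Note on pd.cut in add_polluted_state_col_to_df(): each daily mean is mapped to
# the interval it falls into, with right-closed bins by default. With the edges
# above, a daily PM2.5 of 20 is labelled 'good', 50 -> 'light', 100 -> 'medium',
# 200 -> 'heavy', and a value of exactly 75 still counts as 'light'.
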
def compare_state_by_day(day_stats):
    """
    Compare, by number of days, the pollution states reported by the MEP stations and the US Post.
    """
    city_names = config.data_config_dict.keys()
    city_comparison_list = []
    for city_name in city_names:
        # Select the rows belonging to city_name
        city_df = day_stats[day_stats['city'] == city_name]
        # Count the number of days per pollution state
        city_polluted_days_count_ch = city_df['Polluted State CH'].value_counts().to_frame(name=city_name + '_CH')
        city_polluted_days_count_us = city_df['Polluted State US'].value_counts().to_frame(name=city_name + '_US')
        city_comparison_list.append(city_polluted_days_count_ch)
        city_comparison_list.append(city_polluted_days_count_us)
    # Combine the DataFrames side by side
    comparison_result = pd.concat(city_comparison_list, axis=1)
    return comparison_result

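# Note on the pd.concat(..., axis=1) call in compare_state_by_day(): each city
# contributes two single-column DataFrames (days per pollution state according to
# the local MEP stations and to the US Post), indexed by the four state labels.
# Concatenating along axis=1 aligns them on that index, which yields the
# 4-row x 10-column comparison table shown in the run log below.
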
def main():
    """
    Main function.
    """
    city_data_list = []
    for city_name, (filename, suburb_cols) in config.data_config_dict.items():
        # === 1. Data loading ===
        data_file = os.path.join(config.dataset_path, filename)
        usecols = config.common_cols + ['PM_' + col for col in suburb_cols]
        # Read the CSV file
        data_df = pd.read_csv(data_file, usecols=usecols)

        # === 2. Data processing ===
        # Preprocess the data
        cln_data_df = preprocess_data(data_df, city_name)
        # Build the DataFrame with the Chinese and US PM measurements
        proc_data_df = get_china_us_pm_df(cln_data_df, suburb_cols)
        city_data_list.append(proc_data_df)
        print()

    # Concatenate the processed data of the five cities
    all_data_df = pd.concat(city_data_list)
    # Merge 'year', 'month' and 'day' into the string column 'date'
    all_data_df = add_date_col_to_df(all_data_df)

    # === Data analysis ===
    # Group the data to obtain each city's daily mean PM values,
    # i.e. the average PM2.5 of each city on each day
    day_stats = all_data_df.groupby(['city', 'date'])[['PM_China', 'PM_US Post']].mean()
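    # Note on the step above: the source data holds multiple (hourly) readings per
    # day, so grouping by ['city', 'date'] and taking .mean() collapses them into a
    # single daily average per city for both PM_China and PM_US Post.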
# print(day_stats)
    # After the groupby, day_stats has the hierarchical index ['city', 'date'];
    # convert it back into ordinary columns to simplify the following analysis
    day_stats.reset_index(inplace=True)
    # print(day_stats)
    # Add the pollution-state columns based on the daily PM values
    day_stats = add_polluted_state_col_to_df(day_stats)
    # Compare, by number of days, the pollution states according to the MEP and the US Post
    comparison_result = compare_state_by_day(day_stats)
    print(comparison_result)

    # === Results output ===
all_data_df.to_csv(os.path.join(config.output_path, 'all_cities_pm.csv'), index=False)
day_stats.to_csv(os.path.join(config.output_path, 'day_stats.csv'))
comparison_result.to_csv(os.path.join(config.output_path, 'comparison_result.csv'))
if __name__ == '__main__':
main()
=================================================
/usr/local/bin/python3.6 /Users/apple/PycharmProjects/interview_lec02/main.py
beijing: 52584 rows in total, 19380 valid rows
First 5 rows of valid data for beijing:
year month day ... PM_Nongzhanguan PM_US Post city
0 2013 3 5 ... 140.0 150.0 beijing
1 2013 3 5 ... 152.0 163.0 beijing
2 2013 3 5 ... 128.0 172.0 beijing
3 2013 3 5 ... 3.0 181.0 beijing
4 2013 3 5 ... 3.0 187.0 beijing
[5 rows x 8 columns]
Preview of the processed data:
year month day PM_US Post city PM_China
0 2013 3 5 150.0 beijing 141.000000
1 2013 3 5 163.0 beijing 149.333333
2 2013 3 5 172.0 beijing 147.333333
3 2013 3 5 181.0 beijing 118.000000
4 2013 3 5 187.0 beijing 113.666667
chengdu: 52584 rows in total, 22290 valid rows
First 5 rows of valid data for chengdu:
year month day PM_Caotangsi PM_Shahepu PM_US Post city
0 2013 1 1 121.0 138.0 129.0 chengdu
1 2013 1 1 134.0 159.0 135.0 chengdu
2 2013 1 1 159.0 181.0 132.0 chengdu
3 2013 1 1 168.0 184.0 100.0 chengdu
4 2013 1 1 143.0 162.0 101.0 chengdu
Preview of the processed data:
year month day PM_US Post city PM_China
0 2013 1 1 129.0 chengdu 129.5
1 2013 1 1 135.0 chengdu 146.5
2 2013 1 1 132.0 chengdu 170.0
3 2013 1 1 100.0 chengdu 176.0
4 2013 1 1 101.0 chengdu 152.5
guangzhou: 52584 rows in total, 20074 valid rows
First 5 rows of valid data for guangzhou:
year month day ... PM_5th Middle School PM_US Post city
0 2013 1 1 ... 78.0 83.0 guangzhou
1 2013 1 1 ... 70.0 95.0 guangzhou
2 2013 1 1 ... 66.0 55.0 guangzhou
3 2013 1 1 ... 69.0 60.0 guangzhou
4 2013 1 1 ... 51.0 41.0 guangzhou
[5 rows x 7 columns]
Preview of the processed data:
year month day PM_US Post city PM_China
0 2013 1 1 83.0 guangzhou 80.5
1 2013 1 1 95.0 guangzhou 82.5
2 2013 1 1 55.0 guangzhou 60.5
3 2013 1 1 60.0 guangzhou 64.5
4 2013 1 1 41.0 guangzhou 46.0
shanghai: 52584 rows in total, 23291 valid rows
First 5 rows of valid data for shanghai:
year month day PM_Jingan PM_US Post PM_Xuhui city
0 2013 1 1 66.0 70.0 71.0 shanghai
1 2013 1 1 67.0 76.0 72.0 shanghai
2 2013 1 1 73.0 78.0 74.0 shanghai
3 2013 1 1 75.0 77.0 77.0 shanghai
4 2013 1 1 73.0 78.0 80.0 shanghai
Preview of the processed data:
year month day PM_US Post city PM_China
0 2013 1 1 70.0 shanghai 68.5
1 2013 1 1 76.0 shanghai 69.5
2 2013 1 1 78.0 shanghai 73.5
3 2013 1 1 77.0 shanghai 76.0
4 2013 1 1 78.0 shanghai 76.5
shenyang: 52584 rows in total, 20181 valid rows
First 5 rows of valid data for shenyang:
year month day PM_Taiyuanjie PM_US Post PM_Xiaoheyan city
0 2013 4 22 141.0 113.0 140.0 shenyang
1 2013 4 23 59.0 59.0 49.0 shenyang
2 2013 4 23 200.0 106.0 152.0 shenyang
3 2013 4 24 120.0 136.0 137.0 shenyang
4 2013 4 24 11.0 23.0 10.0 shenyang
Preview of the processed data:
year month day PM_US Post city PM_China
0 2013 4 22 113.0 shenyang 140.5
1 2013 4 23 59.0 shenyang 54.0
2 2013 4 23 106.0 shenyang 176.0
3 2013 4 24 136.0 shenyang 128.5
4 2013 4 24 23.0 shenyang 10.5
beijing_CH beijing_US ... shenyang_CH shenyang_US
good 213 205 ... 207 205
heavy 140 144 ... 79 89
light 291 299 ... 385 388
medium 286 282 ... 256 245
[4 rows x 10 columns]
Process finished with exit code 0