创新点:结合时间序列预测(回归)与污染源分类(多分类),使用公开API获取实时数据。
(数据来源:如 OpenAQ、AirNow 或国内公开的城市空气质量平台)。
# 安装依赖库(部分需根据API调整)
pip install requests pandas numpy scikit-learn xgboost plotly folium flask
import requests
import pandas as pd
# Example: pull PM2.5 measurements from the OpenAQ API.
# NOTE(review): the original note says a real API key must be supplied, but no
# key is sent by this code — confirm what the endpoint currently requires.
def fetch_air_quality(city="Beijing", days=30):
    """Fetch recent PM2.5 measurements for a city as a cleaned DataFrame.

    Args:
        city: City name sent as the ``city`` query parameter.
        days: Look-back window in days; ``date_from`` is set to today (UTC)
            minus this many days.

    Returns:
        A DataFrame with the columns ``timestamp``, ``pm25``, ``latitude``
        and ``longitude``. The columns are present even when the API returns
        no results. Missing ``pm25`` values are replaced by the column median.

    Raises:
        requests.HTTPError: If the API responds with a 4xx/5xx status.
        requests.RequestException: On connection errors or timeouts.
    """
    date_from = (
        pd.Timestamp.now(tz="UTC") - pd.Timedelta(days=days)
    ).strftime("%Y-%m-%d")

    # Let requests build the query string so every value is URL-encoded and
    # the separators cannot be corrupted.
    response = requests.get(
        "https://api.openaq.org/v2/measurements",
        params={
            "city": city,
            "parameter": "pm25",
            "date_from": date_from,
            "limit": 1000,
        },
        timeout=30,
    )
    response.raise_for_status()
    data = response.json()

    # Flatten the nested JSON records; tolerate results without coordinates.
    records = [
        {
            "timestamp": item["date"]["local"],
            "pm25": item["value"],
            "latitude": (item.get("coordinates") or {}).get("latitude"),
            "longitude": (item.get("coordinates") or {}).get("longitude"),
        }
        for item in data.get("results", [])
    ]
    # Passing the column list keeps the schema stable for an empty result.
    df = pd.DataFrame(
        records, columns=["timestamp", "pm25", "latitude", "longitude"]
    )

    # Fill missing values by assignment: an in-place fillna on a selected
    # column is a chained assignment and may not modify the frame.
    df["pm25"] = pd.to_numeric(df["pm25"], errors="coerce")
    df["pm25"] = df["pm25"].fillna(df["pm25"].median())
    return df
# Download the measurements and preview the first rows.
air_data = fetch_air_quality()
print(air_data.head())

# Time feature extraction.
air_data['timestamp'] = pd.to_datetime(air_data['timestamp'])
# shift() below is positional, so the rows must be in chronological order
# before any lag feature is derived.
air_data = air_data.sort_values('timestamp').reset_index(drop=True)
air_data['hour'] = air_data['timestamp'].dt.hour
air_data['day_of_week'] = air_data['timestamp'].dt.dayofweek

# Lag feature for time-series forecasting: the PM2.5 value 6 rows earlier.
# NOTE(review): this equals "6 hours ago" only if the rows are hourly and come
# from a single station — confirm against the data actually returned.
air_data['pm25_lag6'] = air_data['pm25'].shift(6)
# The first 6 rows have no lag value; drop rows containing missing values.
air_data = air_data.dropna()
from sklearn.ensemble import RandomForestClassifier
# Labels are generated from hand-written expert rules. The thresholds are
# illustrative and should be replaced with ones derived from real data.
def assign_pollution_source(row):
    """Return a coarse pollution-source label for one measurement row.

    Args:
        row: Mapping-like record (a dict or a pandas row) providing ``pm25``
            and, depending on which rule is evaluated, ``no2``, ``pm10`` and
            ``wind_speed``.

    Returns:
        ``'industry'`` when ``pm25 > 150`` and ``no2 > 40``; otherwise
        ``'dust'`` when ``pm10 > 100`` and ``wind_speed < 2``; otherwise
        ``'traffic'``.
    """
    if row['pm25'] > 150 and row['no2'] > 40:
        return 'industry'
    if row['pm10'] > 100 and row['wind_speed'] < 2:
        return 'dust'
    # Fallback label for every row that matches neither rule.
    return 'traffic'
# Train the pollution-source classifier.
# air_data must already contain the pm10, no2, so2 and wind_speed columns;
# the PM2.5-only download in this file does not provide them.
from sklearn.model_selection import train_test_split

# Generate the labels with the rule-based helper; without this line the
# 'source' column read below does not exist.
air_data['source'] = air_data.apply(assign_pollution_source, axis=1)

X = air_data[['pm25', 'pm10', 'no2', 'so2', 'wind_speed']]
y = air_data['source']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

clf = RandomForestClassifier()
clf.fit(X_train, y_train)
# The labels are a deterministic function of columns that are also features,
# so this accuracy mostly measures how well the rules are re-learned.
print("Classification Accuracy:", clf.score(X_test, y_test))
from xgboost import XGBRegressor
from sklearn.metrics import mean_absolute_error
# Regression model: predict PM2.5 from the lag, calendar and weather features.
# air_data must already contain the temperature and humidity columns; the
# PM2.5-only download in this file does not provide them.
from sklearn.model_selection import train_test_split

# Features and target.
features = ['pm25_lag6', 'hour', 'day_of_week', 'temperature', 'humidity']
target = 'pm25'

# Build a split dedicated to the regression task. The split made for the
# classifier holds different columns and string labels as its target.
# shuffle=False keeps air_data's row order, so the last 20% of rows form the
# test set (assumes air_data is sorted by time — confirm upstream).
X_train, X_test, y_train, y_test = train_test_split(
    air_data[features], air_data[target], test_size=0.2, shuffle=False
)

# Train the model.
reg = XGBRegressor()
reg.fit(X_train, y_train)
predictions = reg.predict(X_test)
print("MAE:", mean_absolute_error(y_test, predictions))
# Show real vs. predicted PM2.5 interactively with Plotly.
import plotly.express as px

# Timestamps are looked up in air_data through the test rows' index, because
# 'timestamp' is not one of the model's feature columns.
plot_frame = pd.DataFrame({
    'timestamp': air_data.loc[y_test.index, 'timestamp'].to_numpy(),
    'Real': y_test.to_numpy(),
    'Predicted': predictions,
}).sort_values('timestamp')

# Passing a list of columns as y draws one named trace per column; 'value'
# and 'variable' are the axis/legend names Plotly assigns to such data.
fig = px.line(plot_frame, x='timestamp', y=['Real', 'Predicted'],
              labels={'value': 'PM2.5', 'variable': 'Type'},
              title="Real vs Predicted PM2.5")
fig.show()
# Expose the trained regressor through a minimal Flask API (example only).
from flask import Flask, request, jsonify

app = Flask(__name__)


@app.route('/predict', methods=['POST'])
def predict():
    """Predict PM2.5 for one feature vector posted as JSON.

    Expects a body of the form ``{"features": [...]}`` with one value per
    entry of the module-level ``features`` list, in the same order. A JSON
    ``null`` is treated as a missing value.

    Returns:
        ``{"predicted_aqi": <float>}`` on success, or a 400 response with an
        ``error`` message when the body is missing or malformed.
    """
    data = request.get_json(silent=True)
    values = data.get('features') if isinstance(data, dict) else None
    if not isinstance(values, (list, tuple)) or len(values) != len(features):
        message = f"'features' must be a list of {len(features)} numbers ordered as {features}"
        return jsonify({"error": message}), 400

    try:
        numeric = [float('nan') if v is None else float(v) for v in values]
    except (TypeError, ValueError):
        return jsonify({"error": "'features' must contain only numbers"}), 400

    # Use a named-column frame so the input carries the same column names as
    # the `features` list.
    row = pd.DataFrame([numeric], columns=features)
    prediction = reg.predict(row)
    # Convert the NumPy scalar to a plain float so jsonify can serialise it.
    # The response key is kept as "predicted_aqi" for existing clients,
    # although the regressor's target is the pm25 column.
    return jsonify({"predicted_aqi": float(prediction[0])})


if __name__ == '__main__':
    app.run()
后续扩展:可使用 folium 生成污染热力图;可使用 Apache Airflow 定期更新数据。