The dataset contains 517 fires from the Montesinho natural park in Portugal. For each incident weekday, month, coordinates, and the burnt area are recorded, as well as several meteorological data such as rain, temperature, humidity, and wind. The workflow reads the data and trains a regression model based on the spatial, temporal, and weather variables.
该数据集包含来自葡萄牙蒙特西尼奥自然公园的 517 起火灾。记录每个事件的工作日、月份、坐标和烧伤区域,以及雨、温度、湿度和风等多个气象数据。工作流读取数据并根据空间、时间和天气变量训练回归模型。
Forest Fires Data Set
Forest Fires Data Set----predict the burned area of forest fires using meteorological and other data
加拿大森林火险气候指数系统FWI的原理及应用
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings(action='ignore')
fires = pd.read_csv('forestfires.csv')
fires = fires.reset_index()
mapping_month = {'jan':1,'feb':2,'mar':3,'apr':4,'may':5,'jun':6,'jul':7,'aug':8,'sep':9,'oct':10,'nov':11,'dec':12,}
fires['month'] = fires['month'].map(mapping_month)
mapping_day = {'mon':1,'tue':2,'wed':3,'thu':4,'fri':5,'sat':6,'sun':0}
fires['day'] = fires['day'].map(mapping_day)
fires.describe().T
count | mean | std | min | 25% | 50% | 75% | max | |
---|---|---|---|---|---|---|---|---|
index | 517.0 | 258.000000 | 149.389312 | 0.0 | 129.0 | 258.00 | 387.00 | 516.00 |
X | 517.0 | 4.669246 | 2.313778 | 1.0 | 3.0 | 4.00 | 7.00 | 9.00 |
Y | 517.0 | 4.299807 | 1.229900 | 2.0 | 4.0 | 4.00 | 5.00 | 9.00 |
month | 517.0 | 7.475822 | 2.275990 | 1.0 | 7.0 | 8.00 | 9.00 | 12.00 |
day | 517.0 | 2.972921 | 2.143867 | 0.0 | 1.0 | 3.00 | 5.00 | 6.00 |
FFMC | 517.0 | 90.644681 | 5.520111 | 18.7 | 90.2 | 91.60 | 92.90 | 96.20 |
DMC | 517.0 | 110.872340 | 64.046482 | 1.1 | 68.6 | 108.30 | 142.40 | 291.30 |
DC | 517.0 | 547.940039 | 248.066192 | 7.9 | 437.7 | 664.20 | 713.90 | 860.60 |
ISI | 517.0 | 9.021663 | 4.559477 | 0.0 | 6.5 | 8.40 | 10.80 | 56.10 |
temp | 517.0 | 18.889168 | 5.806625 | 2.2 | 15.5 | 19.30 | 22.80 | 33.30 |
RH | 517.0 | 44.288201 | 16.317469 | 15.0 | 33.0 | 42.00 | 53.00 | 100.00 |
wind | 517.0 | 4.017602 | 1.791653 | 0.4 | 2.7 | 4.00 | 4.90 | 9.40 |
rain | 517.0 | 0.021663 | 0.295959 | 0.0 | 0.0 | 0.00 | 0.00 | 6.40 |
area | 517.0 | 12.847292 | 63.655818 | 0.0 | 0.0 | 0.52 | 6.57 | 1090.84 |
fires['area'][fires['area']<=0.09] = 0
fires['area'][(fires['area']>0.09) & (fires['area']<=6.57)] = 1
fires['area'][(fires['area']>6.57) & (fires['area']<=279)] = 2
fires['area'][fires['area']>279] = 3
attributes = ['month','day','FFMC', 'DMC', 'DC', 'ISI', 'temp', 'RH','wind','rain']
corr = fires[attributes].corr()
corr
month | day | FFMC | DMC | DC | ISI | temp | RH | wind | rain | |
---|---|---|---|---|---|---|---|---|---|---|
month | 1.000000 | -0.037469 | 0.291477 | 0.466645 | 0.868698 | 0.186597 | 0.368842 | -0.095280 | -0.086368 | 0.013438 |
day | -0.037469 | 1.000000 | 0.073597 | 0.028697 | 0.001913 | 0.035926 | 0.032233 | -0.083318 | -0.004013 | -0.024119 |
FFMC | 0.291477 | 0.073597 | 1.000000 | 0.382619 | 0.330512 | 0.531805 | 0.431532 | -0.300995 | -0.028485 | 0.056702 |
DMC | 0.466645 | 0.028697 | 0.382619 | 1.000000 | 0.682192 | 0.305128 | 0.469594 | 0.073795 | -0.105342 | 0.074790 |
DC | 0.868698 | 0.001913 | 0.330512 | 0.682192 | 1.000000 | 0.229154 | 0.496208 | -0.039192 | -0.203466 | 0.035861 |
ISI | 0.186597 | 0.035926 | 0.531805 | 0.305128 | 0.229154 | 1.000000 | 0.394287 | -0.132517 | 0.106826 | 0.067668 |
temp | 0.368842 | 0.032233 | 0.431532 | 0.469594 | 0.496208 | 0.394287 | 1.000000 | -0.527390 | -0.227116 | 0.069491 |
RH | -0.095280 | -0.083318 | -0.300995 | 0.073795 | -0.039192 | -0.132517 | -0.527390 | 1.000000 | 0.069410 | 0.099751 |
wind | -0.086368 | -0.004013 | -0.028485 | -0.105342 | -0.203466 | 0.106826 | -0.227116 | 0.069410 | 1.000000 | 0.061119 |
rain | 0.013438 | -0.024119 | 0.056702 | 0.074790 | 0.035861 | 0.067668 | 0.069491 | 0.099751 | 0.061119 | 1.000000 |
from pandas.plotting import scatter_matrix
attributes = ['FFMC', 'DMC', 'DC', 'ISI']
scatter_matrix(fires[attributes],figsize=(15, 15))
fires.plot(kind="scatter", x="DMC", y="DC", alpha=0.4, figsize=(10,8))
from sklearn.ensemble import ExtraTreesRegressor
columns = ['X', 'Y','month','day','FFMC', 'DMC', 'DC', 'ISI', 'temp','RH', 'wind', 'rain']
X = fires[columns]
Y = fires[['area']].values.ravel()
model = ExtraTreesRegressor(n_estimators=100)
model.fit(X, Y)
ExtraTreesRegressor()
cols_to_drop = []
for c in zip(columns,model.feature_importances_.round(4)):
if c[1] <0.01:
cols_to_drop.append(c[0])
print('Columns to be droped: ',cols_to_drop)
Columns to be droped: ['rain']
corr_matrix = fires.corr()
corr_matrix["area"].sort_values(ascending=False)
area 1.000000
index 0.302303
month 0.123613
wind 0.070217
X 0.068824
DC 0.063159
FFMC 0.059142
Y 0.047538
DMC 0.046503
rain 0.043600
temp 0.042614
ISI 0.022006
day 0.004167
RH -0.054193
Name: area, dtype: float64
fires = fires.drop(cols_to_drop,axis=1)
fires.drop(labels=['DC'],axis=1,inplace=True)
import plotly.express as px
df_long=pd.melt(fires,id_vars=['index'], value_vars=['FFMC', 'DMC'])
fig = px.line(df_long, x='index', y='value', color='variable')
fig.show()
df_long=pd.melt(fires,id_vars=['index'], value_vars=['ISI', 'temp', 'wind'])
fig = px.line(df_long, x='index', y='value', color='variable')
fig.show()
fires_cat = fires[['month', 'day']]
fires_num = fires[['X', 'Y', 'FFMC', 'DMC', 'ISI', 'temp', 'RH','wind']]
target = fires[['area']]
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
num_pipeline = Pipeline([('std_scaler', StandardScaler()),])
fires_num_tr = num_pipeline.fit_transform(fires_num)
from sklearn.model_selection import train_test_split
data = np.concatenate((fires_cat,fires_num_tr),axis=1)
X_train, X_test, y_train, y_test = train_test_split(data, target.values, test_size=0.3)
y_train = y_train.ravel()
y_test = y_test.ravel()
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVR
param_grid = [{'kernel': ['rbf', 'sigmoid'], 'C': [1,50, 100 ,300], 'epsilon': [0.2, 0.2,0.1]},]
svr_cv =SVR()
svr_grid_search = GridSearchCV(svr_cv, param_grid, cv=5,scoring='neg_mean_squared_error',return_train_score=True)
svr_grid_search.fit(X_train,y_train)
GridSearchCV(cv=5, estimator=SVR(),
param_grid=[{'C': [1, 50, 100, 300], 'epsilon': [0.2, 0.2, 0.1],
'kernel': ['rbf', 'sigmoid']}],
return_train_score=True, scoring='neg_mean_squared_error')
svr_grid_search.best_estimator_
SVR(C=1, epsilon=0.2)
final_model = svr_grid_search.best_estimator_
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
final_predictions = final_model.predict(X_test)
final_mse = mean_squared_error(y_test, final_predictions)
final_rmse = np.sqrt(final_mse)
print('和均方误差SMSE为: ',final_rmse)
print('平均绝对误差MAE为: {}'.format(mean_absolute_error(y_test, final_predictions)))
和均方误差SMSE为: 0.9193740894801092
平均绝对误差MAE为: 0.7773110570888362
final_predictions[final_predictions==0] = 0
final_predictions[(final_predictions>0) & (final_predictions<=1)] = 1
final_predictions[(final_predictions>1) & (final_predictions<=2)] = 2
final_predictions[final_predictions>2] = 3
right_num = 0
for index in range(len(final_predictions)):
if y_test[index] == final_predictions[index]:
right_num = right_num + 1
right = right_num / len(final_predictions) * 100
print('准确率为:', right)
准确率为: 25.64102564102564
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestRegressor
param_grid = [
{'n_estimators': [3, 10, 30], 'max_features': [2, 4, 6, 8]},
{'bootstrap': [False,True], 'n_estimators': [3, 10], 'max_features': [2, 3, 4]},
]
forest_reg = RandomForestRegressor()
rfr_grid_search = GridSearchCV(forest_reg, param_grid, cv=5,
scoring='neg_mean_squared_error',
return_train_score=True)
rfr_grid_search.fit(X_train,y_train)
GridSearchCV(cv=5, estimator=RandomForestRegressor(),
param_grid=[{'max_features': [2, 4, 6, 8],
'n_estimators': [3, 10, 30]},
{'bootstrap': [False, True], 'max_features': [2, 3, 4],
'n_estimators': [3, 10]}],
return_train_score=True, scoring='neg_mean_squared_error')
rfr_grid_search.best_estimator_
RandomForestRegressor(max_features=4, n_estimators=10)
final_predictions = final_model.predict(X_test)
final_mse = mean_squared_error(y_test, final_predictions)
final_rmse = np.sqrt(final_mse)
print('和均方误差SMSE为: ',final_rmse)
print('平均绝对误差MAE为: {}'.format(mean_absolute_error(y_test, final_predictions)))
和均方误差SMSE为: 0.9193740894801092
平均绝对误差MAE为: 0.7773110570888362
final_predictions[final_predictions==0] = 0
final_predictions[(final_predictions>0) & (final_predictions<=1)] = 1
final_predictions[(final_predictions>1) & (final_predictions<=2)] = 2
final_predictions[final_predictions>2] = 3
right_num = 0
for index in range(len(final_predictions)):
if y_test[index] == final_predictions[index]:
right_num = right_num + 1
right = right_num / len(final_predictions) * 100
print('准确率为:', right)
准确率为: 25.64102564102564
import h2o
from h2o.estimators.random_forest import H2ORandomForestEstimator
from h2o.grid.grid_search import H2OGridSearch
h2o.init()
h2oFires = pd.read_csv('forestfires.csv')
h2oFires['area'][h2oFires['area']<=0.09] = 0
h2oFires['area'][(h2oFires['area']>0.09) & (h2oFires['area']<=6.57)] = 1
h2oFires['area'][(h2oFires['area']>6.57) & (h2oFires['area']<=279)] = 2
h2oFires['area'][h2oFires['area']>279] = 3
h2oFires['area'][h2oFires['area']==0] = 'fire0'
h2oFires['area'][h2oFires['area']==1] = 'fire1'
h2oFires['area'][h2oFires['area']==2] = 'fire2'
h2oFires['area'][h2oFires['area']==3] = 'fire3'
trainCsv = h2oFires.sample(frac=0.7,axis=0)
testCsv = h2oFires.sample(frac=0.3,axis=0)
trainCsv = trainCsv[['X','Y','month','day','FFMC','DMC','ISI','temp','RH','wind','area']]
testCsv = testCsv[['X','Y','month','day','FFMC','DMC','ISI','temp','RH','wind','area']]
trainCsv.to_csv('h2oTrain.csv')
testCsv.to_csv('h2oTest.csv')
train=h2o.import_file("h2oTrain.csv")
test=h2o.import_file("h2oTest.csv")
train=train[1:]
test=test[1:]
model1 = H2ORandomForestEstimator()
model1.train(x = train.names[0:-1],y = 'area',training_frame = train)
predict=H2ORandomForestEstimator.predict(model1 ,test[test.names[0:-1]])
predict
out = test.concat(predict)
h2o.download_csv(out,"predict.csv")
test_right = predict[predict['predict'] == test['area']].nrow
accuracy = test_right/test.nrow
print('准确率为:', accuracy*100)
准确率为: 82.58064516129032
rf_params = {'ntrees': [x for x in range(100,200,1)],'max_depth': [50] }
rf_grid = H2OGridSearch(model = H2ORandomForestEstimator, hyper_params=rf_params)
rf_grid.train(x = train.names[0:-1],y = 'area',training_frame = train)
model4 = H2ORandomForestEstimator(ntrees=100,max_depth=50)
model4.train(x = train.names[0:-1],y = 'area',training_frame = train)
predict=H2ORandomForestEstimator.predict(model4,test[test.names[0:-1]])
test_right = predict[predict['predict'] == test['area']].nrow
accuracy = test_right/test.nrow
print('准确率为:', accuracy*100)
准确率为: 85.16129032258064