以下为各回归算法的实现代码(训练数据均为 CSV 格式,包含若干特征列和一个目标列):
决策树回归算法:
import pickle
import numpy as np
import pandas as pd
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor
import matplotlib.pyplot as plt
import numpy as np
import matplotlib.pyplot as plt
plt.rcParams['font.sans-serif'] = ['SimHei'] # 显示中文标签
plt.rcParams['axes.unicode_minus'] = False
def loadData(file, y_label):
    """Load a CSV file and split it into a feature matrix and a target vector.

    Parameters
    ----------
    file : str
        Path of the CSV file to read.
    y_label : str
        Name of the target column; every other column is treated as a feature.

    Returns
    -------
    X : ndarray, shape (n_samples, n_features)
    Y : ndarray, shape (n_samples,)
    labels : pandas.Index
        Feature column names, in the same order as the columns of X.
    """
    df = pd.read_csv(file)
    # Build the feature-column list directly; the original aliased the full
    # column list and looped over a single-element drop list to remove one name.
    x_col = [c for c in df.columns if c != y_label]
    X = df[x_col].values
    # Target vector
    Y = df[y_label].values
    labels = df[x_col].columns
    print('自变量:', x_col)
    return X, Y, labels
def splitData(X, Y, test_size=0.3):
    """Sequentially split X/Y into train and test sets.

    The first round(n * (1 - test_size)) rows become the training set and
    the remainder the test set. Row order is preserved — no shuffling.
    """
    n_train = round(X.shape[0] * (1 - test_size))
    x_train, x_test = X[:n_train], X[n_train:]
    y_train, y_test = Y[:n_train], Y[n_train:]
    # x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.3)
    return x_train, x_test, y_train, y_test
def plot_scatter(x, y):
    """Scatter-plot y against x, then overlay a cubic polynomial fit.

    Shows two figures in sequence: first the raw scatter alone, then the
    same scatter with the fitted 3rd-degree curve drawn in red. Each
    plt.show() blocks until its window is closed.
    """
    # Fix rendering of the minus sign on axis tick labels
    plt.rcParams['axes.unicode_minus'] = False
    # Use a Chinese-capable font.
    # NOTE(review): 'Simhei' casing differs from the module-level 'SimHei'
    # setting — confirm matplotlib resolves this font name.
    plt.rcParams['font.sans-serif'] = ['Simhei']
    plt.scatter(x, y)
    plt.show()
    # Fit a degree-3 polynomial y = f(x) with np.polyfit
    para = np.polyfit(x, y, 3)
    # para holds the cubic's coefficients, highest degree first
    y2 = para[0] * x ** 3 + para[1] * x ** 2 + para[2] * x + para[3]
    # Re-plot the raw points with the fitted curve on top
    plt.scatter(x, y)
    plt.plot(x, y2, 'r')
    plt.show()
def Model(x_train, x_test, y_train, y_test, modelPath):
    """Train a decision-tree regressor, persist it, and report metrics.

    Fits a DecisionTreeRegressor on the training split, pickles the fitted
    model to <modelPath>/model_DT.pkl, prints R2/MAE/MSE/RMSE for both
    splits, and finally scatter-plots predicted vs. actual test values via
    plot_scatter.

    modelPath : str — directory in which to save the pickled model.
    """
    model = DecisionTreeRegressor(random_state=0)
    model.fit(x_train, y_train)
    # Persist the fitted model for later prediction runs.
    with open(modelPath + '/' + 'model_DT.pkl', 'wb') as f:
        pickle.dump(model, f)
    # Predict once per split and reuse the results for every metric below.
    pred_train = model.predict(x_train)
    pred_test = model.predict(x_test)
    # BUGFIX: the original computed sum((pred - mean_y)^2) / sum((y - mean_y)^2),
    # which is not the coefficient of determination. Use the standard
    # definition R^2 = 1 - SS_res/SS_tot via metrics.r2_score.
    r2_train = metrics.r2_score(y_train, pred_train)
    r2_test = metrics.r2_score(y_test, pred_test)
    print("*********************Decision Tree Regressor Model*********************")
    print("******************************训练集精度*********************************")
    print('R2:', r2_train)
    print('Mean Absolute Error:', metrics.mean_absolute_error(y_train, pred_train))
    print('Mean Squared Error:', metrics.mean_squared_error(y_train, pred_train))
    print('Root Mean Squared Error:', np.sqrt(metrics.mean_squared_error(y_train, pred_train)))
    print("*****************************测试集精度*********************************")
    print('R2:', r2_test)
    print('Mean Absolute Error:', metrics.mean_absolute_error(y_test, pred_test))
    print('Mean Squared Error:', metrics.mean_squared_error(y_test, pred_test))
    print('Root Mean Squared Error:', np.sqrt(metrics.mean_squared_error(y_test, pred_test)))
    # Plot predicted vs. actual on the test split
    plot_scatter(y_test, pred_test)
if __name__ == '__main__':
    # Training data path, model output directory, and target column name.
    trainDataPath = r'./data/剔除异常值.csv'
    modelSavePath = './models'
    y_label = 'chl'
    # CONSISTENCY FIX: pass y_label instead of repeating the 'chl' literal.
    X, Y, labels = loadData(trainDataPath, y_label)
    x_train, x_test, y_train, y_test = splitData(X, Y)
    Model(x_train, x_test, y_train, y_test, modelSavePath)
多项式回归算法:
import pickle
import numpy as np
import pandas as pd
from sklearn import metrics
from sklearn.model_selection import train_test_split
import sklearn.pipeline as pl
import sklearn.preprocessing as sp
import sklearn.linear_model as lm
import matplotlib.pyplot as plt
import os
os.environ["PATH"] += os.pathsep + 'C:/Program Files/Graphviz/bin/'
plt.rcParams['font.sans-serif'] = ['SimHei'] # 显示中文标签
plt.rcParams['axes.unicode_minus'] = False
def loadData(file, y_label):
    """Read *file* (CSV) and return (X, Y, labels).

    X is the feature matrix built from every column except *y_label*,
    Y is the target vector, and labels holds the feature column names.
    """
    df = pd.read_csv(file)
    feature_cols = list(df.columns)
    feature_cols.remove(y_label)
    X = df[feature_cols].values
    # Target column
    Y = df[y_label].values
    labels = df[feature_cols].columns
    print('自变量:', feature_cols)
    return X, Y, labels
def splitData(X, Y, test_size=0.3, random_state=None):
    """Randomly split X/Y into train and test sets.

    Generalized: test_size and random_state are now parameters whose
    defaults reproduce the original behavior (30% test, unseeded shuffle).
    Pass random_state for reproducible splits.
    """
    x_train, x_test, y_train, y_test = train_test_split(
        X, Y, test_size=test_size, random_state=random_state)
    return x_train, x_test, y_train, y_test
def Model(degree, x_train, x_test, y_train, y_test, modelPath):
    """Fit a polynomial regression pipeline, persist it, and report metrics.

    Builds PolynomialFeatures(degree) -> LinearRegression, pickles the
    fitted pipeline to <modelPath>/model_duoxiangshi.pkl, prints
    R2/MAE/MSE/RMSE for both splits, and plots test predictions against
    the actual values by sample index.

    degree : int — highest polynomial degree for the feature expansion.
    """
    model = pl.make_pipeline(
        sp.PolynomialFeatures(degree),  # polynomial feature expander
        lm.LinearRegression()           # linear regressor
    )
    model.fit(x_train, y_train)
    with open(modelPath + '/' + 'model_duoxiangshi.pkl', 'wb') as f:
        pickle.dump(model, f)
    # Predict once per split and reuse the results below.
    pred_train = model.predict(x_train)
    pred_test = model.predict(x_test)
    # BUGFIX: replace the home-grown ratio (not a valid R^2) with r2_score.
    r2_train = metrics.r2_score(y_train, pred_train)
    r2_test = metrics.r2_score(y_test, pred_test)
    print("**************************duoxianshi Regressor Model***************************")
    print("*****************************训练集精度*********************************")
    print('R2:', r2_train)
    print('Mean Absolute Error:', metrics.mean_absolute_error(y_train, pred_train))
    print('Mean Squared Error:', metrics.mean_squared_error(y_train, pred_train))
    print('Root Mean Squared Error:', np.sqrt(metrics.mean_squared_error(y_train, pred_train)))
    print("*****************************测试集精度*********************************")
    print('R2:', r2_test)
    print('Mean Absolute Error:', metrics.mean_absolute_error(y_test, pred_test))
    print('Mean Squared Error:', metrics.mean_squared_error(y_test, pred_test))
    print('Root Mean Squared Error:', np.sqrt(metrics.mean_squared_error(y_test, pred_test)))
    # Plot actual test points and the model's fit over the sample index
    x = np.arange(len(y_test))
    plt.style.use('ggplot')
    plt.scatter(x, y_test, s=10, label="实际数据点")
    # BUGFIX: legend wrongly said 随机森林 (random forest); this is the
    # polynomial model.
    plt.plot(x, pred_test, c='r', label="多项式回归拟合数据")
    plt.xlabel("样本")
    plt.ylabel("value")
    plt.grid()
    plt.legend()
    plt.show()
if __name__ == '__main__':
    # Training data path and model output directory.
    trainDataPath = r'./data/剔除异常值.csv'
    modelSavePath = './models'
    # Highest polynomial degree for the feature expansion
    degree = 3
    y_label = 'chl'
    # CONSISTENCY FIX: pass y_label instead of repeating the 'chl' literal.
    X, Y, labels = loadData(trainDataPath, y_label)
    x_train, x_test, y_train, y_test = splitData(X, Y)
    Model(degree, x_train, x_test, y_train, y_test, modelSavePath)
KNN回归算法:
import pickle
import numpy as np
import pandas as pd
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsRegressor
import matplotlib.pyplot as plt
plt.rcParams['font.sans-serif'] = ['SimHei'] # 显示中文标签
plt.rcParams['axes.unicode_minus'] = False
def loadData(file, y_label):
    """Load training data from a CSV file.

    Returns the feature matrix X (all columns except *y_label*), the
    target vector Y, and the feature column names.
    """
    df = pd.read_csv(file)
    predictors = [name for name in df.columns if name != y_label]
    X = df[predictors].values
    # Target column
    Y = df[y_label].values
    labels = df[predictors].columns
    print('自变量:', predictors)
    return X, Y, labels
def splitData(X, Y, test_size=0.3, random_state=None):
    """Randomly split X/Y into train and test sets.

    Generalized: test_size and random_state are now parameters whose
    defaults reproduce the original behavior (30% test, unseeded shuffle).
    """
    x_train, x_test, y_train, y_test = train_test_split(
        X, Y, test_size=test_size, random_state=random_state)
    return x_train, x_test, y_train, y_test
def Model(x_train, x_test, y_train, y_test, modelPath):
    """Train a KNN regressor, persist it, and report metrics.

    Fits KNeighborsRegressor(n_neighbors=2), pickles the model to
    <modelPath>/model_knn.pkl, prints R2/MAE/MSE/RMSE for both splits,
    and plots test predictions against the actual values by sample index.
    """
    model = KNeighborsRegressor(n_neighbors=2)
    model.fit(x_train, y_train)
    with open(modelPath + '/' + 'model_knn.pkl', 'wb') as f:
        pickle.dump(model, f)
    # Predict once per split and reuse the results below.
    pred_train = model.predict(x_train)
    pred_test = model.predict(x_test)
    # BUGFIX: the original ratio of explained sums is not R^2; use r2_score.
    r2_train = metrics.r2_score(y_train, pred_train)
    r2_test = metrics.r2_score(y_test, pred_test)
    print("************************KNN Regressor Model****************************")
    print("*****************************训练集精度*********************************")
    print('R2:', r2_train)
    print('Mean Absolute Error:', metrics.mean_absolute_error(y_train, pred_train))
    print('Mean Squared Error:', metrics.mean_squared_error(y_train, pred_train))
    print('Root Mean Squared Error:', np.sqrt(metrics.mean_squared_error(y_train, pred_train)))
    print("*****************************测试集精度*********************************")
    print('R2:', r2_test)
    print('Mean Absolute Error:', metrics.mean_absolute_error(y_test, pred_test))
    print('Mean Squared Error:', metrics.mean_squared_error(y_test, pred_test))
    print('Root Mean Squared Error:', np.sqrt(metrics.mean_squared_error(y_test, pred_test)))
    # Plot actual test points and the model's fit over the sample index
    x = np.arange(len(y_test))
    plt.style.use('ggplot')
    plt.scatter(x, y_test, s=10, label="实际数据点")
    # BUGFIX: legend wrongly said 随机森林 (random forest); this is KNN.
    plt.plot(x, pred_test, c='r', label="KNN回归拟合数据")
    plt.xlabel("样本")
    plt.ylabel("value")
    plt.grid()
    plt.legend()
    plt.show()
if __name__ == '__main__':
    # Training data path, model output directory, and target column name.
    trainDataPath = r'./data/剔除异常值.csv'
    modelSavePath = './models'
    y_label = 'chl'
    # CONSISTENCY FIX: pass y_label instead of repeating the 'chl' literal.
    X, Y, labels = loadData(trainDataPath, y_label)
    x_train, x_test, y_train, y_test = splitData(X, Y)
    Model(x_train, x_test, y_train, y_test, modelSavePath)
随机森林回归算法:
import pickle
import numpy as np
import pandas as pd
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
import matplotlib.pyplot as plt
import os
os.environ["PATH"] += os.pathsep + 'C:/Program Files/Graphviz/bin/'
plt.rcParams['font.sans-serif'] = ['SimHei'] # 显示中文标签
plt.rcParams['axes.unicode_minus'] = False
def loadData(file, y_label):
    """Load a CSV file and split it into a feature matrix and target vector.

    file : str — path of the CSV file to read.
    y_label : str — target column name; every other column is a feature.

    Returns (X, Y, labels): the feature matrix, the target vector, and the
    feature column names.
    """
    df = pd.read_csv(file)
    # All columns except the target are features (comprehension replaces the
    # original alias-and-remove loop; dead commented-out scaler code removed).
    x_col = [c for c in df.columns if c != y_label]
    X = df[x_col].values
    # Target vector
    Y = df[y_label].values
    labels = df[x_col].columns
    print('自变量:', x_col)
    return X, Y, labels
def splitData(X, Y, test_size=0.3, random_state=None):
    """Randomly split X/Y into train and test sets.

    Generalized: test_size and random_state are now parameters whose
    defaults reproduce the original behavior (30% test, unseeded shuffle).
    """
    x_train, x_test, y_train, y_test = train_test_split(
        X, Y, test_size=test_size, random_state=random_state)
    return x_train, x_test, y_train, y_test
def rfModel(method, x_train, x_test, y_train, y_test, modelPath):
    """Train a random-forest or SVR regressor, persist it, and report metrics.

    method : {'rf', 'svr'} — which regressor to train; the pickle is saved
        as model_rf.pkl or model_svr.pkl under modelPath.

    Raises
    ------
    ValueError
        If method is neither 'rf' nor 'svr'. (The original fell through
        with `model` unbound and crashed later with a NameError.)
    """
    if method == 'rf':
        # Random-forest regression model
        model = RandomForestRegressor(n_estimators=1000, random_state=1, n_jobs=-1)
        filename = 'model_rf.pkl'
    elif method == 'svr':
        model = SVR(kernel='rbf', C=0.25)
        filename = 'model_svr.pkl'
    else:
        raise ValueError("method must be 'rf' or 'svr', got %r" % (method,))
    model.fit(x_train, y_train)
    with open(modelPath + '/' + filename, 'wb') as f:
        pickle.dump(model, f)
    # Predict once per split and reuse the results below.
    pred_train = model.predict(x_train)
    pred_test = model.predict(x_test)
    # BUGFIX: the original ratio of explained sums is not R^2; use r2_score.
    r2_train = metrics.r2_score(y_train, pred_train)
    r2_test = metrics.r2_score(y_test, pred_test)
    print("**************************RF Regressor Model***************************")
    print("*****************************训练集精度*********************************")
    print('R2:', r2_train)
    print('Mean Absolute Error:', metrics.mean_absolute_error(y_train, pred_train))
    print('Mean Squared Error:', metrics.mean_squared_error(y_train, pred_train))
    print('Root Mean Squared Error:', np.sqrt(metrics.mean_squared_error(y_train, pred_train)))
    print("*****************************测试集精度*********************************")
    print('R2:', r2_test)
    print('Mean Absolute Error:', metrics.mean_absolute_error(y_test, pred_test))
    print('Mean Squared Error:', metrics.mean_squared_error(y_test, pred_test))
    print('Root Mean Squared Error:', np.sqrt(metrics.mean_squared_error(y_test, pred_test)))
    # Plot actual test points and the model's fit over the sample index
    x = np.arange(len(y_test))
    plt.style.use('ggplot')
    plt.scatter(x, y_test, s=10, label="实际数据点")
    plt.plot(x, pred_test, c='r', label="随机森林回归拟合数据")
    plt.xlabel("样本")
    plt.ylabel("value")
    plt.grid()
    plt.legend()
    plt.show()
if __name__ == '__main__':
    # Training data path, model output directory, and target column name.
    trainDataPath = r'./data/剔除异常值.csv'
    modelSavePath = './models'
    y_label = 'chl'
    # CONSISTENCY FIX: pass y_label instead of repeating the 'chl' literal.
    X, Y, labels = loadData(trainDataPath, y_label)
    x_train, x_test, y_train, y_test = splitData(X, Y)
    rfModel('rf', x_train, x_test, y_train, y_test, modelSavePath)
xgboost回归算法:
import pickle
import numpy as np
import pandas as pd
from sklearn import metrics
from sklearn.model_selection import train_test_split
import xgboost as xgb
import matplotlib.pyplot as plt
from xgboost import plot_importance
plt.rcParams['font.sans-serif'] = ['SimHei'] # 显示中文标签
plt.rcParams['axes.unicode_minus'] = False
def loadData(file, y_label):
    """Read a CSV training file.

    Splits the table into X (every column except *y_label*), the target
    vector Y, and the feature column names.
    """
    df = pd.read_csv(file)
    cols = list(df.columns)
    cols.remove(y_label)
    X = df[cols].values
    # Target column
    Y = df[y_label].values
    labels = df[cols].columns
    print('自变量:', cols)
    return X, Y, labels
def splitData(X, Y, test_size=0.3, random_state=None):
    """Randomly split X/Y into train and test sets.

    Generalized: test_size and random_state are now parameters whose
    defaults reproduce the original behavior (30% test, unseeded shuffle).
    """
    x_train, x_test, y_train, y_test = train_test_split(
        X, Y, test_size=test_size, random_state=random_state)
    return x_train, x_test, y_train, y_test
def Model(x_train, x_test, y_train, y_test, modelPath):
    """Train an XGBoost regressor, persist it, and report metrics.

    Fits XGBRegressor, shows its feature-importance plot, pickles the
    model to <modelPath>/model_xgboost.pkl, prints R2/MAE/MSE/RMSE for
    both splits, and plots test predictions against the actual values.

    NOTE(review): objective='reg:gamma' assumes strictly positive targets —
    confirm that holds for this dataset.
    """
    # COMPATIBILITY FIX: `silent=True` was removed from modern xgboost;
    # verbosity=0 is the supported equivalent.
    model = xgb.XGBRegressor(max_depth=5, learning_rate=0.15, n_estimators=200,
                             verbosity=0, objective='reg:gamma')
    model.fit(x_train, y_train)
    plot_importance(model)
    plt.show()
    with open(modelPath + '/' + 'model_xgboost.pkl', 'wb') as f:
        pickle.dump(model, f)
    # Predict once per split and reuse the results below.
    pred_train = model.predict(x_train)
    pred_test = model.predict(x_test)
    # BUGFIX: the original ratio of explained sums is not R^2; use r2_score.
    r2_train = metrics.r2_score(y_train, pred_train)
    r2_test = metrics.r2_score(y_test, pred_test)
    print("**************************xgb Regressor Model***************************")
    print("*****************************训练集精度*********************************")
    print('R2:', r2_train)
    print('Mean Absolute Error:', metrics.mean_absolute_error(y_train, pred_train))
    print('Mean Squared Error:', metrics.mean_squared_error(y_train, pred_train))
    print('Root Mean Squared Error:', np.sqrt(metrics.mean_squared_error(y_train, pred_train)))
    print("*****************************测试集精度*********************************")
    print('R2:', r2_test)
    print('Mean Absolute Error:', metrics.mean_absolute_error(y_test, pred_test))
    print('Mean Squared Error:', metrics.mean_squared_error(y_test, pred_test))
    print('Root Mean Squared Error:', np.sqrt(metrics.mean_squared_error(y_test, pred_test)))
    # Plot actual test points and the model's fit over the sample index
    x = np.arange(len(y_test))
    plt.style.use('ggplot')
    plt.scatter(x, y_test, s=10, label="实际数据点")
    # BUGFIX: legend wrongly said 随机森林 (random forest); this is XGBoost.
    plt.plot(x, pred_test, c='r', label="XGBoost回归拟合数据")
    plt.xlabel("样本")
    plt.ylabel("value")
    plt.grid()
    plt.legend()
    plt.show()
if __name__ == '__main__':
    # Training data path, model output directory, and target column name.
    trainDataPath = r'./data/剔除异常值.csv'
    modelSavePath = './models'
    y_label = 'chl'
    # CONSISTENCY FIX: pass y_label instead of repeating the 'chl' literal.
    X, Y, labels = loadData(trainDataPath, y_label)
    x_train, x_test, y_train, y_test = splitData(X, Y)
    Model(x_train, x_test, y_train, y_test, modelSavePath)
调用模型预测程序:
import numpy as np
from osgeo import gdal
import pickle
# 读取tif数据集
def readTif(fileName):
    """Open *fileName* as a GDAL dataset.

    Prints a warning and returns the None handle when GDAL cannot open
    the file, so callers must check the result before use.
    """
    dataset = gdal.Open(fileName)
    if dataset is None:
        print(fileName + "文件无法打开")
    return dataset
# 保存tif文件函数
def writeTiff(im_data, im_geotrans, im_proj, path):
    """Write a (bands, height, width) or (height, width) array as a GeoTIFF.

    im_data: numpy array of pixel values; a 2-D array is promoted to a
        single-band (1, H, W) array before writing.
    im_geotrans: affine geotransform tuple written to the output.
    im_proj: projection string written to the output.
    path: destination file path.
    """
    # Pick the GDAL pixel type from the numpy dtype name.
    # NOTE(review): substring tests mean 'uint8' also matches 'int8' and
    # 'uint16' also matches 'int16' — confirm this is intended.
    if 'int8' in im_data.dtype.name:
        datatype = gdal.GDT_Byte
    elif 'int16' in im_data.dtype.name:
        datatype = gdal.GDT_UInt16
    else:
        datatype = gdal.GDT_Float32
    if len(im_data.shape) == 3:
        im_bands, im_height, im_width = im_data.shape
    elif len(im_data.shape) == 2:
        # Promote a single 2-D band to shape (1, H, W)
        im_data = np.array([im_data])
        im_bands, im_height, im_width = im_data.shape
    # Create the output file
    driver = gdal.GetDriverByName("GTiff")
    Dataset = driver.Create(path, int(im_width), int(im_height), int(im_bands), datatype)
    if Dataset is not None:
        Dataset.SetGeoTransform(im_geotrans)  # write the affine transform parameters
        Dataset.SetProjection(im_proj)  # write the projection
    # Write each band; GDAL band indices are 1-based
    for i in range(im_bands):
        Dataset.GetRasterBand(i + 1).WriteArray(im_data[i])
    del Dataset
if __name__ == '__main__':
    # Paths: pickled model, input raster, and output raster.
    RFpath = r"./models/model_rf.pkl"
    img_Path = r"./data/mianhua4boduan.tif"
    SavePath = r"./result/mianhua4boduan.tif"
    dataset = readTif(img_Path)
    Tif_width = dataset.RasterXSize    # raster column count
    Tif_height = dataset.RasterYSize   # raster row count
    Tif_geotrans = dataset.GetGeoTransform()  # affine transform info
    Tif_proj = dataset.GetProjection()        # projection info
    img_data = dataset.ReadAsArray(0, 0, Tif_width, Tif_height)
    # Load the pickled model; the context manager closes the file even on
    # error (the original left the handle management manual).
    with open(RFpath, "rb") as f:
        rf_model = pickle.load(f)
    # Reshape (bands, H, W) -> (H*W, bands) so each pixel is one sample.
    # BUGFIX: the original allocated img_data.shape[0] - 1 rows (a leftover
    # from commented-out code that skipped band 0) while the loop wrote
    # img_data.shape[0] rows, raising an IndexError on the last band.
    data = np.zeros((img_data.shape[0], img_data.shape[1] * img_data.shape[2]))
    for i in range(img_data.shape[0]):
        data[i] = img_data[i].flatten()
    # Replace NaNs so the model does not choke on nodata pixels
    data = np.nan_to_num(data)
    data = data.swapaxes(0, 1)
    # Predict a value for every pixel
    prediction = rf_model.predict(data)
    # Fold the flat predictions back into the image grid
    prediction = prediction.reshape(img_data.shape[1], img_data.shape[2])
    prediction = prediction.astype(np.float32)
    # Write the result out as a GeoTIFF
    writeTiff(prediction, Tif_geotrans, Tif_proj, SavePath)