Python machine learning regression models for remote sensing imagery: decision tree, polynomial, random forest, KNN, XGBoost

Training data format required by the algorithms below:

[Figure 1: sample of the training data table, a flat CSV with one target column (chl) plus the predictor columns]
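
In short, the expected layout is a flat CSV: one target column (chl in the scripts below) and one column per predictor, for example the band values or spectral indices sampled at each field point. A minimal sketch of building such a file, assuming hypothetical band columns b1..b4 (only the chl column name is taken from the scripts; the numbers and the output path are made up):

import pandas as pd

# Hypothetical training table: four band values as features, 'chl' as the target.
samples = pd.DataFrame({
    'b1': [0.112, 0.098, 0.131],
    'b2': [0.205, 0.187, 0.223],
    'b3': [0.174, 0.160, 0.190],
    'b4': [0.330, 0.301, 0.352],
    'chl': [12.4, 9.8, 15.1],
})
# Written to an illustrative path; the scripts below read their own CSV from ./data.
samples.to_csv('./data/sample_training_data.csv', index=False)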

Decision tree regression:

import pickle
import numpy as np
import pandas as pd
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor
import matplotlib.pyplot as plt

plt.rcParams['font.sans-serif'] = ['SimHei']  # make Chinese labels render correctly
plt.rcParams['axes.unicode_minus'] = False


def loadData(file, y_label):
    df = pd.read_csv(file)
    col_name = list(df.columns)  # all column names
    x_col = col_name
    col_drop = [y_label]
    for i in col_drop:
        x_col.remove(i)
    X = df[x_col].values
    # target values
    Y = df[y_label].values
    labels = df[x_col].columns
    print('Features:', x_col)
    return X, Y, labels


def splitData(X, Y, test_size=0.3):
    row = X.shape[0]
    x_train = X[0:round(row * (1 - test_size))]
    y_train = Y[0:round(row * (1 - test_size))]
    x_test = X[round(row * (1 - test_size)):]
    y_test = Y[round(row * (1 - test_size)):]
    # x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.3)
    return x_train, x_test, y_train, y_test


def plot_scatter(x, y):
    # fix minus-sign rendering and the Chinese font
    plt.rcParams['axes.unicode_minus'] = False
    plt.rcParams['font.sans-serif'] = ['SimHei']

    plt.scatter(x, y)
    plt.show()

    # fit a cubic curve with np.polyfit
    para = np.polyfit(x, y, 3)

    # evaluate the fitted cubic; sort x so the curve is drawn smoothly
    order = np.argsort(x)
    x_sorted = np.asarray(x)[order]
    y2 = np.polyval(para, x_sorted)

    # plot the scatter points together with the fitted curve
    plt.scatter(x, y)
    plt.plot(x_sorted, y2, 'r')
    plt.show()


def Model(x_train, x_test, y_train, y_test, modelPath):
    model = DecisionTreeRegressor(random_state=0)
    model.fit(x_train, y_train)
    modelPath = modelPath + '/' + 'model_DT.pkl'
    with open(modelPath, 'wb') as f:
        pickle.dump(model, f)

    # coefficient of determination (R²)
    r2_train = metrics.r2_score(y_train, model.predict(x_train))
    r2_test = metrics.r2_score(y_test, model.predict(x_test))
    print("*********************Decision Tree Regressor Model*********************")
    print("******************************训练集精度*********************************")
    print('R2:', r2_train)
    print('Mean Absolute Error:', metrics.mean_absolute_error(y_train, model.predict(x_train)))
    print('Mean Squared Error:', metrics.mean_squared_error(y_train, model.predict(x_train)))
    print('Root Mean Squared Error:', np.sqrt(metrics.mean_squared_error(y_train, model.predict(x_train))))

    print("*****************************测试集精度*********************************")
    print('R2:', r2_test)
    print('Mean Absolute Error:', metrics.mean_absolute_error(y_test, model.predict(x_test)))
    print('Mean Squared Error:', metrics.mean_squared_error(y_test, model.predict(x_test)))
    print('Root Mean Squared Error:', np.sqrt(metrics.mean_squared_error(y_test, model.predict(x_test))))

    # plot predicted vs. observed values
    plot_scatter(y_test, model.predict(x_test))
    # x = np.arange(len(y_test))
    # plt.style.use('ggplot')
    # plt.scatter(x, y_test, s=10, label="observed values")
    # plt.plot(x, model.predict(x_test), c='r', label="decision tree predictions")
    # plt.xlabel("sample index")
    # plt.ylabel("value")
    # plt.grid()
    # plt.legend()
    # plt.show()


if __name__ == '__main__':
    trainDataPath = r'./data/剔除异常值.csv'
    modelSavePath = './models'
    y_label = 'chl'
    X, Y, labels = loadData(trainDataPath, y_label)
    x_train, x_test, y_train, y_test = splitData(X, Y)
    Model(x_train, x_test, y_train, y_test, modelSavePath)
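
A note on the numbers this script prints: an unconstrained DecisionTreeRegressor memorises the training samples, so its training-set R² is essentially 1 and says little about real performance. Below is a hedged sketch of reining the tree in and judging it by cross-validation instead; max_depth, min_samples_leaf and cv=5 are arbitrary choices, and X, Y are assumed to be the arrays returned by loadData above.

from sklearn.model_selection import cross_val_score
from sklearn.tree import DecisionTreeRegressor

# A depth-limited tree is less prone to memorising the training data.
tree = DecisionTreeRegressor(max_depth=5, min_samples_leaf=3, random_state=0)

# 5-fold cross-validated R2: a more honest estimate than the training-set score.
scores = cross_val_score(tree, X, Y, cv=5, scoring='r2')
print('CV R2: %.3f +/- %.3f' % (scores.mean(), scores.std()))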

Polynomial regression:

import pickle
import numpy as np
import pandas as pd
from sklearn import metrics
from sklearn.model_selection import train_test_split
import sklearn.pipeline as pl
import sklearn.preprocessing as sp
import sklearn.linear_model as lm
import matplotlib.pyplot as plt
import os

os.environ["PATH"] += os.pathsep + 'C:/Program Files/Graphviz/bin/'
plt.rcParams['font.sans-serif'] = ['SimHei']  # make Chinese labels render correctly
plt.rcParams['axes.unicode_minus'] = False


def loadData(file, y_label):
    df = pd.read_csv(file)
    col_name = list(df.columns)  # all column names
    x_col = col_name
    col_drop = [y_label]
    for i in col_drop:
        x_col.remove(i)
    X = df[x_col].values
    # target values
    Y = df[y_label].values

    # sca = MinMaxScaler()
    # Y = sca.fit_transform(Y.reshape(1, -1))
    # print(Y)
    labels = df[x_col].columns
    print('Features:', x_col)
    return X, Y, labels


def splitData(X, Y):
    x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.3)
    return x_train, x_test, y_train, y_test


def Model(degree, x_train, x_test, y_train, y_test, modelPath):
    model = pl.make_pipeline(
        sp.PolynomialFeatures(degree),  # polynomial feature expansion
        lm.LinearRegression()  # ordinary least squares on the expanded features
    )
    model.fit(x_train, y_train)
    modelPath = modelPath + '/' + 'model_duoxiangshi.pkl'
    with open(modelPath, 'wb') as f:
        pickle.dump(model, f)
    # coefficient of determination (R²)
    r2_train = metrics.r2_score(y_train, model.predict(x_train))
    r2_test = metrics.r2_score(y_test, model.predict(x_test))
    print("**************************duoxianshi Regressor Model***************************")
    print("*****************************训练集精度*********************************")
    print('R2:', r2_train)
    print('Mean Absolute Error:', metrics.mean_absolute_error(y_train, model.predict(x_train)))
    print('Mean Squared Error:', metrics.mean_squared_error(y_train, model.predict(x_train)))
    print('Root Mean Squared Error:', np.sqrt(metrics.mean_squared_error(y_train, model.predict(x_train))))

    print("*****************************测试集精度*********************************")
    print('R2:', r2_test)
    print('Mean Absolute Error:', metrics.mean_absolute_error(y_test, model.predict(x_test)))
    print('Mean Squared Error:', metrics.mean_squared_error(y_test, model.predict(x_test)))
    print('Root Mean Squared Error:', np.sqrt(metrics.mean_squared_error(y_test, model.predict(x_test))))

    # plot predicted vs. observed values
    x = np.arange(len(y_test))
    plt.style.use('ggplot')
    plt.scatter(x, y_test, s=10, label="observed values")
    plt.plot(x, model.predict(x_test), c='r', label="polynomial regression predictions")
    plt.xlabel("sample index")
    plt.ylabel("value")
    plt.grid()
    plt.legend()
    plt.show()
    # return rf


if __name__ == '__main__':
    trainDataPath = r'./data/剔除异常值.csv'
    modelSavePath = './models'
    # highest polynomial degree
    degree = 3

    y_label = 'chl'
    X, Y, labels = loadData(trainDataPath, y_label)
    x_train, x_test, y_train, y_test = splitData(X, Y)
    Model(degree, x_train, x_test, y_train, y_test, modelSavePath)
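
What the pipeline above actually does: PolynomialFeatures expands every sample into all monomials of its features up to the chosen degree, and LinearRegression then fits ordinary least squares on those expanded columns. A small standalone illustration with toy numbers and degree 2 (get_feature_names_out needs a reasonably recent scikit-learn; on old versions use get_feature_names):

import numpy as np
from sklearn.preprocessing import PolynomialFeatures

# Two toy samples with two features each.
X_toy = np.array([[2.0, 3.0],
                  [1.0, 5.0]])

poly = PolynomialFeatures(degree=2)
X_expanded = poly.fit_transform(X_toy)

# Columns: a bias term, the original features, and every degree-2 monomial of them.
print(poly.get_feature_names_out())
print(X_expanded)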

KNN regression:

import pickle
import numpy as np
import pandas as pd
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsRegressor
import matplotlib.pyplot as plt
plt.rcParams['font.sans-serif'] = ['SimHei']  # make Chinese labels render correctly
plt.rcParams['axes.unicode_minus'] = False


def loadData(file, y_label):
    df = pd.read_csv(file)
    col_name = list(df.columns)  # all column names
    x_col = col_name
    col_drop = [y_label]
    for i in col_drop:
        x_col.remove(i)
    X = df[x_col].values
    # target values
    Y = df[y_label].values
    labels = df[x_col].columns
    print('Features:', x_col)
    return X, Y, labels


def splitData(X, Y):
    x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.3)
    return x_train, x_test, y_train, y_test


def Model(x_train, x_test, y_train, y_test, modelPath):
    model = KNeighborsRegressor(n_neighbors=2)
    model.fit(x_train, y_train)
    modelPath = modelPath + '/' + 'model_knn.pkl'
    with open(modelPath, 'wb') as f:
        pickle.dump(model, f)

    # coefficient of determination (R²)
    r2_train = metrics.r2_score(y_train, model.predict(x_train))
    r2_test = metrics.r2_score(y_test, model.predict(x_test))
    print("************************KNN Regressor Model****************************")
    print("*****************************训练集精度*********************************")
    print('R2:', r2_train)
    print('Mean Absolute Error:', metrics.mean_absolute_error(y_train, model.predict(x_train)))
    print('Mean Squared Error:', metrics.mean_squared_error(y_train, model.predict(x_train)))
    print('Root Mean Squared Error:', np.sqrt(metrics.mean_squared_error(y_train, model.predict(x_train))))
    
    print("*****************************测试集精度*********************************")
    print('R2:', r2_test)
    print('Mean Absolute Error:', metrics.mean_absolute_error(y_test, model.predict(x_test)))
    print('Mean Squared Error:', metrics.mean_squared_error(y_test, model.predict(x_test)))
    print('Root Mean Squared Error:', np.sqrt(metrics.mean_squared_error(y_test, model.predict(x_test))))

    # plot predicted vs. observed values
    x = np.arange(len(y_test))
    plt.style.use('ggplot')
    plt.scatter(x, y_test, s=10, label="observed values")
    plt.plot(x, model.predict(x_test), c='r', label="KNN regression predictions")
    plt.xlabel("sample index")
    plt.ylabel("value")
    plt.grid()
    plt.legend()
    plt.show()


if __name__ == '__main__':
    trainDataPath = r'./data/剔除异常值.csv'
    modelSavePath = './models'
    y_label = 'chl'
    X, Y, labels = loadData(trainDataPath, y_label)
    x_train, x_test, y_train, y_test = splitData(X, Y)
    Model(x_train, x_test, y_train, y_test, modelSavePath)
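
KNN distances are dominated by whichever feature has the largest numeric range, and n_neighbors=2 is a fairly aggressive guess, so it is usually worth standardising the features and letting a grid search pick k. A sketch under those assumptions; X, Y are assumed to be the arrays returned by loadData above, and the candidate k values are arbitrary:

from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KNeighborsRegressor
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

# Scale the features, then fit KNN; the pipeline keeps the scaling inside each CV fold.
pipe = make_pipeline(StandardScaler(), KNeighborsRegressor())

param_grid = {'kneighborsregressor__n_neighbors': [2, 3, 5, 7, 9, 15]}
search = GridSearchCV(pipe, param_grid, cv=5, scoring='r2')
search.fit(X, Y)

print('best k:', search.best_params_)
print('best CV R2:', search.best_score_)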

Random forest regression:

import pickle
import numpy as np
import pandas as pd
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
import matplotlib.pyplot as plt
import os

os.environ["PATH"] += os.pathsep + 'C:/Program Files/Graphviz/bin/'
plt.rcParams['font.sans-serif'] = ['SimHei']  # make Chinese labels render correctly
plt.rcParams['axes.unicode_minus'] = False


def loadData(file, y_label):
    df = pd.read_csv(file)
    col_name = list(df.columns)  # all column names
    x_col = col_name
    col_drop = [y_label]
    for i in col_drop:
        x_col.remove(i)
    X = df[x_col].values
    # target values
    Y = df[y_label].values

    # sca = MinMaxScaler()
    # Y = sca.fit_transform(Y.reshape(1, -1))
    # print(Y)
    labels = df[x_col].columns
    print('Features:', x_col)
    return X, Y, labels


def splitData(X, Y):
    x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.3)
    return x_train, x_test, y_train, y_test


def rfModel(method, x_train, x_test, y_train, y_test, modelPath):
    if method == 'rf':
        # build the random forest regression model
        model = RandomForestRegressor(n_estimators=1000, random_state=1, n_jobs=-1)
        model.fit(x_train, y_train)
        modelPath = modelPath + '/' + 'model_rf.pkl'
        with open(modelPath, 'wb') as f:
            pickle.dump(model, f)
    elif method == 'svr':
        model = SVR(kernel='rbf', C=0.25)
        model.fit(x_train, y_train)
        modelPath = modelPath + '/' + 'model_svr.pkl'
        with open(modelPath, 'wb') as f:
            pickle.dump(model, f)
    else:
        raise ValueError("method must be 'rf' or 'svr'")
    # coefficient of determination (R²)
    r2_train = metrics.r2_score(y_train, model.predict(x_train))
    r2_test = metrics.r2_score(y_test, model.predict(x_test))
    print("**************************RF Regressor Model***************************")
    print("*****************************训练集精度*********************************")
    print('R2:', r2_train)
    print('Mean Absolute Error:', metrics.mean_absolute_error(y_train, model.predict(x_train)))
    print('Mean Squared Error:', metrics.mean_squared_error(y_train, model.predict(x_train)))
    print('Root Mean Squared Error:', np.sqrt(metrics.mean_squared_error(y_train, model.predict(x_train))))
    
    print("*****************************测试集精度*********************************")
    print('R2:', r2_test)
    print('Mean Absolute Error:', metrics.mean_absolute_error(y_test, model.predict(x_test)))
    print('Mean Squared Error:', metrics.mean_squared_error(y_test, model.predict(x_test)))
    print('Root Mean Squared Error:', np.sqrt(metrics.mean_squared_error(y_test, model.predict(x_test))))

    # plot predicted vs. observed values
    x = np.arange(len(y_test))
    plt.style.use('ggplot')
    plt.scatter(x, y_test, s=10, label="observed values")
    plt.plot(x, model.predict(x_test), c='r', label="model predictions")
    plt.xlabel("sample index")
    plt.ylabel("value")
    plt.grid()
    plt.legend()
    plt.show()
    # return rf


if __name__ == '__main__':
    trainDataPath = r'./data/剔除异常值.csv'
    modelSavePath = './models'
    y_label = 'chl'
    X, Y, labels = loadData(trainDataPath, y_label)
    x_train, x_test, y_train, y_test = splitData(X, Y)
    rfModel('rf', x_train, x_test, y_train, y_test, modelSavePath)
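
One practical advantage of the random forest here is that it reports how much each band or index contributed to the fit. A hedged sketch of printing those contributions; it reloads the pickle written by rfModel above and assumes the labels variable returned by loadData is still in scope:

import pickle

# Load the forest saved by rfModel and pair its importances with the feature names.
with open('./models/model_rf.pkl', 'rb') as f:
    rf = pickle.load(f)

for name, importance in sorted(zip(labels, rf.feature_importances_),
                               key=lambda t: t[1], reverse=True):
    print('%-15s %.3f' % (name, importance))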

XGBoost regression:

import pickle
import numpy as np
import pandas as pd
from sklearn import metrics
from sklearn.model_selection import train_test_split
import xgboost as xgb
import matplotlib.pyplot as plt
from xgboost import plot_importance

plt.rcParams['font.sans-serif'] = ['SimHei']  # make Chinese labels render correctly
plt.rcParams['axes.unicode_minus'] = False


def loadData(file, y_label):
    df = pd.read_csv(file)
    col_name = list(df.columns)  # all column names
    x_col = col_name
    col_drop = [y_label]
    for i in col_drop:
        x_col.remove(i)
    X = df[x_col].values
    # target values
    Y = df[y_label].values

    labels = df[x_col].columns
    print('Features:', x_col)
    return X, Y, labels


def splitData(X, Y):
    x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.3)
    return x_train, x_test, y_train, y_test


def Model(x_train, x_test, y_train, y_test, modelPath):
    # 'silent' has been removed in newer xgboost releases; note that the
    # 'reg:gamma' objective assumes a strictly positive target
    model = xgb.XGBRegressor(max_depth=5, learning_rate=0.15, n_estimators=200, objective='reg:gamma')
    model.fit(x_train, y_train)
    plot_importance(model)
    plt.show()
    modelPath = modelPath + '/' + 'model_xgboost.pkl'
    with open(modelPath, 'wb') as f:
        pickle.dump(model, f)
    # coefficient of determination (R²)
    r2_train = metrics.r2_score(y_train, model.predict(x_train))
    r2_test = metrics.r2_score(y_test, model.predict(x_test))
    print("**************************xgb Regressor Model***************************")
    print("*****************************训练集精度*********************************")
    print('R2:', r2_train)
    print('Mean Absolute Error:', metrics.mean_absolute_error(y_train, model.predict(x_train)))
    print('Mean Squared Error:', metrics.mean_squared_error(y_train, model.predict(x_train)))
    print('Root Mean Squared Error:', np.sqrt(metrics.mean_squared_error(y_train, model.predict(x_train))))
    
    print("*****************************测试集精度*********************************")
    print('R2:', r2_test)
    print('Mean Absolute Error:', metrics.mean_absolute_error(y_test, model.predict(x_test)))
    print('Mean Squared Error:', metrics.mean_squared_error(y_test, model.predict(x_test)))
    print('Root Mean Squared Error:', np.sqrt(metrics.mean_squared_error(y_test, model.predict(x_test))))

    # plot predicted vs. observed values
    x = np.arange(len(y_test))
    plt.style.use('ggplot')
    plt.scatter(x, y_test, s=10, label="observed values")
    plt.plot(x, model.predict(x_test), c='r', label="XGBoost regression predictions")
    plt.xlabel("sample index")
    plt.ylabel("value")
    plt.grid()
    plt.legend()
    plt.show()
    # return rf


if __name__ == '__main__':
    trainDataPath = r'./data/剔除异常值.csv'
    modelSavePath = './models'

    y_label = 'chl'
    X, Y, labels = loadData(trainDataPath, y_label)
    x_train, x_test, y_train, y_test = splitData(X, Y)
    Model(x_train, x_test, y_train, y_test, modelSavePath)
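
A caveat on objective='reg:gamma' in the script above: the gamma objective assumes a strictly positive target, so it will fail or behave poorly if chl contains zeros or negative values. Squared error is the safer default for a general real-valued target; a minimal alternative sketch with the same hyperparameters, assuming x_train, x_test, y_train, y_test come from splitData above:

import xgboost as xgb

# Same tree settings as above, but with the ordinary squared-error objective.
model = xgb.XGBRegressor(max_depth=5, learning_rate=0.15, n_estimators=200,
                         objective='reg:squarederror')
model.fit(x_train, y_train)

# score() returns R2 on the held-out data.
print('test R2:', model.score(x_test, y_test))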

Applying a saved model to an image (prediction script):

import numpy as np
from osgeo import gdal
import pickle


# open a GeoTIFF dataset with GDAL
def readTif(fileName):
    Dataset = gdal.Open(fileName)
    if Dataset is None:
        print(fileName + " could not be opened")
    return Dataset


# write an array (bands, rows, cols) to a GeoTIFF
def writeTiff(im_data, im_geotrans, im_proj, path):
    if 'int8' in im_data.dtype.name:
        datatype = gdal.GDT_Byte
    elif 'int16' in im_data.dtype.name:
        datatype = gdal.GDT_UInt16
    else:
        datatype = gdal.GDT_Float32
    if len(im_data.shape) == 3:
        im_bands, im_height, im_width = im_data.shape
    elif len(im_data.shape) == 2:
        im_data = np.array([im_data])
        im_bands, im_height, im_width = im_data.shape
    # create the output file
    driver = gdal.GetDriverByName("GTiff")
    Dataset = driver.Create(path, int(im_width), int(im_height), int(im_bands), datatype)
    if Dataset is not None:
        Dataset.SetGeoTransform(im_geotrans)  # write the affine geotransform
        Dataset.SetProjection(im_proj)  # write the projection
    for i in range(im_bands):
        Dataset.GetRasterBand(i + 1).WriteArray(im_data[i])
    del Dataset


if __name__ == '__main__':
    RFpath = r"./models/model_rf.pkl"
    img_Path = r"./data/mianhua4boduan.tif"
    SavePath = r"./result/mianhua4boduan.tif"

    dataset = readTif(img_Path)
    Tif_width = dataset.RasterXSize  # number of columns in the raster
    Tif_height = dataset.RasterYSize  # number of rows in the raster
    Tif_geotrans = dataset.GetGeoTransform()  # affine geotransform
    Tif_proj = dataset.GetProjection()  # projection information
    img_data = dataset.ReadAsArray(0, 0, Tif_width, Tif_height)

    # load the saved model from the binary pickle file
    with open(RFpath, "rb") as file:
        rf_model = pickle.load(file)

    # reshape the image to (pixels, bands) before prediction;
    # one row is needed per band, so the first dimension is img_data.shape[0]
    data = np.zeros((img_data.shape[0], img_data.shape[1] * img_data.shape[2]))
    # data[0] = img_data[1].flatten()
    # data[1] = img_data[2].flatten()
    # data[2] = img_data[3].flatten()
    for i in range(img_data.shape[0]):
        data[i] = img_data[i].flatten()
    data = np.nan_to_num(data)
    data = data.swapaxes(0, 1)
    # predict on the reshaped data
    # print(data)
    prediction = rf_model.predict(data)
    # reshape the predictions back to the image grid
    prediction = prediction.reshape(img_data.shape[1], img_data.shape[2])
    prediction = prediction.astype(np.float32)

    # write the result to a GeoTIFF
    writeTiff(prediction, Tif_geotrans, Tif_proj, SavePath)
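
The loop above pushes every pixel through the model, including background or no-data pixels, which then receive a meaningless value in the output raster. A hedged sketch of masking such pixels first; it assumes no-data pixels are exactly 0 in every band (this depends on how the image was produced) and reuses img_data, rf_model, Tif_geotrans, Tif_proj, SavePath and writeTiff from the script above:

import numpy as np

bands, height, width = img_data.shape
flat = np.nan_to_num(img_data.reshape(bands, height * width).T)  # (pixels, bands)

# Assumption: pixels that are 0 in every band are background / no-data.
valid = np.any(flat != 0, axis=1)

prediction = np.full(height * width, -9999.0, dtype=np.float32)  # no-data fill value
prediction[valid] = rf_model.predict(flat[valid])
prediction = prediction.reshape(height, width)

writeTiff(prediction, Tif_geotrans, Tif_proj, SavePath)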
