[Python] 线性回归应用Multivariate Gaussian basis functions笔记

数据来自California Coastal Regional Ocean Modeling System project [ROMSDatasetTempField]。本实验尝试用经纬度应用不同数量的Gaussian basis functions预测温度。
[Python] 线性回归应用Multivariate Gaussian basis functions笔记_第1张图片

import numpy as np
import scipy.io as scio
import sklearn.preprocessing as sklp
import math
from matplotlib import pyplot as plt
from mpl_toolkits.mplot3d import Axes3D

# lambda of ridge regression
regularizer = 0.01
# temperature data of east coast of USA
dataSet = scio.loadmat("field.mat")

# take a part of data from original dataset as training data or testing data
def generateData(offset,dataSet,scaler):
    # get all latitude
    latScale = dataSet['LatScale']
    # get all longitude
    LonScale = dataSet['LonScale']
    latSubSet = []
    lonSubSet = []
    # take one set of data every 7 original data
    for i in range (0 + offset,175 + offset,7):
        latSubSet.append([i,latScale[i][0]])
        lonSubSet.append([i,LonScale[i][0]])
    tempField = dataSet['TempField']

    dataY = []
    dataX = []
    # separate the position and temperature
    for lat in latSubSet:
        for lon in lonSubSet:
            temperature = tempField[lat[0]][lon[0]][0]
            if np.isnan(temperature):
                temperature = 0
            if np.isnan(lat[1]):
                lat[1] = 0
            if np.isnan(lon[1]):
                lon[1] = 0
            dataX.append([lat[1],lon[1]])
            dataY.append(temperature)

    # normlize data and use same mean
    dataX = np.array(dataX)
    if scaler == "":
        scaler = sklp.StandardScaler()
        scaler.fit(dataX)
    dataX = scaler.transform(dataX)
    b = np.ones(dataX.shape[0])
    dataY = np.array(dataY)
    # variance to normlize another aet of data
    return dataX,dataY,scaler

# ridge regression: train and predict
def ridgeReg(trainingDataX,trainingDataY):
    # weight  = (phi.T * phi)^-1 * phi * y
    weight = np.matmul(np.matmul(np.linalg.inv(np.matmul(trainingDataX.T , trainingDataX) - regularizer * np.eye(trainingDataX.shape[1])) , trainingDataX.T), trainingDataY)
    return weight

# gaussian basis function
def gaussianBasisFunc(mu,s,data):
    return math.exp(-0.5*np.matmul(np.matmul((data-mu),np.linalg.inv(s)),(data-mu).T))

# mean squared error
def calerror(dataX, dataY,predictedDataY):
    error = 0
    for i in range(0,dataX.shape[0]):
        error += np.power(np.linalg.norm(predictedDataY[i] - dataY[i]), 2)
    error /= float(dataX.shape[0])
    return error

# mapping function. map all data to feature
def calculatPhi(mus,sigma,DataX,):
    # calculate Phi
    # calculate gbf of each mu with all data
    phi = []
    for i in range(len(mus)):
        mu = mus[i, :]
        phiDatas = []
        for data in DataX:
            phiData = gaussianBasisFunc(mu, sigma, data)
            phiDatas.append(phiData)
        phi.append(phiDatas)
    phi = np.array(phi)
    phi = phi.T
    return phi

# one train and test trail
def oneTrail(gaussianNum,dataSet,makeFig):
    # generate training data while keep scaler
    trainingDataX,trainingDataY,scaler = generateData(0,dataSet,"")
    # generate testing data while using previous scaler
    testingDataX,testingDataY,scaler = generateData(1,dataSet,scaler)

    # set mus uniform in the data space
    mus = []
    for i in range(gaussianNum+1):
        for j in range(gaussianNum+1):
            # in this case data  space is a square
            mu = [-1.5 + i*(3/gaussianNum),-1.5 + j*(3/gaussianNum)]
            mus.append(mu)
    mus = np.array(mus)
    # calculate width of gbf
    h = mus[2,1]-mus[1,1]
    # it can be assumed that one sigma for all gaussian basis function
    sigma = np.diag(np.array(0.5 * np.array([h,h])))
    # mapping data
    phi = calculatPhi(mus,sigma,trainingDataX)
    # train the model with ridge regression
    weight = ridgeReg(phi,trainingDataY)
    # calculate predicted temperature with training data
    ridgeFitDataY = np.matmul(phi,weight)
    # and compare the predicted temperature with the true temperature calculate training error
    trainErr = calerror(trainingDataX,trainingDataY,ridgeFitDataY)
    # calculate the predicted temperature with the test temperature
    #mapping testing data to feature space
    ridgePredictedDataY = np.matmul(calculatPhi(mus,sigma,testingDataX),weight)
    # and commpare the predicted temperature with the true temperature calculate teating error
    testErr = calerror(testingDataX,testingDataY,ridgePredictedDataY)
    # plot two figure(original data and predicted data)
    # for multiple trail set makefig = 0, trail without plot these figure
    if makeFig:
        fig = plt.figure(figsize = (9,4))
        ax = fig.add_subplot(121, projection='3d') # get Axes3D object for subplot
        ax.scatter(testingDataX[:,0],testingDataX[:,1],testingDataY)
        ax.set_title("original testing data")
        ax.set_xlabel('Latitude',fontsize=12, labelpad=4.2)
        ax.set_ylabel('Longitude',fontsize=12, labelpad=4.2)
        ax.set_zlabel('Temperature',fontsize=12)

        ax = fig.add_subplot(122, projection='3d') # get Axes3D object for subplot
        ax.scatter(testingDataX[:,0],testingDataX[:,1],ridgePredictedDataY)
        ax.set_title(str(gaussianNum) + " gaussian basis functions testing data prediction")
        ax.set_xlabel('Latitude',fontsize=12, labelpad=4.2)
        ax.set_ylabel('Longitude',fontsize=12, labelpad=4.2)
        ax.set_zlabel('Temperature',fontsize=12)
        plt.show()
    # trainErr,testErr for further processing
    return trainErr,testErr

# plot 3d result
trainErr,testErr = oneTrail(15,dataSet,1)

# plot error with number of gaussian basis function in use
gaussianNum = [2,5,10,15,20]
trainErrs = []
testErrs = []
for num in gaussianNum:
    # do not plot 3d result
    trainErr,testErr = oneTrail(num,dataSet,0)
    trainErrs.append(trainErr)
    testErrs.append(testErr)
    print("number of gbfs:[",num,"], Training Error:[",trainErr,"], Testing Error:[",testErr,"];")

plt.plot(gaussianNum,trainErrs)
plt.plot(gaussianNum,testErrs)
plt.xlabel('number of gaussian basis function')
plt.ylabel('error')
plt.show()

你可能感兴趣的:([Python] 线性回归应用Multivariate Gaussian basis functions笔记)