数据来自California Coastal Regional Ocean Modeling System project [ROMSDatasetTempField]。本实验尝试用经纬度应用不同数量的Gaussian basis functions预测温度。
import numpy as np
import scipy.io as scio
import sklearn.preprocessing as sklp
import math
from matplotlib import pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
# lambda of ridge regression
regularizer = 0.01
# temperature data of east coast of USA
dataSet = scio.loadmat("field.mat")
# take a part of data from original dataset as training data or testing data
def generateData(offset,dataSet,scaler):
# get all latitude
latScale = dataSet['LatScale']
# get all longitude
LonScale = dataSet['LonScale']
latSubSet = []
lonSubSet = []
# take one set of data every 7 original data
for i in range (0 + offset,175 + offset,7):
latSubSet.append([i,latScale[i][0]])
lonSubSet.append([i,LonScale[i][0]])
tempField = dataSet['TempField']
dataY = []
dataX = []
# separate the position and temperature
for lat in latSubSet:
for lon in lonSubSet:
temperature = tempField[lat[0]][lon[0]][0]
if np.isnan(temperature):
temperature = 0
if np.isnan(lat[1]):
lat[1] = 0
if np.isnan(lon[1]):
lon[1] = 0
dataX.append([lat[1],lon[1]])
dataY.append(temperature)
# normlize data and use same mean
dataX = np.array(dataX)
if scaler == "":
scaler = sklp.StandardScaler()
scaler.fit(dataX)
dataX = scaler.transform(dataX)
b = np.ones(dataX.shape[0])
dataY = np.array(dataY)
# variance to normlize another aet of data
return dataX,dataY,scaler
# ridge regression: train and predict
def ridgeReg(trainingDataX,trainingDataY):
# weight = (phi.T * phi)^-1 * phi * y
weight = np.matmul(np.matmul(np.linalg.inv(np.matmul(trainingDataX.T , trainingDataX) - regularizer * np.eye(trainingDataX.shape[1])) , trainingDataX.T), trainingDataY)
return weight
# gaussian basis function
def gaussianBasisFunc(mu,s,data):
return math.exp(-0.5*np.matmul(np.matmul((data-mu),np.linalg.inv(s)),(data-mu).T))
# mean squared error
def calerror(dataX, dataY,predictedDataY):
error = 0
for i in range(0,dataX.shape[0]):
error += np.power(np.linalg.norm(predictedDataY[i] - dataY[i]), 2)
error /= float(dataX.shape[0])
return error
# mapping function. map all data to feature
def calculatPhi(mus,sigma,DataX,):
# calculate Phi
# calculate gbf of each mu with all data
phi = []
for i in range(len(mus)):
mu = mus[i, :]
phiDatas = []
for data in DataX:
phiData = gaussianBasisFunc(mu, sigma, data)
phiDatas.append(phiData)
phi.append(phiDatas)
phi = np.array(phi)
phi = phi.T
return phi
# one train and test trail
def oneTrail(gaussianNum,dataSet,makeFig):
# generate training data while keep scaler
trainingDataX,trainingDataY,scaler = generateData(0,dataSet,"")
# generate testing data while using previous scaler
testingDataX,testingDataY,scaler = generateData(1,dataSet,scaler)
# set mus uniform in the data space
mus = []
for i in range(gaussianNum+1):
for j in range(gaussianNum+1):
# in this case data space is a square
mu = [-1.5 + i*(3/gaussianNum),-1.5 + j*(3/gaussianNum)]
mus.append(mu)
mus = np.array(mus)
# calculate width of gbf
h = mus[2,1]-mus[1,1]
# it can be assumed that one sigma for all gaussian basis function
sigma = np.diag(np.array(0.5 * np.array([h,h])))
# mapping data
phi = calculatPhi(mus,sigma,trainingDataX)
# train the model with ridge regression
weight = ridgeReg(phi,trainingDataY)
# calculate predicted temperature with training data
ridgeFitDataY = np.matmul(phi,weight)
# and compare the predicted temperature with the true temperature calculate training error
trainErr = calerror(trainingDataX,trainingDataY,ridgeFitDataY)
# calculate the predicted temperature with the test temperature
#mapping testing data to feature space
ridgePredictedDataY = np.matmul(calculatPhi(mus,sigma,testingDataX),weight)
# and commpare the predicted temperature with the true temperature calculate teating error
testErr = calerror(testingDataX,testingDataY,ridgePredictedDataY)
# plot two figure(original data and predicted data)
# for multiple trail set makefig = 0, trail without plot these figure
if makeFig:
fig = plt.figure(figsize = (9,4))
ax = fig.add_subplot(121, projection='3d') # get Axes3D object for subplot
ax.scatter(testingDataX[:,0],testingDataX[:,1],testingDataY)
ax.set_title("original testing data")
ax.set_xlabel('Latitude',fontsize=12, labelpad=4.2)
ax.set_ylabel('Longitude',fontsize=12, labelpad=4.2)
ax.set_zlabel('Temperature',fontsize=12)
ax = fig.add_subplot(122, projection='3d') # get Axes3D object for subplot
ax.scatter(testingDataX[:,0],testingDataX[:,1],ridgePredictedDataY)
ax.set_title(str(gaussianNum) + " gaussian basis functions testing data prediction")
ax.set_xlabel('Latitude',fontsize=12, labelpad=4.2)
ax.set_ylabel('Longitude',fontsize=12, labelpad=4.2)
ax.set_zlabel('Temperature',fontsize=12)
plt.show()
# trainErr,testErr for further processing
return trainErr,testErr
# plot 3d result
trainErr,testErr = oneTrail(15,dataSet,1)
# plot error with number of gaussian basis function in use
gaussianNum = [2,5,10,15,20]
trainErrs = []
testErrs = []
for num in gaussianNum:
# do not plot 3d result
trainErr,testErr = oneTrail(num,dataSet,0)
trainErrs.append(trainErr)
testErrs.append(testErr)
print("number of gbfs:[",num,"], Training Error:[",trainErr,"], Testing Error:[",testErr,"];")
plt.plot(gaussianNum,trainErrs)
plt.plot(gaussianNum,testErrs)
plt.xlabel('number of gaussian basis function')
plt.ylabel('error')
plt.show()