当我们的数据特征比样本点还多时怎么办,还能进行预测吗?答案是否定的。
那么如何解决这个问题呢?科学家们引入了岭回归这个概念,岭回归其实就是如下:
与前面的算法相比,这里通过预测误差的最小化来确定参数λ:首先抽取一部分数据用于测试,剩下的数据作为训练集,用于训练回归系数w。
def ridgeRegres(xMat, yMat, lam=0.2):
    """Solve ridge regression: ws = (X^T X + lam*I)^-1 X^T y.

    xMat: (m x n) numpy matrix of features.
    yMat: (m x 1) numpy matrix of targets.
    lam: ridge penalty; adding lam*I on the diagonal keeps the matrix
         invertible even when features outnumber samples.
    Returns the (n x 1) weight matrix, or None if the matrix is still
    singular (only possible when lam == 0).
    """
    xTx = xMat.T * xMat
    # Penalty term lam*I makes xTx + lam*I (normally) invertible.
    denom = xTx + eye(shape(xMat)[1]) * lam
    if linalg.det(denom) == 0.0:
        # print() call works under both Python 2 and 3 (the original
        # used the Python-2-only print statement).
        print("this matrix is singular")
        return
    ws = denom.I * (xMat.T * yMat)
    return ws
def ridgeTest(xArr, yArr):
    """Run ridge regression over 30 exponentially spaced lambdas.

    Standardizes the features (zero mean, division by column variance)
    and centers the target, then solves for the weights at
    lam = exp(i - 10) for i in 0..29.

    Returns a (30 x n) array; row i holds the weights for lam = exp(i-10).
    """
    xMat = mat(xArr)
    yMat = mat(yArr).T
    # Center the target so the intercept is absorbed by the mean and is
    # not penalized.  (The original computed yMean but never subtracted
    # it -- that was a bug.)
    yMean = mean(yMat, 0)
    yMat = yMat - yMean
    # Standardize each feature column.
    xMeans = mean(xMat, 0)
    xVar = var(xMat, 0)
    xMat = (xMat - xMeans) / xVar
    numTestPts = 30
    wMat = zeros((numTestPts, shape(xMat)[1]))
    for i in range(numTestPts):  # a different lambda on each pass
        ws = ridgeRegres(xMat, yMat, exp(i - 10))
        wMat[i, :] = ws.T
    return wMat
代码的主要变化在于Lambda:用30个不同的Lambda值进行实验,得到一个权重矩阵,每一行对应一个Lambda值所求出的权重。
# --- Interactive session: visualize the ridge trace ---------------------
# Reload the module under development, load the abalone data set, and
# recompute the ridge weights at the 30 lambda values.
reload(regression)
abX,abY = regression.loadDataSet('abalone.txt')
ridgeWeights = regression.ridgeTest(abX,abY)
# Plot the weight matrix: each line traces one feature's coefficient
# across the 30 lambdas (the x-axis is the row index, i.e. log(lambda)
# shifted -- smallest lambda at the left).
import matplotlib.pyplot as plt
fig = plt.figure()
ax = fig.add_subplot(111)
ax.plot(ridgeWeights)
plt.show()
可以看到,最左边即Lambda最小时,系数与普通线性回归的系数一致;随着Lambda增大,系数会逐渐缩减并趋近于0。因此需要在中间部分找到预测效果最佳的Lambda值作为参数。
通过对最小二乘增加约束条件,可以得到和岭回归一样的公式:
虽然约束形式只是稍作变化,但结果却大相径庭:在Lambda足够小的时候,部分系数会被压缩到0,这可以帮助我们更好地理解数据。
def rssError(yArr, yHatArr):
    """Return the residual sum of squares between two numpy arrays."""
    residuals = yArr - yHatArr
    return (residuals ** 2).sum()
def regularize(xMat):
    """Standardize each column of xMat: subtract the column mean and
    divide by the column variance.

    Note: division is by the variance (not the standard deviation),
    matching the behavior callers in this file rely on.
    """
    normalized = xMat.copy()
    colMeans = mean(normalized, 0)  # per-column mean
    colVars = var(normalized, 0)    # per-column variance
    normalized = (normalized - colMeans) / colVars
    return normalized
def stageWise(xArr, yArr, eps=0.01, numIt=100):
    """Forward stagewise linear regression.

    Greedily nudges one weight by +/-eps per iteration, keeping the
    single change that most reduces the residual sum of squares.

    xArr, yArr: training features / targets.
    eps: step size applied to one weight per iteration.
    numIt: number of iterations to run.
    Returns a (numIt x n) array of the weight vector after each iteration.

    NOTE(review): the source was truncated after the `rssE` comparison;
    the remainder of the body is restored here from the standard
    reference implementation -- confirm against the original listing.
    """
    xMat = mat(xArr)
    yMat = mat(yArr).T
    yMean = mean(yMat, 0)
    yMat = yMat - yMean          # center the target
    xMat = regularize(xMat)      # standardize the features
    m, n = shape(xMat)
    returnMat = zeros((numIt, n))
    ws = zeros((n, 1))
    wsMax = ws.copy()
    for i in range(numIt):
        print(ws.T)              # show progress each iteration
        lowestError = inf
        for j in range(n):
            for sign in [-1, 1]:
                # Try nudging weight j by eps in each direction and keep
                # the candidate with the lowest squared error.
                wsTest = ws.copy()
                wsTest[j] += eps * sign
                yTest = xMat * wsTest
                rssE = rssError(yMat.A, yTest.A)
                if rssE < lowestError:
                    lowestError = rssE
                    wsMax = wsTest
        ws = wsMax.copy()
        returnMat[i, :] = ws.T
    return returnMat
>>> reload(regression)
>>> xArr,yArr = regression.loadDataSet('abalone.txt')
>>> regression.stageWise(xArr,yArr,0.01,200)
def searchForSet(retX, retY, setNum, yr, numPce, origPrc):
    """Fetch shopping-API results for one LEGO set and append samples.

    Appends [year, pieceCount, newFlag, originalPrice] rows to retX and
    the observed selling price to retY.  Listings priced below half the
    original price are discarded (presumably incomplete sets -- the
    threshold is a heuristic from the original author).
    """
    sleep(10)  # throttle between API calls
    # NOTE(review): hard-coded API key checked into source -- move it
    # to configuration / environment before publishing.
    myAPIstr = 'AIzaSyD2cR2KFyx12hXu6PFU-wrWot3NXvko8vY'
    searchURL = 'file:///E:/pythonProject/ML/LinearRegrossion/setHtml/?key=%s&country=US&q=lego+%d&alt=json' % (myAPIstr, setNum)
    pg = urllib2.urlopen(searchURL)
    retDict = json.loads(pg.read())
    for i in range(len(retDict['items'])):
        try:
            currItem = retDict['items'][i]
            # 1 if the listing is for a new set, 0 otherwise.
            newFlag = 1 if currItem['product']['condition'] == 'new' else 0
            listOfInv = currItem['product']['inventories']
            for item in listOfInv:
                sellingPrice = item['price']
                if sellingPrice > origPrc * 0.5:
                    print("%d\t%d\t%d\t%f\t%f" % (yr, numPce, newFlag, origPrc, sellingPrice))
                    retX.append([yr, numPce, newFlag, origPrc])
                    retY.append(sellingPrice)
        except (KeyError, IndexError, TypeError):
            # Skip malformed items; the original bare `except:` silently
            # swallowed every error, including programming mistakes.
            print('problem with item %d' % i)
def setDataCollect(retX, retY):
    """Scrape all five LEGO sets, accumulating samples into retX/retY."""
    # (setNum, year, pieceCount, originalPrice) for each set of interest.
    lego_sets = [
        (8288, 2006, 800, 49.99),
        (10030, 2002, 3096, 269.99),
        (10179, 2007, 5195, 499.99),
        (10181, 2007, 3428, 199.99),
        (10189, 2008, 5922, 299.99),
    ]
    for setNum, yr, numPce, origPrc in lego_sets:
        searchForSet(retX, retY, setNum, yr, numPce, origPrc)
def crossValidation(xArr,yArr,numVal=10):
m = len(yArr)
indexList = range(m)
errorMat = zeros((numVal,30))
for i in range(numVal):
trainX=[];trainY=[]
testX =[];testY=[]
random.shuffle(indexList)
for j in range(m):
if j