Linear regression models include: ordinary (simple and multiple) linear regression in its general form, Ridge regression with an L2 penalty, Lasso regression with an L1 penalty, ElasticNet regression combining the L1 and L2 penalties (a blend of Lasso and Ridge), and logistic regression.
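All of these estimators live in sklearn.linear_model and share the same fit/predict/score interface; a minimal sketch on synthetic data (the coefficients and noise level are illustrative, not from the text):

import numpy as np
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet

rng = np.random.RandomState(0)
X = rng.randn(100, 3)
y = X @ np.array([1.5, -2.0, 0.7]) + 0.1 * rng.randn(100)

# Each regressor is fitted and evaluated through the same interface.
for model in (LinearRegression(), Ridge(alpha=1.0), Lasso(alpha=0.1),
              ElasticNet(alpha=0.1, l1_ratio=0.5)):
    model.fit(X, y)
    print(type(model).__name__, model.score(X, y))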
from sklearn.linear_model import LinearRegression
LinearRegression(fit_intercept=True, normalize=False, copy_X=True, n_jobs=1)
Parameters:
fit_intercept: boolean. Whether to compute the intercept b; if False, no intercept is used.
normalize: boolean. If True, the regressors are normalized before fitting.
copy_X: boolean. If True, a copy of the training data is made.
n_jobs: integer. The number of CPU jobs to use for the computation.
Attributes:
coef_: the weight vector (estimated coefficients).
intercept_: the intercept b.
Methods:
fit(X, y): train the model.
predict(X): predict with the trained model.
score(X, y): return the coefficient of determination R^2 of the prediction (see the score formula below).
from sklearn.linear_model import Ridge
Ridge(alpha=1.0, fit_intercept=True, normalize=False, copy_X=True, max_iter=None,
      tol=1e-3, solver='auto', random_state=None)
Parameters:
alpha: regularization coefficient; the larger the value, the more weight the penalty term carries. A practical tuning recipe: start with alpha = 0 to settle on a good learning rate first; once the learning rate is fixed, set alpha to a small value, then increase or decrease it by factors of 10 according to accuracy on a validation set. The factor-of-10 steps are the coarse search; once the right order of magnitude is found, fine-tune within that order of magnitude (see the search sketch after the methods listing below).
fit_intercept: boolean. Whether to compute the intercept b; if False, no intercept is computed.
normalize: boolean. If True, the data is normalized before training. Normalization has two benefits: (1) it speeds up model convergence, reducing the time needed to find the optimum; (2) it can improve model accuracy.
copy_X: boolean. If True, a copy of the training data is made.
max_iter: integer. The maximum number of iterations. If None, the solver's default is used.
tol: threshold used to decide whether the iterations have converged, i.e. whether the required precision has been reached.
solver: string. The algorithm used to solve the optimization problem.
(1) solver='auto': choose the algorithm automatically based on the dataset.
(2) solver='svd': compute the solution via singular value decomposition.
(3) solver='cholesky': obtain the solution via scipy.linalg.solve.
(4) solver='sparse_cg': obtain the solution via scipy.sparse.linalg.cg.
(5) solver='sag': solve the optimization problem with Stochastic Average Gradient descent.
random_state: an integer, a RandomState instance, or None. Used when solver="sag".
(1) If an integer, it is the seed of the random number generator.
(2) If a RandomState instance, it is used as the random number generator.
(3) If None, the default random number generator is used.
Attributes:
coef_: the weight vector.
intercept_: the intercept b.
n_iter_: the actual number of iterations performed by the solver (None for some solvers).
Methods:
fit(X, y): train the model.
predict(X): predict with the trained model.
score(X, y): return the R^2 score of the prediction.
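The coarse-to-fine alpha search described under the alpha parameter can be scripted directly; a minimal sketch (the diabetes dataset and both grids are illustrative choices, not prescribed by the text):

import numpy as np
from sklearn import datasets, linear_model
from sklearn.model_selection import train_test_split

diabetes = datasets.load_diabetes()
X_train, X_val, y_train, y_val = train_test_split(
    diabetes.data, diabetes.target, test_size=0.2, random_state=0)

def val_score(alpha):
    # Fit Ridge with the given alpha and score it on the validation split.
    return linear_model.Ridge(alpha=alpha).fit(X_train, y_train).score(X_val, y_val)

# Coarse search: step alpha by factors of 10.
coarse_grid = [10.0 ** k for k in range(-3, 4)]
best_coarse = max(coarse_grid, key=val_score)

# Fine search within the winning order of magnitude.
fine_grid = np.linspace(best_coarse / 2, best_coarse * 5, 10)
best_fine = max(fine_grid, key=val_score)
print('coarse best alpha:', best_coarse, 'fine best alpha:', best_fine)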
from sklearn.linear_model import Lasso
Lasso(alpha=1.0, fit_intercept=True, normalize=False, precompute=False,
      copy_X=True, max_iter=1000, tol=1e-4, warm_start=False, positive=False,
      random_state=None, selection='cyclic')
Parameters:
alpha, fit_intercept, normalize, copy_X, max_iter, tol, random_state: same meanings as in Ridge above.
precompute: boolean or array-like. Whether to use a precomputed Gram matrix to speed up calculations.
warm_start: boolean. If True, reuse the solution of the previous call to fit as initialization.
positive: boolean. If True, force the coefficients to be positive.
selection: string, 'cyclic' or 'random'. If 'random', a random coefficient is updated at each iteration rather than looping over features sequentially.
Attributes:
coef_: the weight vector.
intercept_: the intercept b.
n_iter_: the actual number of iterations performed.
Methods:
fit(X, y): train the model.
predict(X): predict with the trained model.
score(X, y): return the R^2 score of the prediction.
The regularization term is: alpha * l1_ratio * ||w||_1 + 0.5 * alpha * (1 - l1_ratio) * ||w||_2^2
from sklearn.linear_model import ElasticNet
ElasticNet(alpha=1.0, l1_ratio=0.5, fit_intercept=True, normalize=False,
           precompute=False, max_iter=1000, copy_X=True, tol=1e-4,
           warm_start=False, positive=False, random_state=None, selection='cyclic')
Parameters:
alpha: the overall strength of the regularization term above.
l1_ratio: the mixing ratio between the L1 and L2 penalties: l1_ratio=1 gives the Lasso penalty, l1_ratio=0 the Ridge penalty.
The remaining parameters have the same meanings as in Lasso.
Attributes:
coef_: the weight vector.
intercept_: the intercept b.
n_iter_: the actual number of iterations performed.
Methods:
fit(X, y): train the model.
predict(X): predict with the trained model.
score(X, y): return the R^2 score of the prediction.
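A minimal ElasticNet fit in the same style as the walk-throughs below (the alpha and l1_ratio values are illustrative, not tuned):

from sklearn import datasets, linear_model
from sklearn.model_selection import train_test_split

diabetes = datasets.load_diabetes()
X_train, X_test, y_train, y_test = train_test_split(
    diabetes.data, diabetes.target, test_size=0.2, random_state=0)

# l1_ratio=0.5 weights the L1 and L2 penalties equally in the formula above.
regr = linear_model.ElasticNet(alpha=0.1, l1_ratio=0.5)
regr.fit(X_train, y_train)
print('Coefficients:{}, intercept:{}'.format(regr.coef_, regr.intercept_))
print('Scores:{}'.format(regr.score(X_test, y_test)))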
The score is computed as: score = 1 - u / v, where u = ((y_true - y_pred) ** 2).sum() and v = ((y_true - y_true.mean()) ** 2).sum().
The best possible score is 1, but it can be negative (when the predictions are arbitrarily poor). The larger the score, the better the predictive performance.
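The relationship between score and the u/v formula can be checked directly; a small sketch using LinearRegression on the diabetes data:

from sklearn import datasets, linear_model
from sklearn.model_selection import train_test_split

diabetes = datasets.load_diabetes()
X_train, X_test, y_train, y_test = train_test_split(
    diabetes.data, diabetes.target, test_size=0.2, random_state=0)
regr = linear_model.LinearRegression().fit(X_train, y_train)

y_pred = regr.predict(X_test)
u = ((y_test - y_pred) ** 2).sum()         # residual sum of squares
v = ((y_test - y_test.mean()) ** 2).sum()  # total sum of squares
print(1 - u / v)                           # manual computation of R^2
print(regr.score(X_test, y_test))          # the same value from sklearn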
from sklearn.linear_model import LogisticRegression
LogisticRegression(penalty='l2', dual=False, tol=1e-4, C=1.0,
                   fit_intercept=True, intercept_scaling=1, class_weight=None,
                   random_state=None, solver='liblinear', max_iter=100,
                   multi_class='ovr', verbose=0, warm_start=False, n_jobs=1)
Parameters:
penalty: string, 'l1' or 'l2'. The norm used in the regularization term; with this signature only solver='liblinear' supports 'l1'.
dual: boolean. Whether to solve the dual formulation (only for the 'l2' penalty with 'liblinear').
C: float. The inverse of the regularization strength; smaller values mean stronger regularization.
intercept_scaling: float. Only useful with solver='liblinear' when fit_intercept=True.
class_weight: dict or 'balanced'. Weights associated with the classes.
solver: string. The optimization algorithm: 'liblinear', 'newton-cg', 'lbfgs' or 'sag'.
multi_class: string, 'ovr' or 'multinomial'. The strategy for multi-class problems.
The remaining parameters (tol, fit_intercept, max_iter, random_state, verbose, warm_start, n_jobs) have the same meanings as above.
Attributes:
coef_: the weight vector.
intercept_: the intercept b.
n_iter_: the actual number of iterations performed.
Methods:
fit(X, y): train the model.
predict(X): predict class labels.
predict_proba(X): return probability estimates for each class.
score(X, y): return the mean accuracy on the given test data.
Multiple linear regression
import numpy as np
from sklearn import datasets, linear_model
from sklearn.model_selection import train_test_split

def load_data():
    diabetes = datasets.load_diabetes()
    return train_test_split(diabetes.data, diabetes.target, test_size=0.2)

def test_LinearRegression(*data):
    X_train, X_test, y_train, y_test = data
    regr = linear_model.LinearRegression()
    regr.fit(X_train, y_train)
    print('Coefficients:{}, intercept:{}'.format(regr.coef_, regr.intercept_))
    # Mean squared error on the test set (the square belongs inside the mean).
    print('Residual sum of squares:{}'.format(np.mean((regr.predict(X_test) - y_test) ** 2)))
    print('Scores:{}'.format(regr.score(X_test, y_test)))

if __name__ == '__main__':
    X_train, X_test, y_train, y_test = load_data()
    test_LinearRegression(X_train, X_test, y_train, y_test)
Lasso regression
import matplotlib.pyplot as plt
import numpy as np
from sklearn import datasets, linear_model
from sklearn.model_selection import train_test_split

def load_data():
    diabetes = datasets.load_diabetes()
    return train_test_split(diabetes.data, diabetes.target, test_size=0.2)

def test_Lasso(*data):
    X_train, X_test, y_train, y_test = data
    regr = linear_model.Lasso()
    regr.fit(X_train, y_train)
    print('Coefficients:{}, intercept:{}'.format(regr.coef_, regr.intercept_))
    print('Residual sum of squares:{}'.format(np.mean((regr.predict(X_test) - y_test) ** 2)))
    print('Scores:{}'.format(regr.score(X_test, y_test)))

def test_Lasso_alpha(*data):
    X_train, X_test, y_train, y_test = data
    alphas = [0.01, 0.02, 0.05, 0.1, 0.2, 0.5, 1, 2, 5, 10, 20, 50, 100, 200, 500, 1000]
    scores = []
    for alpha in alphas:
        regr = linear_model.Lasso(alpha=alpha)
        regr.fit(X_train, y_train)
        scores.append(regr.score(X_test, y_test))
    # Plot the test score against alpha on a logarithmic axis.
    fig = plt.figure()
    ax = fig.add_subplot(1, 1, 1)
    ax.plot(alphas, scores)
    ax.set_xlabel("alpha")
    ax.set_ylabel("score")
    ax.set_xscale("log")
    ax.set_title('Lasso')
    plt.show()

if __name__ == '__main__':
    X_train, X_test, y_train, y_test = load_data()
    test_Lasso(X_train, X_test, y_train, y_test)
    test_Lasso_alpha(X_train, X_test, y_train, y_test)
Ridge regression
import matplotlib.pyplot as plt
import numpy as np
from sklearn import datasets, linear_model
from sklearn.model_selection import train_test_split

def load_data():
    diabetes = datasets.load_diabetes()
    return train_test_split(diabetes.data, diabetes.target, test_size=0.2, random_state=0)

def test_Ridge(*data):
    X_train, X_test, y_train, y_test = data
    regr = linear_model.Ridge()
    regr.fit(X_train, y_train)
    print('Coefficients:{}, intercept:{}'.format(regr.coef_, regr.intercept_))
    print('Residual sum of squares:{}'.format(np.mean((regr.predict(X_test) - y_test) ** 2)))
    print('Scores:{}'.format(regr.score(X_test, y_test)))

def test_Ridge_alpha(*data):
    X_train, X_test, y_train, y_test = data
    alphas = [0.01, 0.02, 0.05, 0.1, 0.2, 0.5, 1, 2, 5, 10, 20, 50, 100, 200, 500, 1000]
    scores = []
    for alpha in alphas:
        regr = linear_model.Ridge(alpha=alpha)
        regr.fit(X_train, y_train)
        scores.append(regr.score(X_test, y_test))
    # Plot the test score against alpha on a logarithmic axis.
    fig = plt.figure()
    ax = fig.add_subplot(1, 1, 1)
    ax.plot(alphas, scores)
    ax.set_xlabel("alpha")
    ax.set_ylabel("score")
    ax.set_xscale("log")
    ax.set_title('Ridge')
    plt.show()

if __name__ == '__main__':
    X_train, X_test, y_train, y_test = load_data()
    test_Ridge(X_train, X_test, y_train, y_test)
    test_Ridge_alpha(X_train, X_test, y_train, y_test)
Logistic regression
from math import exp
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split

def create_data():
    iris = load_iris()
    df = pd.DataFrame(iris.data, columns=iris.feature_names)
    df['label'] = iris.target
    df.columns = ['sepal length', 'sepal width', 'petal length', 'petal width', 'label']
    # Keep the first two classes (rows 0-99) and the first two features.
    data = np.array(df.iloc[:100, [0, 1, -1]])
    return data[:, :2], data[:, -1]

X, y = create_data()
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

class LogisticRegressionClassifier():  # hand-written implementation
    def __init__(self, max_iter=700, learning_rate=0.01):
        self.max_iter = max_iter
        self.learning_rate = learning_rate

    def sigmoid(self, x):
        return 1 / (1 + exp(-x))

    def data_matrix(self, X):
        # Prepend a constant 1 to each sample so the bias is folded into the weights.
        data_mat = []
        for d in X:
            data_mat.append([1.0, *d])
        return data_mat

    def fit(self, X, y):
        data_mat = self.data_matrix(X)
        self.weights = np.zeros((len(data_mat[0]), 1), dtype=np.float32)
        # Stochastic gradient ascent on the log-likelihood, one sample at a time.
        for iter_ in range(self.max_iter):
            for i in range(len(X)):
                result = self.sigmoid(np.dot(data_mat[i], self.weights))
                error = y[i] - result
                self.weights += self.learning_rate * error * np.transpose([data_mat[i]])
        print('LogisticRegression Model(learning_rate={}, max_iter={})'.format(
            self.learning_rate, self.max_iter))

    def score(self, X_test, y_test):
        right = 0
        X_test = self.data_matrix(X_test)
        for x, y in zip(X_test, y_test):
            result = np.dot(x, self.weights)
            # sigmoid(result) > 0.5 exactly when result > 0, so threshold at 0.
            if (result > 0 and y == 1) or (result < 0 and y == 0):
                right += 1
        return right / len(X_test)

lr_clf = LogisticRegressionClassifier()
lr_clf.fit(X_train, y_train)
lr_clf.score(X_test, y_test)

# Plot the decision boundary w0 + w1*x + w2*y = 0 together with the two classes.
x_points = np.arange(4, 8)
y_ = -(lr_clf.weights[1] * x_points + lr_clf.weights[0]) / lr_clf.weights[2]
plt.plot(x_points, y_)
plt.scatter(X[:50, 0], X[:50, 1], label='0')
plt.scatter(X[50:, 0], X[50:, 1], label='1')
plt.legend()
# Using the sklearn implementation instead of the hand-written classifier:
from sklearn.linear_model import LogisticRegression
logistic_model = LogisticRegression()
logistic_model.fit(X_train, y_train)
logistic_model.score(X_test, y_test)