import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.impute import SimpleImputer as Imputer
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.compose import ColumnTransformer
dataset = pd.read_csv("Salary_Data.csv")
x = dataset.iloc[:, :-1].values
y = dataset.iloc[:, 1].values
# Handle missing data (SimpleImputer expects 2-D input, hence the 0:1 slice)
imputer = Imputer(missing_values=np.nan, strategy='mean')
imputer = imputer.fit(x[:, 0:1])
x[:, 0:1] = imputer.transform(x[:, 0:1])
"""# 用虚拟变量将文本数据x转化为三维数字数据(非必要)
ct = ColumnTransformer([('one_hot_encoder', OneHotEncoder(), [0])], remainder='passthrough')
x = ct.fit_transform(x)
# 用LabelEncoder方法将文本数据y转化为一维数字数据
labelencoder_y = LabelEncoder();
y = labelencoder_y.fit_transform(y)
"""
# Split the data into a training set and a test set
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=1/3, random_state=0)
"""# 数据的特征缩放(非必要)
from sklearn.preprocessing import StandardScaler
sc_x = StandardScaler()
x_train = sc_x.fit_transform(x_train)
x_test = sc_x.transform(x_test)
"""
# Fit simple linear regression on the training set
from sklearn.linear_model import LinearRegression
regressor = LinearRegression()
regressor.fit(x_train, y_train)
# Predict the test set with the fitted regressor
y_pred = regressor.predict(x_test)
# Visualize the results: test-set points against the line fitted on the training set
plt.scatter(x_test, y_test, color="red")
plt.plot(x_train, regressor.predict(x_train), color="blue")
plt.title("Salary VS Experience (Test set)")
plt.xlabel("Experience")
plt.ylabel("Salary")
plt.show()
Simple linear regression has a single independent variable; multiple linear regression has several, and the model can be written as a vector inner product (see the equation below).
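Written out with $n$ predictors, the model is

$$y = b_0 + b_1 x_1 + b_2 x_2 + \dots + b_n x_n = b_0 + \mathbf{b}^\top \mathbf{x},$$

i.e. an intercept $b_0$ plus the inner product of the coefficient vector $\mathbf{b}$ with the feature vector $\mathbf{x}$.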
Categorical text features with several category values must first be converted to numeric data, typically as dummy variables.
The dummy variable trap (multicollinearity): because the regression equation already contains a constant term, a categorical variable with x categories should contribute only x-1 dummy variables to the final equation; including all x makes the dummies linearly dependent on the intercept and causes multicollinearity.
Note: common Python libraries can generally handle this automatically; a sketch of doing it explicitly at encoding time follows.
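A minimal sketch of avoiding the trap with OneHotEncoder's drop='first' option (the toy feature matrix and category values here are hypothetical, not from the course data):

import numpy as np
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

# Toy feature matrix: one categorical column with 3 categories, one numeric column.
x = np.array([['New York', 1.2], ['California', 3.4], ['Florida', 5.6]], dtype=object)

# drop='first' emits only x-1 = 2 dummy columns, so the intercept stays identifiable.
ct = ColumnTransformer(
    [('one_hot_encoder', OneHotEncoder(drop='first'), [0])],
    remainder='passthrough')
x_encoded = ct.fit_transform(x)  # shape (3, 3): 2 dummy columns + the numeric column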
When choosing an evaluation strategy, note that n candidate variables yield $2^n - 1$ possible (non-empty) models, so exhaustive all-subsets selection quickly becomes expensive; a brute-force sketch follows.
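For illustration only, a brute-force all-subsets search scored by adjusted $R^2$ (plain $R^2$ would always pick the full model, since it never decreases when variables are added). The function names are my own sketch, assuming a purely numeric feature matrix, not part of the original course code:

from itertools import combinations
import numpy as np
from sklearn.linear_model import LinearRegression

def adjusted_r2(r2, n_samples, n_predictors):
    # Standard adjusted R^2; penalizes the number of predictors.
    return 1 - (1 - r2) * (n_samples - 1) / (n_samples - n_predictors - 1)

def best_subset(x, y):
    # Fit all 2^n - 1 non-empty column subsets and keep the best adjusted R^2.
    n = x.shape[1]
    best_score, best_cols = -np.inf, None
    for k in range(1, n + 1):
        for cols in combinations(range(n), k):
            cols = list(cols)
            r2 = LinearRegression().fit(x[:, cols], y).score(x[:, cols], y)
            score = adjusted_r2(r2, x.shape[0], k)
            if score > best_score:
                best_score, best_cols = score, cols
    return best_cols, best_score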
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.impute import SimpleImputer as Imputer
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.compose import ColumnTransformer
dataset = pd.read_csv("50_Startups.csv")
x = dataset.iloc[:, :-1].values
y = dataset.iloc[:, 4].values
# Handle missing data in the three numeric columns
imputer = Imputer(missing_values=np.nan, strategy='mean')
imputer = imputer.fit(x[:, 0:3])
x[:, 0:3] = imputer.transform(x[:, 0:3])
# One-hot encode the categorical column (index 3) of x into three dummy variables
ct = ColumnTransformer([('one_hot_encoder', OneHotEncoder(), [3])], remainder='passthrough')
x = ct.fit_transform(x)
# Avoid the dummy variable trap: drop the first dummy column
x = x[:, 1:]
# Split the data into a training set and a test set
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=0)
"""# 数据的特征缩放(非必要)
from sklearn.preprocessing import StandardScaler
sc_x = StandardScaler()
x_train = sc_x.fit_transform(x_train)
x_test = sc_x.transform(x_test)
"""
# Fit multiple linear regression on the training set
from sklearn.linear_model import LinearRegression
regressor = LinearRegression()
regressor.fit(x_train, y_train)
# Predict the test set with the fitted regressor
y_pred = regressor.predict(x_test)
# Use backward elimination to prune statistically insignificant variables
import statsmodels.api as sm
# statsmodels does not add an intercept automatically, so prepend a column of ones
x_train = np.append(arr=np.ones((x_train.shape[0], 1)), values=x_train, axis=1)
x_opt = x_train[:, [0, 1, 2, 3, 4, 5]].astype(float)  # cast to float, otherwise OLS raises an error
regressor_ols = sm.OLS(endog=y_train, exog=x_opt).fit()
regressor_ols.summary()
# Repeatedly drop the predictor with the highest p-value until all remaining ones are significant
x_opt = x_train[:, [0, 1, 3, 4, 5]].astype(float)
regressor_ols = sm.OLS(endog=y_train, exog=x_opt).fit()
regressor_ols.summary()
x_opt = x_train[:, [0, 3, 4, 5]].astype(float)
regressor_ols = sm.OLS(endog=y_train, exog=x_opt).fit()
regressor_ols.summary()
x_opt = x_train[:, [0, 3, 5]].astype(float)
regressor_ols = sm.OLS(endog=y_train, exog=x_opt).fit()
regressor_ols.summary()
x_opt = x_train[:, [0, 3]].astype(float)
regressor_ols = sm.OLS(endog=y_train, exog=x_opt).fit()
regressor_ols.summary()
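The manual elimination rounds above can be automated. A minimal sketch (the function name and the 0.05 significance level are my own choices, not from the original code), assuming x_train already includes the intercept column:

def backward_elimination(x, y, significance_level=0.05):
    # Iteratively drop the column whose coefficient has the largest p-value above the threshold.
    x = x.astype(float)
    cols = list(range(x.shape[1]))
    while True:
        model = sm.OLS(endog=y, exog=x[:, cols]).fit()
        worst = int(np.argmax(model.pvalues))
        if model.pvalues[worst] <= significance_level:
            return cols, model  # all remaining predictors are significant
        del cols[worst]

# Hypothetical usage: kept_columns, final_model = backward_elimination(x_train, y_train)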
Why is polynomial regression still called "linear"?
"Linear" does not mean every independent variable may appear only to the first power; it means the model is linear in its coefficients.
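For example, a degree-2 polynomial model

$$y = b_0 + b_1 x + b_2 x^2$$

is linear in the parameters $b_0, b_1, b_2$: substituting $x_1 = x$ and $x_2 = x^2$ turns it into an ordinary multiple linear regression $y = b_0 + b_1 x_1 + b_2 x_2$, which is exactly what the PolynomialFeatures + LinearRegression combination below implements.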
# Importing the libraries
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
# Importing the dataset
dataset = pd.read_csv('Position_Salaries.csv')
X = dataset.iloc[:, 1:2].values
y = dataset.iloc[:, 2].values
# Splitting the dataset into the Training set and Test set
"""from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 0)"""
# Feature Scaling
"""from sklearn.preprocessing import StandardScaler
sc_X = StandardScaler()
X_train = sc_X.fit_transform(X_train)
X_test = sc_X.transform(X_test)"""
# Fitting Linear Regression to the dataset
from sklearn.linear_model import LinearRegression
lin_reg = LinearRegression()
lin_reg.fit(X, y)
# Fitting Polynomial Regression to the dataset
from sklearn.preprocessing import PolynomialFeatures
poly_reg = PolynomialFeatures(degree = 4)
X_poly = poly_reg.fit_transform(X)
lin_reg_2 = LinearRegression()
lin_reg_2.fit(X_poly, y)
# Visualising the Linear Regression results
plt.scatter(X, y, color = 'red')
plt.plot(X, lin_reg.predict(X), color = 'blue')
plt.title('Truth or Bluff (Linear Regression)')
plt.xlabel('Position Level')
plt.ylabel('Salary')
plt.show()
# Visualising the Polynomial Regression results
plt.scatter(X, y, color = 'red')
plt.plot(X, lin_reg_2.predict(poly_reg.transform(X)), color = 'blue')
plt.title('Truth or Bluff (Polynomial Regression)')
plt.xlabel('Position Level')
plt.ylabel('Salary')
plt.show()
In a given regression equation, adding a new independent variable never decreases $R^2$, so plain $R^2$ always favors larger models.
Adjusted $R^2$ penalizes the number of predictors $p$: all else being equal, increasing $p$ decreases adjusted $R^2$ (see the formula below).
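The standard adjusted $R^2$, with $n$ samples and $p$ predictors, is

$$\bar{R}^2 = 1 - (1 - R^2)\,\frac{n - 1}{n - p - 1}.$$

statsmodels reports this value as `Adj. R-squared` in the `regressor_ols.summary()` output above.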
Criteria for choosing between models: