3.1 一元线性回归
3.2 多元线性回归
3.3 对数几率回归,线性判别分析(二选一)
3.4 类别不均衡
使用Kaggle房价预测数据集:
import numpy as np
import pandas as pd
# Load the Kaggle house-price training table.
data = pd.read_csv('./train.csv')
# Discard every feature (column) that contains a missing value.
data = data.dropna(axis=1)
# Retain only the features stored as int64.
data = data.loc[:, data.dtypes == 'int64']
data.head()
# Candidate input features and the regression target used in this section.
features = ['LotArea', 'BsmtUnfSF', 'GarageArea']
target = 'SalePrice'
data = data[[*features, target]]
from sklearn.utils import shuffle
# Shuffle the rows. Do not change the seed (32).
data = shuffle(data, random_state=32)
# Show the first five rows after shuffling.
data.head()
# Print the number of samples.
num_of_samples = len(data)
print(num_of_samples)
# Print the size of the training set (first 70% of the shuffled rows).
split_line = int(num_of_samples * 0.7)
print(split_line)
train_data, test_data = data.iloc[:split_line], data.iloc[split_line:]
def get_w(x, y):
    """Return the least-squares slope ``w`` of the line ``y = w * x + b``.

    Uses the closed form
    ``w = sum(y_i * (x_i - mean(x))) / sum((x_i - mean(x)) ** 2)``.
    Samples are paired by position, not by index label.

    Args:
        x: pandas Series holding the single input feature.
        y: pandas Series holding the target values, same length as ``x``.

    Returns:
        The slope as a Python float.

    Raises:
        ValueError: if the inputs are empty, differ in length, or ``x`` has
            zero variance (the slope is then undefined).
    """
    # Float arrays: integer columns cannot overflow when squared, and plain
    # arrays pair samples positionally instead of aligning on index labels.
    xs = x.to_numpy(dtype=float)
    ys = y.to_numpy(dtype=float)
    if xs.size == 0 or xs.size != ys.size:
        raise ValueError("x and y must be non-empty and of equal length")
    centered = xs - xs.mean()
    # The centred sum of squares avoids the cancellation that
    # sum(x^2) - (sum x)^2 / n suffers when the mean dwarfs the spread.
    denominator = (centered * centered).sum()
    if denominator == 0:
        raise ValueError("x has zero variance; the slope is undefined")
    return float((ys * centered).sum() / denominator)
def get_b(x, y, w):
    """Return the intercept ``b`` of ``y = w * x + b`` for a given slope.

    Computed as the mean residual ``mean(y_i - w * x_i)``. Samples are paired
    by position, not by index label.

    Args:
        x: pandas Series holding the single input feature.
        y: pandas Series holding the target values, same length as ``x``.
        w: slope of the fitted line.

    Returns:
        The intercept as a Python float.

    Raises:
        ValueError: if the inputs are empty or differ in length.
    """
    xs = x.to_numpy(dtype=float)
    ys = y.to_numpy(dtype=float)
    if xs.size == 0 or xs.size != ys.size:
        raise ValueError("x and y must be non-empty and of equal length")
    return float((ys - w * xs).mean())
class myLinearRegression:
    """Univariate linear regression ``y = w * x + b`` fitted by least squares."""

    def __init__(self):
        # Slope and intercept; both stay None until fit() has been called.
        self.w = None
        self.b = None

    def fit(self, x, y):
        """Estimate the slope and intercept from a training feature and target.

        Delegates to the module-level ``get_w`` and ``get_b`` helpers and
        stores the results on ``self.w`` and ``self.b``.
        """
        self.w = get_w(x, y)
        self.b = get_b(x, y, self.w)

    def predict(self, x):
        """Return ``w * x + b`` for ``x``.

        If the model has not been fitted, a notice is printed and ``None`` is
        returned instead of raising.
        """
        # Identity test on purpose: `== None` is evaluated element-wise by
        # NumPy/pandas values and would make the condition ambiguous.
        if self.w is None or self.b is None:
            print("模型还未训练,请先调用fit方法训练")
            return None
        return self.w * x + self.b
# Instantiate the regression model.
model1 = myLinearRegression()
# Train on the training split: LotArea is the feature, SalePrice the label.
model1.fit(x=train_data['LotArea'], y=train_data['SalePrice'])
# Predict on the test split and keep the result in prediction1.
prediction1 = model1.predict(x=test_data['LotArea'])
def MAE(y_hat, y):
    """Return the mean absolute error between predictions and true values.

    Values are paired by position. Besides pandas Series, any array-like of
    numbers (list, NumPy array) is accepted.

    Args:
        y_hat: predicted values.
        y: true values, same shape as ``y_hat``.

    Returns:
        The mean of ``|y_hat_i - y_i|`` as a Python float.

    Raises:
        ValueError: if the inputs are empty or their shapes differ.
    """
    predicted = np.asarray(y_hat, dtype=float)
    actual = np.asarray(y, dtype=float)
    if predicted.shape != actual.shape:
        raise ValueError("y_hat and y must have the same shape")
    if actual.size == 0:
        raise ValueError("cannot compute MAE of empty input")
    return float(np.abs(predicted - actual).mean())
import math
def RMSE(y_hat, y):
    """Return the root mean squared error between predictions and true values.

    Values are paired by position. Besides pandas Series, any array-like of
    numbers (list, NumPy array) is accepted.

    Args:
        y_hat: predicted values.
        y: true values, same shape as ``y_hat``.

    Returns:
        ``sqrt(mean((y_hat_i - y_i) ** 2))`` as a Python float.

    Raises:
        ValueError: if the inputs are empty or their shapes differ.
    """
    predicted = np.asarray(y_hat, dtype=float)
    actual = np.asarray(y, dtype=float)
    if predicted.shape != actual.shape:
        raise ValueError("y_hat and y must have the same shape")
    if actual.size == 0:
        raise ValueError("cannot compute RMSE of empty input")
    return float(np.sqrt(np.mean(np.square(predicted - actual))))
# Score model1 on the test split with both error measures and report them.
mae1 = MAE(y_hat=prediction1, y=test_data['SalePrice'])
rmse1 = RMSE(y_hat=prediction1, y=test_data['SalePrice'])
print("模型1,特征:LotArea")
print("MAE:", mae1)
print("RMSE:", rmse1)
import matplotlib.pyplot as plt
%matplotlib inline
# Two-panel figure for model1 (feature: LotArea), laid out like the figures
# for the other single-feature models in this notebook.
plt.figure(figsize=(16, 6))
# Left panel: training samples and the fitted line.
plt.subplot(121)
plt.plot(train_data['LotArea'].values, train_data['SalePrice'].values, '.', label='training data')
plt.plot(train_data['LotArea'].values, model1.predict(train_data['LotArea']), '-', label='prediction')
plt.xlabel("LotArea")
plt.ylabel('SalePrice')
plt.title("training set")
plt.legend()
# Right panel: held-out samples against the same fitted line. The figure is
# sized for two panels, but only the left one was drawn before.
plt.subplot(122)
plt.plot(test_data['LotArea'].values, test_data['SalePrice'].values, '.', label='testing data')
plt.plot(test_data['LotArea'].values, model1.predict(test_data['LotArea']), '-', label='prediction')
plt.xlabel("LotArea")
plt.ylabel('SalePrice')
plt.title("testing set")
plt.legend()
plt.show()
# Create the second model instance.
model2 = myLinearRegression()
# Train on the training split: BsmtUnfSF is the feature, SalePrice the label.
model2.fit(train_data['BsmtUnfSF'], train_data['SalePrice'])
# Predict on the test split and keep the result in prediction.
prediction = model2.predict(test_data['BsmtUnfSF'])
# Model 2's scores get their own names; they used to be stored in
# mae3/rmse3, which model 3 overwrites further down.
mae2 = MAE(prediction, test_data['SalePrice'])
rmse2 = RMSE(prediction, test_data['SalePrice'])
print("模型2,特征:BsmtUnfSF")
print("MAE:", mae2)
print("RMSE:", rmse2)
plt.figure(figsize=(16, 6))
# Left panel: training samples and the fitted line.
plt.subplot(121)
plt.plot(train_data['BsmtUnfSF'].values, train_data['SalePrice'].values, '.', label='training data')
plt.plot(train_data['BsmtUnfSF'].values, model2.predict(train_data['BsmtUnfSF']), '-', label='prediction')
plt.xlabel('BsmtUnfSF')
plt.ylabel('SalePrice')
plt.title("training set")
plt.legend()
plt.yticks(np.arange(0, 800000, 100000))
# Right panel: held-out samples against the same fitted line.
plt.subplot(122)
plt.plot(test_data['BsmtUnfSF'].values, test_data['SalePrice'].values, '.', label='testing data')
plt.plot(test_data['BsmtUnfSF'].values, model2.predict(test_data['BsmtUnfSF']), '-', label='prediction')
plt.xlabel('BsmtUnfSF')
plt.ylabel('SalePrice')
plt.title("testing set")
plt.legend()
plt.yticks(np.arange(0, 800000, 100000))
plt.show()
# Create the third model instance.
model3 = myLinearRegression()
# Train on the training split: GarageArea is the feature, SalePrice the label.
model3.fit(train_data['GarageArea'], train_data['SalePrice'])
# Predict on the test split and keep the result in prediction.
prediction = model3.predict(test_data['GarageArea'])
mae3 = MAE(prediction, test_data['SalePrice'])
rmse3 = RMSE(prediction, test_data['SalePrice'])
print("模型3,特征:GarageArea")
print("MAE:", mae3)
print("RMSE:", rmse3)
plt.figure(figsize=(16, 6))
# Left panel: training samples and the fitted line.
plt.subplot(121)
plt.plot(train_data['GarageArea'].values, train_data['SalePrice'].values, '.', label='training data')
plt.plot(train_data['GarageArea'].values, model3.predict(train_data['GarageArea']), '-', label='prediction')
plt.xlabel('GarageArea')
plt.ylabel('SalePrice')
plt.title("training set")
plt.legend()
plt.yticks(np.arange(0, 800000, 100000))
# Right panel: held-out samples against the same fitted line. The scatter
# label now says "testing data" and the panel gets its own legend.
plt.subplot(122)
plt.plot(test_data['GarageArea'].values, test_data['SalePrice'].values, '.', label='testing data')
plt.plot(test_data['GarageArea'].values, model3.predict(test_data['GarageArea']), '-', label='prediction')
plt.xlabel('GarageArea')
plt.ylabel('SalePrice')
plt.title("testing set")
plt.legend()
plt.yticks(np.arange(0, 800000, 100000))
plt.show()
# Keep only training rows with 0 < LotArea < 60000 and SalePrice < 500000.
t = train_data.loc[
    (train_data['LotArea'] > 0)
    & (train_data['LotArea'] < 60000)
    & (train_data['SalePrice'] < 500000)
]
# Plot the filtered data.
plt.figure(figsize=(8, 7))
plt.plot(t['LotArea'], t['SalePrice'], '.')
plt.show()
# Split the filtered rows 70/30 again and fit a fresh model on them.
num_of_samples = len(t)
split_line = int(num_of_samples * 0.7)
train_data2, test_data2 = t.iloc[:split_line], t.iloc[split_line:]
model = myLinearRegression()
model.fit(x=train_data2['LotArea'], y=train_data2['SalePrice'])
prediction = model.predict(x=test_data2['LotArea'])
mae = MAE(y_hat=prediction, y=test_data2['SalePrice'])
rmse = RMSE(y_hat=prediction, y=test_data2['SalePrice'])
print("特征:LotArea")
print("MAE:", mae)
print("RMSE:", rmse)
使用多个特征作为输入完成房价预测问题,计算模型在十折交叉验证上的MAE和RMSE,比较不同的特征组合对模型预测能力的影响。
模型可使用sklearn.linear_model.LinearRegression
选做:多项式回归(一元线性回归的扩展),尝试对部分特征进行变换,如将其二次幂,三次幂作为特征输入模型,观察模型在预测能力上的变化。
这部分的内容是要求大家完成多元线性回归,我们会先带着大家使用sklearn做一元线性回归的十折交叉验证,多元线性回归大家可以仿照着完成
同3.1
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import cross_val_predict
# NOTE(review): the feature lists below use columns such as 'MSSubClass',
# 'TotalBsmtSF', '1stFlrSF', '2ndFlrSF' and 'YrSold'. This presumably expects
# `data` to hold every integer column of train.csv (re-loaded as in section
# 3.1), not only the three features selected earlier -- confirm before
# running the file top to bottom.

# Single-feature baseline: out-of-fold predictions from 10-fold CV.
model = LinearRegression()
features = ['LotArea']
x = data[features]
y = data['SalePrice']
prediction = cross_val_predict(model, x, y, cv=10)
prediction.shape
# sklearn metrics take (y_true, y_pred) in that order.
mean_absolute_error(y, prediction)
mean_squared_error(y, prediction) ** 0.5

# Feature combination 1.
MULmodel1 = LinearRegression()
features1 = ['LotArea', 'MSSubClass', 'TotalBsmtSF']
x = data[features1]
y = data['SalePrice']
prediction = cross_val_predict(MULmodel1, x, y, cv=10)
mean_absolute_error(y, prediction)
mean_squared_error(y, prediction) ** 0.5

# Feature combination 2.
MULmodel2 = LinearRegression()
features2 = ['1stFlrSF', '2ndFlrSF', 'MSSubClass', 'TotalBsmtSF']
x = data[features2]
y = data['SalePrice']
prediction = cross_val_predict(MULmodel2, x, y, cv=10)
mean_absolute_error(y, prediction)
mean_squared_error(y, prediction) ** 0.5
print("MAE:", mean_absolute_error(y, prediction))
# The printed value is the root of the MSE, so the label is RMSE (was "RMAE").
print("RMSE:", mean_squared_error(y, prediction) ** 0.5)

# Feature combination 3.
MULmodel3 = LinearRegression()
features3 = ['1stFlrSF', '2ndFlrSF', 'YrSold', 'GarageArea', 'TotalBsmtSF']
x = data[features3]
y = data['SalePrice']
prediction = cross_val_predict(MULmodel3, x, y, cv=10)
mean_absolute_error(y, prediction)
mean_squared_error(y, prediction) ** 0.5
print("MAE:", mean_absolute_error(y, prediction))
print("RMSE:", mean_squared_error(y, prediction) ** 0.5)
import numpy as np
import pandas as pd
# Load the breast-cancer table and preview its first rows.
data = pd.read_csv('./breast-cancer.csv')
data.head()
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.model_selection import cross_val_predict
from sklearn import preprocessing
# Sequential 70/30 split: the first 70% of the rows train, the rest test.
num_of_samples = len(data)
split_line = int(num_of_samples * 0.7)
train_data, test_data = data.iloc[:split_line], data.iloc[split_line:]
# Input columns, grouped by their suffix: *_mean, *_se, *_worst.
features = [
    'radius_mean', 'texture_mean', 'perimeter_mean', 'area_mean',
    'smoothness_mean', 'compactness_mean', 'concavity_mean',
    'concave points_mean', 'symmetry_mean', 'fractal_dimension_mean',
    'radius_se', 'texture_se', 'perimeter_se', 'area_se',
    'smoothness_se', 'compactness_se', 'concavity_se',
    'concave points_se', 'symmetry_se', 'fractal_dimension_se',
    'radius_worst', 'texture_worst', 'perimeter_worst', 'area_worst',
    'smoothness_worst', 'compactness_worst', 'concavity_worst',
    'concave points_worst', 'symmetry_worst', 'fractal_dimension_worst',
]
# The label column is 'diagnosis'.
data_x, data_y = train_data[features], train_data['diagnosis']
test_x, test_y = test_data[features], test_data['diagnosis']
# Fit logistic regression on the training split and predict the test split.
model = LogisticRegression(max_iter=10000)
model.fit(data_x, data_y)
prediction = model.predict(test_x)
def evalute(prediction, test_y):
    """Score predicted labels against the true labels.

    Precision, recall and F1 treat the label 'B' as the positive class.

    Args:
        prediction: predicted labels.
        test_y: true labels.

    Returns:
        A tuple ``(accuracy, precision, recall, f1)``.
    """
    acc = accuracy_score(test_y, prediction)
    pre = precision_score(test_y, prediction, pos_label='B')
    recall = recall_score(test_y, prediction, pos_label='B')
    # F1 has to come from f1_score; this line used to call recall_score, so
    # the fourth value merely repeated the recall.
    f1 = f1_score(test_y, prediction, pos_label='B')
    return acc, pre, recall, f1
# Score the test-split predictions; evalute returns (acc, pre, recall, f1).
evalute(prediction, test_y)
类别不均衡
使用imblearn实现SMOTE过采样,RandomUnderSampler降采样
import pandas as pd
from sklearn.datasets import make_classification
import matplotlib.pyplot as plt
from imblearn.datasets import make_imbalance
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
import warnings
warnings.filterwarnings('ignore')
# Build a synthetic two-feature classification set of 700 samples.
train_X, train_y = make_classification(n_samples=700, n_features=2, n_redundant=0,
n_informative=2, n_clusters_per_class=1,
class_sep=1.0, flip_y=0.06, random_state=100)
# Scatter the two features, coloured by class label.
plt.title("Balanced dataset")
plt.xlabel('x')
plt.ylabel('y')
plt.scatter(train_X[:, 0], train_X[:, 1], marker='o', c=train_y,
s=25, edgecolor='k', cmap=plt.cm.coolwarm)
plt.show()
# Derive an imbalanced set from it: 340 samples of class 0 and 10 of class 1.
train_X1, train_y1 = make_imbalance(train_X, train_y, sampling_strategy={0: 340, 1: 10}, random_state=100)
plt.title("Imbalanced dataset")
plt.xlabel('x')
plt.ylabel('y')
plt.scatter(train_X1[:, 0], train_X1[:, 1], marker='o', c=train_y1,
s=25, edgecolor='k', cmap=plt.cm.coolwarm)
plt.show()
# Resample the imbalanced set with SMOTE (k_neighbors=8) and plot the result.
sm_model = SMOTE(sampling_strategy='auto', k_neighbors=8, random_state=100)
train_X2, train_y2 = sm_model.fit_resample(train_X1, train_y1)
plt.title('SMOTE')
plt.xlabel('x')
plt.ylabel('y')
plt.scatter(train_X2[:, 0], train_X2[:, 1], marker='o', c=train_y2,
s=25, edgecolor='k', cmap=plt.cm.coolwarm)
plt.show()
# Resample the same imbalanced set with RandomUnderSampler and plot the result.
rus_model = RandomUnderSampler(random_state=100)
train_X3, train_y3 = rus_model.fit_resample(train_X1, train_y1)
plt.title('RUS')
plt.xlabel('x')
plt.ylabel('y')
plt.scatter(train_X3[:, 0], train_X3[:, 1], marker='o', c=train_y3,
s=25, edgecolor='k', cmap=plt.cm.coolwarm)
plt.show()
在样本不均衡的癌症数据集上利用SMOTE,对比使用过采样算法前后各评价指标的差异
raw_data = pd.read_csv('./breast-cancer.csv')
raw_data
# Class counts before any rows are removed: malignant (M), then benign (B).
print(len(raw_data[raw_data['diagnosis'] == 'M']))
print(len(raw_data[raw_data['diagnosis'] == 'B']))
# Make the data set more imbalanced (the target ratio is about 100:357) by
# removing 112 randomly chosen malignant rows. The draw is seeded so that the
# before/after comparison is reproducible.
names = raw_data[raw_data['diagnosis'] == 'M'].sample(n=112, random_state=100).index
# drop() returns the reduced frame only when it is NOT called in place;
# with inplace=True it returns None, which left `data` unusable.
data = raw_data.drop(names)
# Sequential 70/30 split: the first 70% of the rows train, the rest test.
num_of_samples = data.shape[0]
split_line = int(num_of_samples * 0.7)
train_data = data.iloc[:split_line]
test_data = data.iloc[split_line:]
# Two commas were missing in this list, which fused four column names into
# two strings that are not columns of the table.
features = ['radius_mean', 'texture_mean', 'perimeter_mean',
            'area_mean', 'smoothness_mean', 'compactness_mean',
            'concavity_mean', 'concave points_mean', 'symmetry_mean',
            'fractal_dimension_mean', 'radius_se', 'texture_se', 'perimeter_se',
            'area_se', 'smoothness_se', 'compactness_se', 'concavity_se',
            'concave points_se', 'symmetry_se', 'fractal_dimension_se',
            'radius_worst', 'texture_worst', 'perimeter_worst', 'area_worst',
            'smoothness_worst', 'compactness_worst', 'concavity_worst',
            'concave points_worst', 'symmetry_worst', 'fractal_dimension_worst']
data_x = train_data[features]
test_x = test_data[features]
data_y = train_data['diagnosis']
test_y = test_data['diagnosis']
# Baseline: fit on the imbalanced training split and score on the test split.
model1 = LogisticRegression(max_iter=10000)
model1.fit(data_x, data_y)
prediction1 = model1.predict(test_x)
print("1 过采样处理前:", evalute(prediction1, test_y))
def evalute(prediction, test_y):
    """Score predicted labels against the true labels.

    Precision, recall and F1 treat the label 'B' as the positive class.

    Args:
        prediction: predicted labels.
        test_y: true labels.

    Returns:
        A tuple ``(accuracy, precision, recall, f1)``.
    """
    acc = accuracy_score(test_y, prediction)
    pre = precision_score(test_y, prediction, pos_label='B')
    recall = recall_score(test_y, prediction, pos_label='B')
    # F1 has to come from f1_score; this line used to call recall_score, so
    # the fourth value merely repeated the recall.
    f1 = f1_score(test_y, prediction, pos_label='B')
    return acc, pre, recall, f1
# Oversample the training split with SMOTE.
sm_model = SMOTE(random_state=100, k_neighbors=8, sampling_strategy='auto')
data_X2, data_y2 = sm_model.fit_resample(data_x, data_y)
# Refit on the resampled data, then predict and score on the same test split.
model2 = LogisticRegression(max_iter=10000).fit(data_X2, data_y2)
prediction2 = model2.predict(test_x)
print("2 过采样处理后:", evalute(prediction2, test_y))