import numpy as np
import pandas as pd
from pandas import Series,DataFrame
import matplotlib.pyplot as plt
import matplotlib as mtl
%matplotlib inline
import seaborn as sns
sns.set()
import warnings
warnings.filterwarnings("ignore")
# Load the training and test sets (tab-separated text files)
with open("./zhengqi_train.txt") as fr:
    data_train = pd.read_table(fr, sep="\t")
with open("./zhengqi_test.txt") as fr_test:
    data_test = pd.read_table(fr_test, sep="\t")
data_train.shape,data_test.shape
data_train.describe()
data_test.describe()
data_train.info(),data_test.info()
# Step 1: plot the train and test distribution of each feature side by side,
# so that features whose distributions differ noticeably can be dropped
for column_train in data_train.columns.tolist()[:-1]:
    fig = plt.figure(figsize=(8, 5))
    g = plt.subplot()
    sns.kdeplot(data_train[column_train], color="red", ax=g, label="Train", fill=True)
    sns.kdeplot(data_test[column_train], color="blue", ax=g, label="Test", fill=True)
    g.set_xlabel(column_train)
    g.set_ylabel("Frequency")
    g.legend()
    plt.show()
# Plot a histogram with a KDE overlay for each feature to inspect its distribution
for column in data_test.columns.tolist():
    fig = plt.figure(figsize=(8, 2))
    plt.subplot(1, 2, 1)
    sns.histplot(data_train[column], kde=True)
    # plt.title("train")
    plt.subplot(1, 2, 2)
    sns.histplot(data_test[column], kde=True)
    # plt.title("test")
    plt.show()
**Based on the two sets of plots above, drop the features "V5", "V9", "V11", "V17", "V22", "V28": their distributions in the training and test sets are clearly mismatched.**
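# As a complementary, non-visual check (a minimal sketch, not part of the
# original analysis): the two-sample Kolmogorov-Smirnov test from scipy.stats
# quantifies how different each feature's train and test distributions are;
# the 0.01 p-value threshold here is chosen purely for illustration.
from scipy.stats import ks_2samp

for column in data_train.columns.tolist()[:-1]:
    stat, p_value = ks_2samp(data_train[column], data_test[column])
    if p_value < 0.01:
        print(f"{column}: KS statistic={stat:.3f}, p-value={p_value:.3g}")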
drop_index = ["V5","V9","V11","V17","V22","V28"]
good_train = data_train.drop(labels=drop_index,axis=1).copy()
good_train.shape
# Build the feature matrix X and target vector y for the analysis
# [:, :-1] keeps all rows and every column except the last one (the target column)
X = good_train.iloc[:,:-1]
y = good_train["target"]
# Correlation analysis: regress the target on each individual feature
from sklearn.linear_model import LinearRegression
lr = LinearRegression()
for column in X.columns.tolist():
    # Fit a one-feature linear regression on the training data
    X_train = X[column].values.reshape(-1, 1)
    lr.fit(X_train, y)
    X_test = np.linspace(X_train.min(), X_train.max(), 100).reshape(-1, 1)
    y_ = lr.predict(X_test)
    # Scatter the raw data and overlay the fitted regression line
    plt.scatter(X_train, y, color="blue")
    plt.plot(X_test, y_, color="red")
    plt.xlabel(column)
    plt.show()
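# To rank features numerically instead of eyeballing the plots, a small
# sketch under the same single-feature setup: the R^2 score of each
# one-feature fit summarizes how much of the target that feature explains.
scores = {}
for column in X.columns.tolist():
    X_col = X[column].values.reshape(-1, 1)
    scores[column] = LinearRegression().fit(X_col, y).score(X_col, y)
print(Series(scores).sort_values(ascending=False))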
# Interactively choose a diverging colorbrewer palette from seaborn
# (requires a notebook; the commented diverging_palette call below is a non-interactive alternative)
cmap = sns.choose_colorbrewer_palette('diverging', as_cmap=True)
# Heatmap of pairwise correlations, to see how strongly each feature relates to the target
plt.figure(figsize=(24, 24))
column = good_train.columns.tolist()  # column names
# Pearson correlation matrix between every pair of columns
mcorr = good_train[column].corr(method="pearson")
mask = np.zeros_like(mcorr, dtype=bool)  # boolean matrix with the same shape as mcorr
mask[np.triu_indices_from(mask)] = True  # hide the upper triangle (it mirrors the lower one)
# cmap = sns.diverging_palette(220, 10, as_cmap=True)
sns.heatmap(mcorr, cmap=cmap, annot=True, mask=mask)
# Correlation between the target column and every other feature
corr = good_train.corr()["target"]
# Features whose absolute correlation falls below this threshold are
# considered weakly related to the target and can be dropped
threshold = 0.15
drop_index1 = corr.loc[np.abs(corr) < threshold].index
# Drop the weakly correlated features
good_train1 = good_train.drop(labels=drop_index1, axis=1)
good_train1.shape
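# Quick sanity check (illustrative only): list the features removed by the
# correlation threshold, sorted by absolute correlation with the target.
print(corr.loc[drop_index1].abs().sort_values())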
# Index of every row containing a value more than three standard deviations from its column mean
exception_index = good_train1.loc[(np.abs(good_train1 - good_train1.mean()) > 3 * good_train1.std()).any(axis=1)].index
# Drop those outlier rows
good_train2 = good_train1.drop(labels=exception_index).copy()
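# How many rows the three-sigma filter removed
good_train1.shape, good_train2.shape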
# Box plot and distribution plot for each column of good_train2
for column in good_train2.columns.tolist():
    plt.figure(figsize=(10, 3))
    plt.subplot(1, 2, 1)
    sns.boxplot(y=good_train2[column])
    plt.subplot(1, 2, 2)
    sns.histplot(good_train2[column], kde=True)
    plt.show()
# The box plots show many points beyond the whiskers, so cap the values that exceed the limits.
# Helper that computes the first and third quartiles and derives the upper
# and lower whisker limits (Q3 + 1.5*IQR and Q1 - 1.5*IQR)
def calculate_level(V):
    Q1 = np.quantile(V, 0.25)
    Q3 = np.quantile(V, 0.75)
    IQR = Q3 - Q1
    max_level = Q3 + 1.5 * IQR
    min_level = Q1 - 1.5 * IQR
    return max_level, min_level
# Replace values above the upper limit with the upper limit, and values below
# the lower limit with the lower limit (clip avoids pandas chained-assignment issues)
for column in good_train1.columns.tolist():
    max_level, min_level = calculate_level(good_train1[column])
    good_train1[column] = good_train1[column].clip(min_level, max_level)
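# The same capping can be done without an explicit loop; an equivalent sketch
# using DataFrame.quantile, assigned to a separate variable here so it does
# not disturb the pipeline above:
Q1, Q3 = good_train1.quantile(0.25), good_train1.quantile(0.75)
IQR = Q3 - Q1
box_alt = good_train1.clip(Q1 - 1.5 * IQR, Q3 + 1.5 * IQR, axis=1)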
# Two cleaned variants of the training data: three-sigma row removal (std_train) and box-plot capping (box_train)
std_train = good_train2.copy()
box_train = good_train1.copy()
# Import four common regression algorithms plus evaluation utilities
from sklearn.neighbors import KNeighborsRegressor
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
# Helper that repeatedly splits a dataset, fits a model, and records the
# train and test mean squared error over 50 random splits
def calculate_mse(model, samples):
    train = samples.iloc[:, :-1]
    target = samples["target"]
    train_scores = []
    test_scores = []
    for i in range(50):
        X_train, X_test, y_train, y_test = train_test_split(train, target, test_size=0.3)
        model.fit(X_train, y_train)
        train_scores.append(mean_squared_error(y_train, model.predict(X_train)))
        test_scores.append(mean_squared_error(y_test, model.predict(X_test)))
    return np.array(train_scores), np.array(test_scores)
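# A more idiomatic sketch of the same evaluation using scikit-learn's own
# cross-validation utilities (cross_val_score with the
# "neg_mean_squared_error" scorer) instead of the manual split loop:
from sklearn.model_selection import cross_val_score

def cv_mse(model, samples, folds=5):
    train = samples.iloc[:, :-1]
    target = samples["target"]
    scores = cross_val_score(model, train, target, cv=folds, scoring="neg_mean_squared_error")
    return -scores.mean()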
# Instantiate the four models with default hyperparameters
knn = KNeighborsRegressor()
lr = LinearRegression()
ridge = Ridge()
lasso = Lasso()
models = [knn, lr, ridge, lasso]
# For each model, report the mean empirical (training) error and generalization (test) error
def show_result(models, samples):
    for model in models:
        train_score, test_score = calculate_mse(model, samples)
        print("{}: empirical (training) MSE = {}, generalization (test) MSE = {}".format(
            model.__class__.__name__, train_score.mean(), test_score.mean()))
show_result(models, std_train)
show_result(models, box_train)
show_result(models, data_train)
Ran out of time; the notebook stops here unfinished.
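# A rough sketch of what the missing final step might look like (the choice
# of Ridge and the output file name are illustrative assumptions, not the
# author's method): fit a model on the full cleaned training set, apply the
# same column drops to the test set, and save the predictions.
best_model = Ridge()
best_model.fit(box_train.iloc[:, :-1], box_train["target"])
test_features = data_test.drop(labels=drop_index, axis=1).drop(labels=drop_index1, axis=1)
prediction = best_model.predict(test_features)
np.savetxt("./zhengqi_submit.txt", prediction)  # assumed output format: one value per line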