数据分析之工业蒸汽量预测

导包

import numpy as np
import pandas as pd
from pandas import Series,DataFrame

import matplotlib.pyplot as plt
import matplotlib as mtl
%matplotlib inline

import seaborn as sns
sns.set()
import warnings
warnings.filterwarnings("ignore")

导入数据

# Load the tab-separated train/test sets.
# pd.read_csv with an explicit sep= keyword replaces the original
# pd.read_table(fr, "\t") call: read_table is deprecated and passing sep
# positionally is rejected by pandas >= 2.0.
data_train = pd.read_csv("./zhengqi_train.txt", sep="\t")
data_test = pd.read_csv("./zhengqi_test.txt", sep="\t")

查看数据集的基本信息

# Shapes of the two sets as (rows, columns); the train set carries one extra
# "target" column compared to the test set.
data_train.shape,data_test.shape

# Per-column summary statistics (count / mean / std / quantiles).
# NOTE(review): these bare expressions only display inside a notebook cell;
# in a plain script they produce no output.
data_train.describe()
data_test.describe()

# Column dtypes and non-null counts for both sets.
data_train.info(),data_test.info()

第一步清洗,清洗分布不均衡的数据

# Step 1: overlay the train/test KDE of every feature; features whose two
# curves diverge will be dropped. ([:-1] skips the trailing "target" column,
# which exists only in the train set.)
sns.set()
for column_train in data_train.columns.tolist()[:-1]:
    fig = plt.figure(figsize=(8, 5))
    g = plt.subplot()
    # fill= replaces the shade= keyword, which was removed in seaborn >= 0.14
    sns.kdeplot(data_train[column_train], color='red', ax=g, label='Train', fill=True)
    sns.kdeplot(data_test[column_train], color="blue", ax=g, label="Test", fill=True)
    g.set_xlabel(column_train)
    g.set_ylabel("Frequency")
    g.legend()   # the labels above are only rendered once a legend is drawn
    plt.show()   # emit one figure per feature even outside a notebook
# Plot a histogram + KDE for each feature, train vs. test side by side,
# to inspect the data distributions.
# sns.histplot(..., kde=True) replaces sns.distplot, which was deprecated
# in seaborn 0.11 and removed in 0.14.
for column in data_test.columns.tolist():
    fig = plt.figure(figsize=(8, 2))

    plt.subplot(1, 2, 1)
    sns.histplot(data_train[column], kde=True)

    plt.subplot(1, 2, 2)
    sns.histplot(data_test[column], kde=True)
    plt.show()

**根据以上两幅图像显示,特征 "V5"、"V9"、"V11"、"V17"、"V22"、"V28" 在训练集和测试集上分布不均,予以删除**

# Features whose train/test distributions diverge (per the KDE plots above).
drop_index = ["V5", "V9", "V11", "V17", "V22", "V28"]
# Drop them on a copy so data_train itself stays untouched.
good_train = data_train.drop(columns=drop_index).copy()
good_train.shape

第二步清洗,清洗目标和特征相关性低的数据

# Build the analysis set: X holds every column except the trailing "target"
# column; y is the regression label itself.
X = good_train.drop(columns="target")
y = good_train["target"]

# Single-feature regression check: fit the target against each feature on its
# own and draw the fitted line over the scatter, to eyeball how linear each
# feature/target relationship is.
from sklearn.linear_model import LinearRegression
lr = LinearRegression()

for column in X.columns.tolist():
    # Fit on this one feature (reshaped to an (n, 1) column matrix).
    feature = X[column].values.reshape(-1, 1)
    lr.fit(feature, y)
    # Evaluate the fitted line on an even 100-point grid over the feature's range.
    grid = np.linspace(feature.min(), feature.max(), 100).reshape(-1, 1)
    fitted = lr.predict(grid)
    # Scatter of the raw points plus the regression line.
    plt.scatter(feature, y, color="blue")
    plt.plot(grid, fitted, color='red')
    plt.xlabel(column)
    plt.show()

# Heatmap of the pairwise Pearson correlations, to see how strongly each
# feature relates to the target (and to the other features).
# sns.choose_colorbrewer_palette is an interactive IPython-widget helper and
# fails outside a notebook; build the diverging colormap directly instead.
cmap = sns.diverging_palette(220, 10, as_cmap=True)

plt.figure(figsize=(24, 24))
columns = good_train.columns.tolist()
# Correlation matrix between every pair of remaining columns.
mcorr = good_train[columns].corr(method="pearson")
# np.bool was removed in NumPy 1.24 — use the builtin bool dtype.
mask = np.zeros_like(mcorr, dtype=bool)
mask[np.triu_indices_from(mask)] = True  # hide the redundant upper triangle

# Reuse the matrix computed above instead of recomputing good_train.corr().
sns.heatmap(mcorr, cmap=cmap, annot=True, mask=mask)


# 得到target 列和其他特征值之间的相关系数
corr = good_train.corr()["target"]

# 定义一个相关系数的阈值,低于这个阈值的认为相关性不高,该特征值则可删除
threshold = 0.15
drop_index1 = corr.loc[np.abs(corr) < threshold].index

#确定最终删除的特征并删除
good_train1 = good_train.drop(labels=drop_index1, axis=1)
good_train1.shape

异常值处理

# Rows holding any value more than three standard deviations from that
# column's mean are treated as outliers. The original compared the raw value
# to 3*std, which ignores each column's mean and can never catch low-side
# outliers; centre on the mean and take the absolute deviation instead.
# NOTE(review): the "target" column takes part in this screen too — confirm
# that is intended.
deviation = (good_train1 - good_train1.mean()).abs()
exception_index = good_train1.loc[(deviation > good_train1.std() * 3).any(axis=1)].index

# Drop the outlier rows.
good_train2 = good_train1.drop(labels=exception_index).copy()

# Box plot + distribution plot for every column of the 3-sigma-cleaned set.
# sns.histplot(..., kde=True) replaces sns.distplot, which was removed in
# seaborn >= 0.14.
for column in good_train2.columns.tolist():
    plt.figure(figsize=(10, 3))
    plt.subplot(1, 2, 1)
    sns.boxplot(y=good_train2[column])

    plt.subplot(1, 2, 2)
    sns.histplot(good_train2[column], kde=True)
    plt.show()

# 由上面箱线图得知,超出上限和下限的点比较多,对超出限制值的
# 定义一个函数,求四分之一位值和四分之三位值进而求上限和下限值
def caculate_level(V, factor=1.5):
    """Return the (upper, lower) outlier fences of *V* (Tukey's fences).

    Parameters
    ----------
    V : array-like of numbers
        The values to compute the fences for.
    factor : float, default 1.5
        IQR multiplier; 1.5 is the conventional box-plot whisker factor.
        (Generalized from the original hard-coded 1.5 — default behavior
        is unchanged.)

    Returns
    -------
    (max_level, min_level) : tuple of float
        Upper fence Q3 + factor*IQR and lower fence Q1 - factor*IQR.
    """
    Q1 = np.quantile(V, 0.25)
    Q3 = np.quantile(V, 0.75)

    IQR = Q3 - Q1
    max_level = Q3 + factor * IQR
    min_level = Q1 - factor * IQR
    return max_level, min_level

# Clip each column of good_train1 to its Tukey fences: values beyond the
# upper/lower bound are replaced by the bound itself (winsorizing).
# Series.clip replaces the original chained assignments
# (V[V > max_level] = max_level), which trigger SettingWithCopyWarning and
# can silently fail to write back under copy-on-write pandas.
# NOTE(review): this loop also clips the "target" column, as the original
# did — confirm that is intended.
for column in good_train1.columns.tolist():
    max_level, min_level = caculate_level(good_train1[column])
    good_train1[column] = good_train1[column].clip(lower=min_level, upper=max_level)

# The two cleaned variants: 3-sigma row removal vs. boundary clipping.
std_train = good_train2.copy()
box_train = good_train1.copy()

算法选择

# 导入四种常用算法
from sklearn.neighbors import KNeighborsRegressor
from sklearn.linear_model import LinearRegression, Ridge, Lasso
 
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

# 定义一个函数,用来构建算法模型
def caculate_mse(model, samples, n_rounds=50, test_size=0.3):
    """Estimate train/test MSE of *model* over repeated random splits.

    Parameters
    ----------
    model : sklearn-style regressor
        Any estimator exposing fit/predict.
    samples : DataFrame
        Feature columns plus a trailing "target" column (features are taken
        as every column except the last).
    n_rounds : int, default 50
        Number of random splits to average over (generalized from the
        original hard-coded 50; default behavior unchanged).
    test_size : float, default 0.3
        Fraction of rows held out per split.

    Returns
    -------
    (train_scores, test_scores) : tuple of np.ndarray
        MSE on the train and test split of each round, length n_rounds.
    """
    train = samples.iloc[:, :-1]
    target = samples["target"]

    train_scores = []
    test_scores = []
    for _ in range(n_rounds):
        # Fresh random split each round so the averages reflect split variance.
        X_train, X_test, y_train, y_test = train_test_split(train, target, test_size=test_size)
        model.fit(X_train, y_train)
        train_scores.append(mean_squared_error(y_train, model.predict(X_train)))
        test_scores.append(mean_squared_error(y_test, model.predict(X_test)))
    return np.array(train_scores), np.array(test_scores)

# Instantiate the four candidate regressors, each with its default
# hyper-parameters, and collect them for batch evaluation below.
knn = KNeighborsRegressor()
lr = LinearRegression()
ridge = Ridge()
lasso = Lasso()

models = [knn, lr, ridge, lasso]
# 构建函数,求数据集的泛化误差和经验误差
def show_result(models, samples):
    for model in models:
        train_score, test_score = caculate_mse(model, samples)
        print("{} 的经验误差为:{}, 泛化误差为:{}".format(model.__class__.__name__, train_score.mean(), test_score.mean()))

查看std_train模型上不同算法的表现

# Compare the four models on the 3-sigma-cleaned set.
show_result(models, std_train)

查看box_train模型上不同算法的表现

# Compare the four models on the boundary-clipped set.
show_result(models, box_train)

查看原始模型上不同算法的表现

# Baseline: the four models on the raw, uncleaned training set.
show_result(models, data_train)

调参,优化算法,得到最优算法

时间太紧,没写完

你可能感兴趣的:(数据分析根据特征做预测)