Kaggle 之 California Housing Data (1990) — sklearn

import pandas as pd
# Read the housing CSV from the working directory, then echo the first rows
# and the (rows, columns) shape as a quick sanity check.
df = pd.read_csv("housing.csv")
print(df.head(), df.shape, sep="\n")

kaggle之California Housing Data (1990)sklearn_第1张图片

# Drop two columns before modelling. The author's stated reason: the
# ocean-proximity values are about the same as the longitude values.
# `columns=` replaces the positional axis argument (`drop(labels, 1)`), which
# pandas deprecated in 1.3 and rejects with a TypeError from 2.0 onwards.
df = df.drop(columns=["longitude", "ocean_proximity"])
df.head()  # only displays in a notebook/REPL; has no effect in a plain script

kaggle之California Housing Data (1990)sklearn_第2张图片

# Remove every row that contains a missing value
# (the author's run was left with shape (20433, 8)).
df.dropna(inplace=True)

# Data preprocessing
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
import numpy as np

# Features are every remaining column except the target; the target is
# median_house_value. `columns=` is used because passing the axis
# positionally (`drop(labels, 1)`) is rejected by pandas >= 2.0.
x = np.array(df.drop(columns=["median_house_value"]))
y = np.array(df["median_house_value"])

# Standardize the features.
# NOTE(review): the scaler is fitted on all rows before the split, so the
# held-out rows contribute to the scaling statistics. Consider fitting on
# train_x only and calling transform() on test_x.
scaler = StandardScaler()
x = scaler.fit_transform(x)

# 80% train / 20% test. No random_state is given, so the split (and the
# scores computed from it) differ from run to run.
train_x, test_x, train_y, test_y = train_test_split(x, y, test_size=0.2)
print(train_x.shape, test_x.shape)  # author's run: (16346, 7) (4087, 7)
print(train_y.shape, test_y.shape)  # author's run: (16346,) (4087,)
# Fit a model and score it on both splits
def train_test(model, train_x, train_y, test_x, test_y):
    """Fit ``model`` on the training split and score it on both splits.

    Args:
        model: Object exposing ``fit(x, y)`` and ``score(x, y)``.
        train_x: Training features, passed to ``fit`` and ``score``.
        train_y: Training targets.
        test_x: Held-out features, passed to ``score`` only.
        test_y: Held-out targets.

    Returns:
        A ``(train_score, test_score)`` tuple holding whatever
        ``model.score`` returns for each split (the calling script prints
        these as R^2).
    """
    model.fit(train_x, train_y)
    # Score the training split first, then the held-out split.
    split_scores = []
    for features, targets in ((train_x, train_y), (test_x, test_y)):
        split_scores.append(model.score(features, targets))
    return tuple(split_scores)
# Model training: fit each regressor, print its scores, and scatter its
# test-set predictions against the true values, one subplot row per model.
from sklearn.linear_model import LinearRegression, LassoCV, RidgeCV
import matplotlib.pyplot as plt

models = [LinearRegression(), LassoCV(alphas=[0.1, 1.0, 10.0]), RidgeCV()]

# Endpoints of the red y = x reference line (a perfect prediction would fall
# on it). Two points draw the same line as a dense range from 1 to 699999,
# and a dedicated name avoids overwriting the module-level `x` / `y` arrays.
diag = [1, 699999]

for i, model in enumerate(models):
    R_train, R_test = train_test(model, train_x, train_y, test_x, test_y)
    print(model)
    print("model_train_R^2:", R_train)
    print("model_test_R^2:", R_test)
    pre_y = model.predict(test_x)
    plt.subplot(len(models), 1, i + 1)
    # Horizontal axis: predicted values; vertical axis: true test targets.
    plt.scatter(pre_y, test_y)
    plt.xlabel("pre_x")
    plt.ylabel("real_x")
    plt.title("pre_x VS real_x")
    plt.grid()  # draw grid lines
    plt.plot(diag, diag, 'r')

# Show once, after every subplot has been drawn. Calling show() inside the
# loop displays each model in a separate figure and defeats the 3-row layout.
plt.tight_layout()  # keep titles and axis labels of stacked rows from overlapping
plt.show()

kaggle之California Housing Data (1990)sklearn_第3张图片
由于笔者第一次用机器学习模型(线性回归、Lasso、Ridge)预测房价,故准确度不高,期待指点,谢谢。

你可能感兴趣的:(机器学习,机器学习,python)