import pandas as pd
# Path of the file to read
iowa_file_path = '../input/home-data-for-ml-course/train.csv'
# Read the data into a DataFrame, using train.csv as an example
home_data = pd.read_csv(iowa_file_path)
home_data.shape
Result:
(1460, 81)
home_data.columns
Result:
Index(['MSSubClass', 'MSZoning', 'LotFrontage', 'LotArea', 'Street', 'Alley',
'LotShape', 'LandContour', 'Utilities', 'LotConfig', 'LandSlope',
'Neighborhood', 'Condition1', 'Condition2', 'BldgType', 'HouseStyle',
'OverallQual', 'OverallCond', 'YearBuilt', 'YearRemodAdd', 'RoofStyle',
'RoofMatl', 'Exterior1st', 'Exterior2nd', 'MasVnrType', 'MasVnrArea',
'ExterQual', 'ExterCond', 'Foundation', 'BsmtQual', 'BsmtCond',
'BsmtExposure', 'BsmtFinType1', 'BsmtFinSF1', 'BsmtFinType2',
'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', 'Heating', 'HeatingQC',
'CentralAir', 'Electrical', '1stFlrSF', '2ndFlrSF', 'LowQualFinSF',
'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath', 'HalfBath',
'BedroomAbvGr', 'KitchenAbvGr', 'KitchenQual', 'TotRmsAbvGrd',
'Functional', 'Fireplaces', 'FireplaceQu', 'GarageType', 'GarageYrBlt',
'GarageFinish', 'GarageCars', 'GarageArea', 'GarageQual', 'GarageCond',
'PavedDrive', 'WoodDeckSF', 'OpenPorchSF', 'EnclosedPorch', '3SsnPorch',
'ScreenPorch', 'PoolArea', 'PoolQC', 'Fence', 'MiscFeature', 'MiscVal',
'MoSold', 'YrSold', 'SaleType', 'SaleCondition'],
dtype='object')
# Preview the first rows of the dataset
home_data.head()
# Summary statistics for each numeric column
home_data.describe()
Result (the example summary below comes from the Melbourne dataset, melb_data.csv, rather than train.csv):
Rooms Price Distance Postcode Bedroom2 \
count 13580.000000 1.358000e+04 13580.000000 13580.000000 13580.000000
mean 2.937997 1.075684e+06 10.137776 3105.301915 2.914728
std 0.955748 6.393107e+05 5.868725 90.676964 0.965921
min 1.000000 8.500000e+04 0.000000 3000.000000 0.000000
25% 2.000000 6.500000e+05 6.100000 3044.000000 2.000000
50% 3.000000 9.030000e+05 9.200000 3084.000000 3.000000
75% 3.000000 1.330000e+06 13.000000 3148.000000 3.000000
max 10.000000 9.000000e+06 48.100000 3977.000000 20.000000
Bathroom Car Landsize BuildingArea YearBuilt \
count 13580.000000 13518.000000 13580.000000 7130.000000 8205.000000
mean 1.534242 1.610075 558.416127 151.967650 1964.684217
std 0.691712 0.962634 3990.669241 541.014538 37.273762
min 0.000000 0.000000 0.000000 0.000000 1196.000000
25% 1.000000 1.000000 177.000000 93.000000 1940.000000
50% 1.000000 2.000000 440.000000 126.000000 1970.000000
75% 2.000000 2.000000 651.000000 174.000000 1999.000000
max 8.000000 10.000000 433014.000000 44515.000000 2018.000000
Lattitude Longtitude Propertycount
count 13580.000000 13580.000000 13580.000000
mean -37.809203 144.995216 7454.417378
std 0.079260 0.103916 4378.581772
min -38.182550 144.431810 249.000000
25% -37.856822 144.929600 4380.000000
50% -37.802355 145.000100 6555.000000
75% -37.756400 145.058305 10331.000000
max -37.408530 145.526350 21650.000000
# Write predictions to a submission file (test_data and test_preds are
# assumed to be defined earlier in the full notebook)
output = pd.DataFrame({'Id': test_data.Id,
'SalePrice': test_preds})
output.to_csv('submission.csv', index=False)
After dropping is done, it is best to use home_data.shape to check how many rows were removed.
home_data = home_data.dropna(axis=0)
or, equivalently,
home_data = home_data.dropna(axis=0, how='any')
# Keep only columns with at least 10 non-missing values
home_data = home_data.dropna(axis='columns', thresh=10)
# Drop rows where both 'Alley' and 'FireplaceQu' are missing
home_data = home_data.dropna(axis='index', how='all', subset=['Alley','FireplaceQu'])
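A minimal sketch of that shape check (assuming home_data is already loaded):
rows_before = home_data.shape[0]
home_data = home_data.dropna(axis=0)
print('Rows removed:', rows_before - home_data.shape[0])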
# Get the names of columns with missing values
cols_with_missing = [col for col in X_train.columns
if X_train[col].isnull().any()]
# Drop those columns from the training and validation data
reduced_X_train = X_train.drop(cols_with_missing, axis=1)
reduced_X_valid = X_valid.drop(cols_with_missing, axis=1)
print("删除缺省列后的MAE值:")
print(score_dataset(reduced_X_train, reduced_X_valid, y_train, y_valid))
# DataFrame.pop() removes a single column in place and returns it as a Series
# (example from a different dataset)
all_data.pop('income_bracket')
from sklearn.impute import SimpleImputer
# Imputation: creates new training and validation features (column names are lost at this point)
my_imputer = SimpleImputer()  # the default strategy is 'mean'
imputed_X_train = pd.DataFrame(my_imputer.fit_transform(X_train))
imputed_X_valid = pd.DataFrame(my_imputer.transform(X_valid))
# Restore the original column names
imputed_X_train.columns = X_train.columns
imputed_X_valid.columns = X_valid.columns
print("插补后的MAE值:")
print(score_dataset(imputed_X_train, imputed_X_valid, y_train, y_valid))
# Make copies to avoid changing the original data
X_train_plus = X_train.copy()
X_valid_plus = X_valid.copy()
# Make new columns indicating which values were missing
for col in cols_with_missing:
X_train_plus[col + '_was_missing'] = X_train_plus[col].isnull()
X_valid_plus[col + '_was_missing'] = X_valid_plus[col].isnull()
# Imputation: creates new training and validation features (column names are lost at this point)
my_imputer = SimpleImputer()
imputed_X_train_plus = pd.DataFrame(my_imputer.fit_transform(X_train_plus))
imputed_X_valid_plus = pd.DataFrame(my_imputer.transform(X_valid_plus))
# Restore the original column names
imputed_X_train_plus.columns = X_train_plus.columns
imputed_X_valid_plus.columns = X_valid_plus.columns
print("插补扩展后的MAE值:")
print(score_dataset(imputed_X_train_plus, imputed_X_valid_plus, y_train, y_valid))
Extract the target value:
y = home_data.Price
# or, with equivalent bracket syntax:
y = home_data['Price']
Result:
1 181500
2 223500
3 140000
4 250000
6 307000
...
1451 287090
1454 185000
1455 175000
1456 210000
1457 266500
By convention, the prediction target is named y.
home_data_features = ['LotArea', 'LotConfig']
X = home_data[home_data_features]
Result:
LotArea LotConfig
1 9600 FR2
2 11250 Inside
3 9550 Corner
4 14260 FR2
6 10084 Inside
... ... ...
1451 9262 Inside
1454 7500 Inside
1455 7917 Inside
1456 13175 Inside
1457 9042 Inside
By convention, the feature set is named X.
drop_X_train = X_train.select_dtypes(exclude=['object'])
drop_X_valid = X_valid.select_dtypes(exclude=['object'])
print("MAE值:")
print(score_dataset(drop_X_train, drop_X_valid, y_train, y_valid))
from sklearn.preprocessing import OrdinalEncoder
# Make copies to avoid changing the original data
label_X_train = X_train.copy()
label_X_valid = X_valid.copy()
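# object_cols is assumed to be defined earlier in the course notebook;
# a typical definition:
object_cols = [col for col in X_train.columns if X_train[col].dtype == "object"]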
# Apply the ordinal encoder to each column with categorical data
ordinal_encoder = OrdinalEncoder()
label_X_train[object_cols] = ordinal_encoder.fit_transform(X_train[object_cols])
label_X_valid[object_cols] = ordinal_encoder.transform(X_valid[object_cols])
print("MAE值:")
print(score_dataset(label_X_train, label_X_valid, y_train, y_valid))
from sklearn.preprocessing import OneHotEncoder
# Apply the one-hot encoder to each column with categorical data
OH_encoder = OneHotEncoder(handle_unknown='ignore', sparse=False)  # newer scikit-learn uses sparse_output=False
OH_cols_train = pd.DataFrame(OH_encoder.fit_transform(X_train[object_cols]))
OH_cols_valid = pd.DataFrame(OH_encoder.transform(X_valid[object_cols]))
# Restore the index (one-hot encoding removed it)
OH_cols_train.index = X_train.index
OH_cols_valid.index = X_valid.index
# Remove the original categorical columns (e.g. a Color column)
num_X_train = X_train.drop(object_cols, axis=1)
num_X_valid = X_valid.drop(object_cols, axis=1)
# Add the one-hot encoded columns (e.g. Red/Yellow/Green)
OH_X_train = pd.concat([num_X_train, OH_cols_train], axis=1)
OH_X_valid = pd.concat([num_X_valid, OH_cols_valid], axis=1)
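# Note: recent scikit-learn versions require all column names to be strings,
# and the one-hot columns created above have integer names, so cast them
# (this step is only needed on newer versions):
OH_X_train.columns = OH_X_train.columns.astype(str)
OH_X_valid.columns = OH_X_valid.columns.astype(str)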
print("MAE值:")
print(score_dataset(OH_X_train, OH_X_valid, y_train, y_valid))
import pandas as pd
df = pd.DataFrame([
['green', 'A'],
['red', 'B'],
['blue', 'A']])
df.columns = ['color', 'class']
print(df)
Result:
color class
0 green A
1 red B
2 blue A
df = pd.get_dummies(df, columns=["color"])
print(df)
Result:
class color_blue color_green color_red
0 A 0 1 0
1 B 0 0 1
2 A 1 0 0
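get_dummies can also drop the first category of each encoded column via drop_first=True, which removes one redundant (perfectly predictable) dummy column; a small sketch rebuilding the same frame:
df = pd.DataFrame([['green', 'A'], ['red', 'B'], ['blue', 'A']], columns=['color', 'class'])
print(pd.get_dummies(df, columns=['color'], drop_first=True))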
import pandas as pd
import numpy as np
s1 = pd.Series(['A', 7, 6, 3, 4, 1, 2, 3, 5, 4, 1, 1])
print('Unique values, s1.unique():', s1.unique())
print('Number of unique values, len(s1.unique()):', len(s1.unique()))
print('Number of unique values, s1.nunique():', s1.nunique())
# When NaN / None values are present
print('='*30)
s2 = pd.Series(['A', 7, 6, 3, np.nan, np.nan, 4, 1, 2, 3, 5, 4, 1, 1, pd.NaT, None])  # np.NAN/np.NaN are deprecated aliases; use np.nan
print('Unique values, s2.unique():', s2.unique())
print('Number of unique values, len(s2.unique()):', len(s2.unique()))
print('Number of unique values, s2.nunique():', s2.nunique())
print('Number of unique values (counting nulls), s2.nunique(dropna=False):', s2.nunique(dropna=False))
print('Number of unique values (excluding nulls), s2.nunique(dropna=True):', s2.nunique(dropna=True))
Result:
Unique values, s1.unique(): ['A' 7 6 3 4 1 2 5]
Number of unique values, len(s1.unique()): 8
Number of unique values, s1.nunique(): 8
==============================
Unique values, s2.unique(): ['A' 7 6 3 nan 4 1 2 5 NaT None]
Number of unique values, len(s2.unique()): 11
Number of unique values, s2.nunique(): 8
Number of unique values (counting nulls), s2.nunique(dropna=False): 11
Number of unique values (excluding nulls), s2.nunique(dropna=True): 8
import numpy as np
df = np.random.randn(12)
print(df)
Result (note: the reshape outputs below come from a separate run, so the values differ):
[-0.17784745 0.65779432 0.1805618 -2.19602499 0.00607502 -0.66123608
-0.91577412 -0.67034686 0.20401882 0.79440181 1.01153642 -1.25569377]
df2 = df.reshape(3, 4)
print(df2)
Result:
[[-0.82830849 0.03707941 -1.11532038 -1.34872846]
[ 0.64435623 -1.62929858 -0.80895497 1.03181436]
[-0.515589 -0.51417676 0.26556107 0.90369897]]
df3 = df.reshape(-1, 1)
print(df3)
Result:
[[-0.82830849]
[ 0.03707941]
[-1.11532038]
[-1.34872846]
[ 0.64435623]
[-1.62929858]
[-0.80895497]
[ 1.03181436]
[-0.515589 ]
[-0.51417676]
[ 0.26556107]
[ 0.90369897]]
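The -1 asks NumPy to infer that dimension from the array's size, so reshape(-1, 1) turns any 1-D array into a single column. A quick sketch:
a = np.arange(6)
print(a.reshape(-1, 1).shape)  # (6, 1): the -1 is inferred as 6
print(a.reshape(2, -1).shape)  # (2, 3): the -1 is inferred as 3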
import numpy as np
df = np.random.randn(27)
print(df)
df2 = df.reshape(3, 3, 3)
print(df2)
Result:
[-1.07596492 0.27042494 0.76922574 0.24121245 -0.18644408 0.88443699
0.1103836 1.04733736 -1.03461271 1.03084316 -0.36068049 0.28790821
-0.01229269 -1.76622567 0.12916125 1.47422885 -1.06993507 1.97568836
-1.5479976 -0.17388855 -0.4299137 1.34085045 -0.87184133 2.33509748
-1.32298034 0.77624622 0.60807644]
[[[-1.07596492 0.27042494 0.76922574]
[ 0.24121245 -0.18644408 0.88443699]
[ 0.1103836 1.04733736 -1.03461271]]
[[ 1.03084316 -0.36068049 0.28790821]
[-0.01229269 -1.76622567 0.12916125]
[ 1.47422885 -1.06993507 1.97568836]]
[[-1.5479976 -0.17388855 -0.4299137 ]
[ 1.34085045 -0.87184133 2.33509748]
[-1.32298034 0.77624622 0.60807644]]]
import numpy as np
df = np.random.randn(120)
print(df)
df2 = df.reshape(2, 3, 4, 5)
print(df2)
Result:
[-0.8488991 -1.43473636 -1.34970231 -0.24481118 0.31764633 0.14514851
0.38238256 0.53851275 0.75027389 -0.94441815 -0.4415453 -1.70829409
-1.49105147 -0.2922517 0.16547985 0.63473937 0.23333589 -0.97134231
-0.16172712 0.46714426 -0.30596387 0.2899302 0.44195535 -0.73779196
-1.13248888 0.89841681 1.97206531 -0.09296597 1.72234349 -1.48621945
0.84393324 -0.06435082 -0.7072044 0.73144364 -0.2834176 -1.21938154
0.31254625 0.37791245 0.24216449 0.99774761 1.17480164 -0.73960522
-1.410521 -0.19231122 -1.98715833 -1.91236891 -0.76014017 0.2675738
-1.08170696 -1.43132475 -0.95395155 1.58220867 -0.03424183 -0.9581383
1.05898571 -0.76216662 1.85830849 -0.28745795 -1.14517515 1.18003285
1.93009716 -1.00053648 -1.63087727 -1.37146257 0.74471147 0.25955097
0.71043632 -0.11985199 -0.99784999 1.05734577 1.58113723 -0.33510501
-1.14423267 -1.74070268 -0.8160744 -0.39958888 -0.84590673 -1.87116062
-1.51583484 2.26539864 0.56579071 -0.17420101 0.09740256 0.19449168
-0.36205542 -1.42606861 -1.39325941 -1.6512132 -0.51424623 0.63769629
-0.67241982 -0.52715576 0.4604346 -1.06763685 -0.28290645 -1.40776408
0.91156968 -0.28360106 0.0344332 0.65260535 0.35402486 -0.37102618
2.31627643 -0.52852038 -0.44959947 0.05434873 1.62119897 1.46701724
0.15223667 -0.98622093 1.26660794 0.90424614 -0.48575745 2.05381947
1.14561646 -0.15025998 0.52653924 2.5447094 0.98145319 0.45000227]
[[[[-0.8488991 -1.43473636 -1.34970231 -0.24481118 0.31764633]
[ 0.14514851 0.38238256 0.53851275 0.75027389 -0.94441815]
[-0.4415453 -1.70829409 -1.49105147 -0.2922517 0.16547985]
[ 0.63473937 0.23333589 -0.97134231 -0.16172712 0.46714426]]
[[-0.30596387 0.2899302 0.44195535 -0.73779196 -1.13248888]
[ 0.89841681 1.97206531 -0.09296597 1.72234349 -1.48621945]
[ 0.84393324 -0.06435082 -0.7072044 0.73144364 -0.2834176 ]
[-1.21938154 0.31254625 0.37791245 0.24216449 0.99774761]]
[[ 1.17480164 -0.73960522 -1.410521 -0.19231122 -1.98715833]
[-1.91236891 -0.76014017 0.2675738 -1.08170696 -1.43132475]
[-0.95395155 1.58220867 -0.03424183 -0.9581383 1.05898571]
[-0.76216662 1.85830849 -0.28745795 -1.14517515 1.18003285]]]
[[[ 1.93009716 -1.00053648 -1.63087727 -1.37146257 0.74471147]
[ 0.25955097 0.71043632 -0.11985199 -0.99784999 1.05734577]
[ 1.58113723 -0.33510501 -1.14423267 -1.74070268 -0.8160744 ]
[-0.39958888 -0.84590673 -1.87116062 -1.51583484 2.26539864]]
[[ 0.56579071 -0.17420101 0.09740256 0.19449168 -0.36205542]
[-1.42606861 -1.39325941 -1.6512132 -0.51424623 0.63769629]
[-0.67241982 -0.52715576 0.4604346 -1.06763685 -0.28290645]
[-1.40776408 0.91156968 -0.28360106 0.0344332 0.65260535]]
[[ 0.35402486 -0.37102618 2.31627643 -0.52852038 -0.44959947]
[ 0.05434873 1.62119897 1.46701724 0.15223667 -0.98622093]
[ 1.26660794 0.90424614 -0.48575745 2.05381947 1.14561646]
[-0.15025998 0.52653924 2.5447094 0.98145319 0.45000227]]]]
data = np.array([1, 2, 3.0, 4.11])
a = data
print(a.dtype)
print(a.astype('uint8').dtype)
print(a)
print(a.astype('int').dtype)
print(a)
Result:
float64
uint8
[1. 2. 3. 4.11]
int32
[1. 2. 3. 4.11]
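As the output shows, astype does not modify the array in place: it returns a new array, so a itself still holds the original floats. To keep the converted values, assign the result, as in this sketch:
b = data.astype('int')
print(b)        # [1 2 3 4]; fractional parts are truncated toward zero
print(b.dtype)  # int32 or int64, depending on the platform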
from sklearn.tree import DecisionTreeRegressor
# Define the model; specifying a number for random_state ensures the same results on every run
iowa_model = DecisionTreeRegressor(random_state=1)
# Prediction target: the sale price
y = home_data.SalePrice
# Model features
feature_names = ["LotArea", "YearBuilt", "1stFlrSF", "2ndFlrSF",
"FullBath", "BedroomAbvGr", "TotRmsAbvGrd"]
# Define the feature set
X = home_data[feature_names]
from sklearn.model_selection import train_test_split
train_X, val_X, train_y, val_y = train_test_split(X, y, random_state=0)
Other parameters (see the sketch after this list):
- train_size: the fraction of the dataset used for training; if given as an integer, it is the number of training rows
- test_size: the fraction of the dataset used for validation; if given as an integer, it is the number of validation rows
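A minimal sketch of both forms (assuming the X and y defined above):
# 75% of the rows for training, the remaining 25% for validation
train_X, val_X, train_y, val_y = train_test_split(X, y, train_size=0.75, random_state=0)
# or hold out exactly 200 rows for validation
train_X, val_X, train_y, val_y = train_test_split(X, y, test_size=200, random_state=0)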
data_model = pd.read_csv('../input/dl-course-data/red-wine.csv')
# Sample 70% of the rows for training
df_train = data_model.sample(frac=0.7, random_state=0)
# or sample an exact number of rows
df_train = data_model.sample(n=200, random_state=123)
# The validation set is everything not sampled for training
df_valid = data_model.drop(df_train.index)
X_train = df_train.drop('quality', axis=1)
X_valid = df_valid.drop('quality', axis=1)
y_train = df_train['quality']
y_valid = df_valid['quality']
iowa_model.fit(train_X, train_y)
val_predictions = iowa_model.predict(val_X)
from sklearn.metrics import mean_absolute_error
val_mae = mean_absolute_error(val_y, val_predictions)
https://www.kaggle.com/code/hyon666666/exercise-underfitting-and-overfitting?scriptVersionId=119421539
# Code you have previously used to load data
import pandas as pd
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor
# Path of the file to read
iowa_file_path = '../input/home-data-for-ml-course/train.csv'
home_data = pd.read_csv(iowa_file_path)
# Create target object and call it y
y = home_data.SalePrice
# Create X
features = ['LotArea', 'YearBuilt', '1stFlrSF', '2ndFlrSF', 'FullBath', 'BedroomAbvGr', 'TotRmsAbvGrd']
X = home_data[features]
# Split into validation and training data
train_X, val_X, train_y, val_y = train_test_split(X, y, random_state=1)
# Specify Model
iowa_model = DecisionTreeRegressor(random_state=1)
# Fit Model
iowa_model.fit(train_X, train_y)
# Make validation predictions and calculate mean absolute error
val_predictions = iowa_model.predict(val_X)
val_mae = mean_absolute_error(val_predictions, val_y)
print("Validation MAE: {:,.0f}".format(val_mae))
# Set up code checking
from learntools.core import binder
binder.bind(globals())
from learntools.machine_learning.ex5 import *
print("\nSetup complete")
def get_mae(max_leaf_nodes, train_X, val_X, train_y, val_y):
model = DecisionTreeRegressor(max_leaf_nodes=max_leaf_nodes, random_state=0)
model.fit(train_X, train_y)
preds_val = model.predict(val_X)
mae = mean_absolute_error(val_y, preds_val)
    return mae
candidate_max_leaf_nodes = [5, 25, 50, 100, 250, 500]
# Write loop to find the ideal tree size from candidate_max_leaf_nodes
scores = {leaf_size: get_mae(leaf_size, train_X, val_X, train_y, val_y) for leaf_size in candidate_max_leaf_nodes}
# Store the best value of max_leaf_nodes (it will be either 5, 25, 50, 100, 250 or 500)
best_tree_size = min(scores, key=scores.get)
# Fill in argument to make optimal size and uncomment
final_model = DecisionTreeRegressor(max_leaf_nodes=best_tree_size, random_state=1)
# fit the final model and uncomment the next two lines
final_model.fit(X, y)
import pandas as pd
# Load the data
melbourne_file_path = '../input/melbourne-housing-snapshot/melb_data.csv'
melbourne_data = pd.read_csv(melbourne_file_path)
# Drop rows with missing values
melbourne_data = melbourne_data.dropna(axis=0)
# Select the target and features
y = melbourne_data.Price
melbourne_features = ['Rooms', 'Bathroom', 'Landsize', 'BuildingArea',
'YearBuilt', 'Lattitude', 'Longtitude']
X = melbourne_data[melbourne_features]
from sklearn.model_selection import train_test_split
# Split the data into training and validation sets
train_X, val_X, train_y, val_y = train_test_split(X, y, random_state=0)
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error
forest_model = RandomForestRegressor(random_state=1)
forest_model.fit(train_X, train_y)
melb_preds = forest_model.predict(val_X)
print(mean_absolute_error(val_y, melb_preds))
val_mae = mean_absolute_error(val_y, melb_preds)
# Set up code checking
from learntools.core import binder
binder.bind(globals())
from learntools.machine_learning.ex7 import *
# Set up filepaths
import os
if not os.path.exists("../input/train.csv"):
os.symlink("../input/home-data-for-ml-course/train.csv", "../input/train.csv")
os.symlink("../input/home-data-for-ml-course/test.csv", "../input/test.csv")
# Import helpful libraries
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split
# Load the data, and separate the target
iowa_file_path = '../input/train.csv'
home_data = pd.read_csv(iowa_file_path)
y = home_data.SalePrice
# Create X (After completing the exercise, you can return to modify this line!)
features = ['LotArea', 'YearBuilt', '1stFlrSF', '2ndFlrSF', 'FullBath', 'BedroomAbvGr', 'TotRmsAbvGrd']
# Select columns corresponding to features, and preview the data
X = home_data[features]
X.head()
# Split into validation and training data
train_X, val_X, train_y, val_y = train_test_split(X, y, random_state=1)
# Define a random forest model
rf_model = RandomForestRegressor(random_state=1)
rf_model.fit(train_X, train_y)
rf_val_predictions = rf_model.predict(val_X)
rf_val_mae = mean_absolute_error(rf_val_predictions, val_y)
print("Validation MAE for Random Forest Model: {:,.0f}".format(rf_val_mae))
# Set up code checking
import os
if not os.path.exists("../input/train.csv"):
os.symlink("../input/home-data-for-ml-course/train.csv", "../input/train.csv")
os.symlink("../input/home-data-for-ml-course/test.csv", "../input/test.csv")
from learntools.core import binder
binder.bind(globals())
from learntools.ml_intermediate.ex1 import *
print("Setup Complete")
import pandas as pd
from sklearn.model_selection import train_test_split
# Read the data
X_full = pd.read_csv('../input/train.csv', index_col='Id')
X_test_full = pd.read_csv('../input/test.csv', index_col='Id')
# Obtain target and predictors
y = X_full.SalePrice
features = ['LotArea', 'YearBuilt', '1stFlrSF', '2ndFlrSF', 'FullBath', 'BedroomAbvGr', 'TotRmsAbvGrd']
X = X_full[features].copy()
X_test = X_test_full[features].copy()
# Break off validation set from training data
X_train, X_valid, y_train, y_valid = train_test_split(X, y, train_size=0.8, test_size=0.2,
random_state=0)
X_train.head()
'''
LotArea YearBuilt 1stFlrSF 2ndFlrSF FullBath BedroomAbvGr TotRmsAbvGrd
Id
619 11694 2007 1828 0 2 3 9
871 6600 1962 894 0 1 2 5
93 13360 1921 964 0 1 2 5
818 13265 2002 1689 0 2 3 7
303 13704 2001 1541 0 2 3 6
'''
from sklearn.ensemble import RandomForestRegressor
# Define the models
model_1 = RandomForestRegressor(n_estimators=50, random_state=0)
model_2 = RandomForestRegressor(n_estimators=100, random_state=0)
model_3 = RandomForestRegressor(n_estimators=100, criterion='mae', random_state=0)  # 'absolute_error' in newer scikit-learn
model_4 = RandomForestRegressor(n_estimators=200, min_samples_split=20, random_state=0)
model_5 = RandomForestRegressor(n_estimators=100, max_depth=7, random_state=0)
models = [model_1, model_2, model_3, model_4, model_5]
from sklearn.metrics import mean_absolute_error
# Function for comparing different models
def score_model(model, X_t=X_train, X_v=X_valid, y_t=y_train, y_v=y_valid):
model.fit(X_t, y_t)
preds = model.predict(X_v)
return mean_absolute_error(y_v, preds)
for i in range(0, len(models)):
mae = score_model(models[i])
print("Model %d MAE: %d" % (i+1, mae))
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error
# Function for comparing different approaches
def score_dataset(X_train, X_valid, y_train, y_valid):
model = RandomForestRegressor(n_estimators=10, random_state=0)
model.fit(X_train, y_train)
preds = model.predict(X_valid)
return mean_absolute_error(y_valid, preds)
from sklearn.metrics import mean_absolute_error
from sklearn.tree import DecisionTreeRegressor
def get_mae(max_leaf_nodes, train_X, val_X, train_y, val_y):
model = DecisionTreeRegressor(max_leaf_nodes=max_leaf_nodes, random_state=0)
model.fit(train_X, train_y)
preds_val = model.predict(val_X)
mae = mean_absolute_error(val_y, preds_val)
    return mae
# Each max_leaf_nodes value gives a different MAE
for max_leaf_nodes in [5, 50, 500, 5000]:
my_mae = get_mae(max_leaf_nodes, train_X, val_X, train_y, val_y)
print("Max leaf nodes: %d \t\t Mean Absolute Error: %d" %(max_leaf_nodes, my_mae))
Max leaf nodes: 5 Mean Absolute Error: 347380
Max leaf nodes: 50 Mean Absolute Error: 258171
Max leaf nodes: 500 Mean Absolute Error: 243495
Max leaf nodes: 5000 Mean Absolute Error: 254983
From this we can conclude that 500 is a reasonable number of leaf nodes.
# Candidate leaf-node counts
candidate_max_leaf_nodes = [5, 25, 50, 100, 250, 500]
# Compute the MAE for each candidate in a single dict comprehension
scores = {leaf_size: get_mae(leaf_size, train_X, val_X, train_y, val_y) for leaf_size in candidate_max_leaf_nodes}
# Pick the tree size with the lowest MAE
best_tree_size = min(scores, key=scores.get)
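min(scores, key=scores.get) iterates over the dictionary's keys and compares them by their MAE values, so it returns the key with the smallest MAE. A tiny illustration with made-up scores:
toy_scores = {5: 3.5, 25: 2.9, 100: 2.7}
print(min(toy_scores, key=toy_scores.get))  # 100, the key with the smallest value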
A pipeline is a simple way to keep data preprocessing and modeling code organized. Specifically, a pipeline bundles preprocessing and modeling steps so that the whole bundle can be used as if it were a single step.
import pandas as pd
from sklearn.model_selection import train_test_split
# Read the training data
X_full = pd.read_csv('../input/train.csv', index_col='Id')
# Read the test data
X_test_full = pd.read_csv('../input/test.csv', index_col='Id')
# Drop rows where 'SalePrice' is missing
X_full.dropna(axis=0, subset=['SalePrice'], inplace=True)
# Put the 'SalePrice' column into y
y = X_full.SalePrice
# Remove 'SalePrice' from X_full
X_full.drop(['SalePrice'], axis=1, inplace=True)
# Break off a validation set from the training data
X_train_full, X_valid_full, y_train, y_valid = train_test_split(X_full, y,
train_size=0.8, test_size=0.2,
random_state=0)
# Select object-dtype columns with fewer than 10 unique values (low cardinality keeps the categorical encoding manageable)
categorical_cols = [cname for cname in X_train_full.columns if
X_train_full[cname].nunique() < 10 and
X_train_full[cname].dtype == "object"]
# Select the numeric columns ('int64' and 'float64')
numerical_cols = [cname for cname in X_train_full.columns if
X_train_full[cname].dtype in ['int64', 'float64']]
# Keep only the selected columns in the training, validation, and test sets
my_cols = categorical_cols + numerical_cols
X_train = X_train_full[my_cols].copy()
X_valid = X_valid_full[my_cols].copy()
X_test = X_test_full[my_cols].copy()
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error
# Preprocessing for numerical data: impute with the 'constant' strategy
numerical_transformer = SimpleImputer(strategy='constant')
# Preprocessing for categorical data: impute the most frequent value, then one-hot encode
categorical_transformer = Pipeline(steps=[
('imputer', SimpleImputer(strategy='most_frequent')),
('onehot', OneHotEncoder(handle_unknown='ignore'))
])
# Bundle the preprocessing for numerical and categorical data
# numerical_cols and categorical_cols are the column lists selected above
preprocessor = ColumnTransformer(
transformers=[
('num', numerical_transformer, numerical_cols),
('cat', categorical_transformer, categorical_cols)
])
# Define a random forest model
model = RandomForestRegressor(n_estimators=100, random_state=0)
# Bundle the preprocessing and modeling code in a pipeline
clf = Pipeline(steps=[('preprocessor', preprocessor),
('model', model)
])
# Fit the model
clf.fit(X_train, y_train)
# Make predictions on the validation data
preds = clf.predict(X_valid)
# Evaluate the model
print('MAE:', mean_absolute_error(y_valid, preds))
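Because the fitted pipeline applies the same preprocessing to any new data, generating test predictions for a submission takes a single call. A sketch following the submission pattern shown earlier (X_test was built above with 'Id' as its index):
preds_test = clf.predict(X_test)
output = pd.DataFrame({'Id': X_test.index, 'SalePrice': preds_test})
output.to_csv('submission.csv', index=False)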
# An alternative: make_column_transformer with column selectors
from sklearn.compose import make_column_transformer, make_column_selector
from sklearn.preprocessing import StandardScaler, OneHotEncoder
import numpy as np
preprocessor = make_column_transformer(
    (StandardScaler(),
     make_column_selector(dtype_include=np.number)),
    (OneHotEncoder(sparse=False),  # newer scikit-learn uses sparse_output=False
     make_column_selector(dtype_include=object)),
)
X = preprocessor.fit_transform(X)
# Average lot size, rounded to the nearest integer
avg_lot_size = round(home_data['LotArea'].mean())
import datetime
# Age (in years) of the newest home, relative to the current year
newest_home_age = datetime.datetime.now().year - home_data['YearBuilt'].max()