Regression on Structured Data
import pandas as pd
import numpy as np
train=pd.read_csv('datas/los_data.csv') # load the training set
train.get_dtype_counts()
float64 3
int64 35
object 43
dtype: int64
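get_dtype_counts() was removed in pandas 1.0; on newer versions the same breakdown of column dtypes comes from dtypes.value_counts():
train.dtypes.value_counts()   # equivalent on pandas >= 1.0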
train.head(3)
| | Id | MSSubClass | MSZoning | LotFrontage | LotArea | Street | Alley | LotShape | LandContour | Utilities | ... | PoolArea | PoolQC | Fence | MiscFeature | MiscVal | MoSold | YrSold | SaleType | SaleCondition | SalePrice |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1 | 60 | RL | 65.0 | 8450 | Pave | NaN | Reg | Lvl | AllPub | ... | 0 | NaN | NaN | NaN | 0 | 2 | 2008 | WD | Normal | 208500 |
1 | 2 | 20 | RL | 80.0 | 9600 | Pave | NaN | Reg | Lvl | AllPub | ... | 0 | NaN | NaN | NaN | 0 | 5 | 2007 | WD | Normal | 181500 |
2 | 3 | 60 | RL | 68.0 | 11250 | Pave | NaN | IR1 | Lvl | AllPub | ... | 0 | NaN | NaN | NaN | 0 | 9 | 2008 | WD | Normal | 223500 |
3 rows × 81 columns
train.info() # dtypes and non-null counts per column
train.describe() # summary statistics for the numeric columns
| | Id | MSSubClass | LotFrontage | LotArea | OverallQual | OverallCond | YearBuilt | YearRemodAdd | MasVnrArea | BsmtFinSF1 | ... | WoodDeckSF | OpenPorchSF | EnclosedPorch | 3SsnPorch | ScreenPorch | PoolArea | MiscVal | MoSold | YrSold | SalePrice |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
count | 1460.000000 | 1460.000000 | 1201.000000 | 1460.000000 | 1460.000000 | 1460.000000 | 1460.000000 | 1460.000000 | 1452.000000 | 1460.000000 | ... | 1460.000000 | 1460.000000 | 1460.000000 | 1460.000000 | 1460.000000 | 1460.000000 | 1460.000000 | 1460.000000 | 1460.000000 | 1460.000000 |
mean | 730.500000 | 56.897260 | 70.049958 | 10516.828082 | 6.099315 | 5.575342 | 1971.267808 | 1984.865753 | 103.685262 | 443.639726 | ... | 94.244521 | 46.660274 | 21.954110 | 3.409589 | 15.060959 | 2.758904 | 43.489041 | 6.321918 | 2007.815753 | 180921.195890 |
std | 421.610009 | 42.300571 | 24.284752 | 9981.264932 | 1.382997 | 1.112799 | 30.202904 | 20.645407 | 181.066207 | 456.098091 | ... | 125.338794 | 66.256028 | 61.119149 | 29.317331 | 55.757415 | 40.177307 | 496.123024 | 2.703626 | 1.328095 | 79442.502883 |
min | 1.000000 | 20.000000 | 21.000000 | 1300.000000 | 1.000000 | 1.000000 | 1872.000000 | 1950.000000 | 0.000000 | 0.000000 | ... | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 1.000000 | 2006.000000 | 34900.000000 |
25% | 365.750000 | 20.000000 | 59.000000 | 7553.500000 | 5.000000 | 5.000000 | 1954.000000 | 1967.000000 | 0.000000 | 0.000000 | ... | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 5.000000 | 2007.000000 | 129975.000000 |
50% | 730.500000 | 50.000000 | 69.000000 | 9478.500000 | 6.000000 | 5.000000 | 1973.000000 | 1994.000000 | 0.000000 | 383.500000 | ... | 0.000000 | 25.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 6.000000 | 2008.000000 | 163000.000000 |
75% | 1095.250000 | 70.000000 | 80.000000 | 11601.500000 | 7.000000 | 6.000000 | 2000.000000 | 2004.000000 | 166.000000 | 712.250000 | ... | 168.000000 | 68.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 8.000000 | 2009.000000 | 214000.000000 |
max | 1460.000000 | 190.000000 | 313.000000 | 215245.000000 | 10.000000 | 9.000000 | 2010.000000 | 2010.000000 | 1600.000000 | 5644.000000 | ... | 857.000000 | 547.000000 | 552.000000 | 508.000000 | 480.000000 | 738.000000 | 15500.000000 | 12.000000 | 2010.000000 | 755000.000000 |
8 rows × 38 columns
How do we measure each column's missing-value rate?
train.isnull().head()
| | Id | MSSubClass | MSZoning | LotFrontage | LotArea | Street | Alley | LotShape | LandContour | Utilities | ... | PoolArea | PoolQC | Fence | MiscFeature | MiscVal | MoSold | YrSold | SaleType | SaleCondition | SalePrice |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | False | False | False | False | False | False | True | False | False | False | ... | False | True | True | True | False | False | False | False | False | False |
1 | False | False | False | False | False | False | True | False | False | False | ... | False | True | True | True | False | False | False | False | False | False |
2 | False | False | False | False | False | False | True | False | False | False | ... | False | True | True | True | False | False | False | False | False | False |
3 | False | False | False | False | False | False | True | False | False | False | ... | False | True | True | True | False | False | False | False | False | False |
4 | False | False | False | False | False | False | True | False | False | False | ... | False | True | True | True | False | False | False | False | False | False |
5 rows × 81 columns
tmp = train.isnull().sum()   # missing-value count per column
nullSum_data = tmp[tmp>0]    # keep only the columns that actually have missing values
tmp[tmp>0].shape
(19,)
nullSum_data
LotFrontage 259
Alley 1369
MasVnrType 8
MasVnrArea 8
BsmtQual 37
BsmtCond 37
BsmtExposure 38
BsmtFinType1 37
BsmtFinType2 38
Electrical 1
FireplaceQu 690
GarageType 81
GarageYrBlt 81
GarageFinish 81
GarageQual 81
GarageCond 81
PoolQC 1453
Fence 1179
MiscFeature 1406
dtype: int64
[i for i in nullSum_data]
[259,
1369,
8,
8,
37,
37,
38,
37,
38,
1,
690,
81,
81,
81,
81,
81,
1453,
1179,
1406]
[i/len(train) for i in nullSum_data] # missing rate per column = missing count / number of rows
[0.1773972602739726,
0.9376712328767123,
0.005479452054794521,
0.005479452054794521,
0.025342465753424658,
0.025342465753424658,
0.026027397260273973,
0.025342465753424658,
0.026027397260273973,
0.0006849315068493151,
0.4726027397260274,
0.05547945205479452,
0.05547945205479452,
0.05547945205479452,
0.05547945205479452,
0.05547945205479452,
0.9952054794520548,
0.8075342465753425,
0.963013698630137]
tmp=train.isnull().sum().sort_values()
tmp[tmp>0]
Electrical 1
MasVnrType 8
MasVnrArea 8
BsmtQual 37
BsmtCond 37
BsmtFinType1 37
BsmtFinType2 38
BsmtExposure 38
GarageQual 81
GarageFinish 81
GarageYrBlt 81
GarageType 81
GarageCond 81
LotFrontage 259
FireplaceQu 690
Fence 1179
Alley 1369
MiscFeature 1406
PoolQC 1453
dtype: int64
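The count and the rate can also be assembled into a single sorted report; a compact sketch (missing_report is an illustrative name, not from the original notebook):
missing = train.isnull().sum()
missing = missing[missing > 0]
missing_report = pd.DataFrame({
    'missing_count': missing,
    'missing_rate': missing / len(train),
}).sort_values('missing_rate', ascending=False)
missing_report.head()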
train1 = train.copy()   # work on a copy so the original DataFrame stays untouched
train1.mean().head()    # column means; only numeric columns contribute
# train1.median().head() # (median would be the alternative fill value)
Id 730.500000
MSSubClass 56.897260
LotFrontage 70.049958
LotArea 10516.828082
OverallQual 6.099315
dtype: float64
# fillna(train1.mean()) fills each numeric column's NaNs with that column's mean
# (pass train1.median() instead for a median fill); object columns are left untouched
train1 = train1.fillna(train1.mean())
# helper: report how many columns still contain missing values
def getNullCount(train):
    tmp = train.isnull().sum()
    print(tmp[tmp > 0].shape)
getNullCount(train1)
(16,)
A naive fix for the remaining gaps: turn the NaN values (now only in the categorical columns) into an explicit string category such as "None" or "NA".
train1.head()
| | Id | MSSubClass | MSZoning | LotFrontage | LotArea | Street | Alley | LotShape | LandContour | Utilities | ... | PoolArea | PoolQC | Fence | MiscFeature | MiscVal | MoSold | YrSold | SaleType | SaleCondition | SalePrice |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1 | 60 | RL | 65.0 | 8450 | Pave | NaN | Reg | Lvl | AllPub | ... | 0 | NaN | NaN | NaN | 0 | 2 | 2008 | WD | Normal | 208500 |
1 | 2 | 20 | RL | 80.0 | 9600 | Pave | NaN | Reg | Lvl | AllPub | ... | 0 | NaN | NaN | NaN | 0 | 5 | 2007 | WD | Normal | 181500 |
2 | 3 | 60 | RL | 68.0 | 11250 | Pave | NaN | IR1 | Lvl | AllPub | ... | 0 | NaN | NaN | NaN | 0 | 9 | 2008 | WD | Normal | 223500 |
3 | 4 | 70 | RL | 60.0 | 9550 | Pave | NaN | IR1 | Lvl | AllPub | ... | 0 | NaN | NaN | NaN | 0 | 2 | 2006 | WD | Abnorml | 140000 |
4 | 5 | 60 | RL | 84.0 | 14260 | Pave | NaN | IR1 | Lvl | AllPub | ... | 0 | NaN | NaN | NaN | 0 | 12 | 2008 | WD | Normal | 250000 |
5 rows × 81 columns
train1=train1.fillna("None")
train1.head()
| | Id | MSSubClass | MSZoning | LotFrontage | LotArea | Street | Alley | LotShape | LandContour | Utilities | ... | PoolArea | PoolQC | Fence | MiscFeature | MiscVal | MoSold | YrSold | SaleType | SaleCondition | SalePrice |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1 | 60 | RL | 65.0 | 8450 | Pave | None | Reg | Lvl | AllPub | ... | 0 | None | None | None | 0 | 2 | 2008 | WD | Normal | 208500 |
1 | 2 | 20 | RL | 80.0 | 9600 | Pave | None | Reg | Lvl | AllPub | ... | 0 | None | None | None | 0 | 5 | 2007 | WD | Normal | 181500 |
2 | 3 | 60 | RL | 68.0 | 11250 | Pave | None | IR1 | Lvl | AllPub | ... | 0 | None | None | None | 0 | 9 | 2008 | WD | Normal | 223500 |
3 | 4 | 70 | RL | 60.0 | 9550 | Pave | None | IR1 | Lvl | AllPub | ... | 0 | None | None | None | 0 | 2 | 2006 | WD | Abnorml | 140000 |
4 | 5 | 60 | RL | 84.0 | 14260 | Pave | None | IR1 | Lvl | AllPub | ... | 0 | None | None | None | 0 | 12 | 2008 | WD | Normal | 250000 |
5 rows × 81 columns
getNullCount(train1) # all missing values have now been filled
(0,)
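Filling everything left over with the string "None" works here only because, after the mean fill, the remaining NaNs all sit in object (categorical) columns. A more explicit sketch treats the two kinds of columns separately (num_cols, cat_cols and train_filled are illustrative names, not from the original notebook):
# fill numeric columns with their median, categorical columns with the string "None"
num_cols = train.select_dtypes(include=[np.number]).columns
cat_cols = train.select_dtypes(include=['object']).columns
train_filled = train.copy()
train_filled[num_cols] = train_filled[num_cols].fillna(train_filled[num_cols].median())
train_filled[cat_cols] = train_filled[cat_cols].fillna('None')
getNullCount(train_filled)   # should print (0,)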
Three things to sort out before building the feature matrix:
1. Separate out the label y (SalePrice).
2. Drop the Id column: it is only a row identifier (pandas already keeps its own index) and carries no predictive information.
3. Never leave the answer (SalePrice) inside the features, otherwise the model would be trained on its own target.
y = train1['SalePrice'] # the label
train1 = train1.drop(['Id', 'SalePrice'],axis=1) # drop the identifier and the label from the features
train1.head()
| | MSSubClass | MSZoning | LotFrontage | LotArea | Street | Alley | LotShape | LandContour | Utilities | LotConfig | ... | ScreenPorch | PoolArea | PoolQC | Fence | MiscFeature | MiscVal | MoSold | YrSold | SaleType | SaleCondition |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 60 | RL | 65.0 | 8450 | Pave | None | Reg | Lvl | AllPub | Inside | ... | 0 | 0 | None | None | None | 0 | 2 | 2008 | WD | Normal |
1 | 20 | RL | 80.0 | 9600 | Pave | None | Reg | Lvl | AllPub | FR2 | ... | 0 | 0 | None | None | None | 0 | 5 | 2007 | WD | Normal |
2 | 60 | RL | 68.0 | 11250 | Pave | None | IR1 | Lvl | AllPub | Inside | ... | 0 | 0 | None | None | None | 0 | 9 | 2008 | WD | Normal |
3 | 70 | RL | 60.0 | 9550 | Pave | None | IR1 | Lvl | AllPub | Corner | ... | 0 | 0 | None | None | None | 0 | 2 | 2006 | WD | Abnorml |
4 | 60 | RL | 84.0 | 14260 | Pave | None | IR1 | Lvl | AllPub | FR2 | ... | 0 | 0 | None | None | None | 0 | 12 | 2008 | WD | Normal |
5 rows × 79 columns
X = pd.get_dummies(train1) # one-hot encode the categorical columns into numeric indicator columns
X.head()
| | MSSubClass | LotFrontage | LotArea | OverallQual | OverallCond | YearBuilt | YearRemodAdd | MasVnrArea | BsmtFinSF1 | BsmtFinSF2 | ... | SaleType_ConLw | SaleType_New | SaleType_Oth | SaleType_WD | SaleCondition_Abnorml | SaleCondition_AdjLand | SaleCondition_Alloca | SaleCondition_Family | SaleCondition_Normal | SaleCondition_Partial |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 60 | 65.0 | 8450 | 7 | 5 | 2003 | 2003 | 196.0 | 706 | 0 | ... | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 |
1 | 20 | 80.0 | 9600 | 6 | 8 | 1976 | 1976 | 0.0 | 978 | 0 | ... | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 |
2 | 60 | 68.0 | 11250 | 7 | 5 | 2001 | 2002 | 162.0 | 486 | 0 | ... | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 |
3 | 70 | 60.0 | 9550 | 7 | 5 | 1915 | 1970 | 0.0 | 216 | 0 | ... | 0 | 0 | 0 | 1 | 1 | 0 | 0 | 0 | 0 | 0 |
4 | 60 | 84.0 | 14260 | 8 | 5 | 2000 | 2000 | 350.0 | 655 | 0 | ... | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 |
5 rows × 303 columns
X.shape
(1460, 303)
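get_dummies leaves numeric columns alone and expands each object column into one indicator column per category, which is how 79 raw columns become 303. A toy illustration on a made-up frame:
toy = pd.DataFrame({'Street': ['Pave', 'Grvl', 'Pave'], 'LotArea': [8450, 9600, 11250]})
pd.get_dummies(toy)
#    LotArea  Street_Grvl  Street_Pave
# 0     8450            0            1
# 1     9600            1            0
# 2    11250            0            1
# (pandas 2.x returns the indicators as booleans rather than 0/1 integers)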
Hold out part of the data for evaluation.
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.2,random_state=123)
X_train.shape
(1168, 303)
X_test.shape
(292, 303)
import xgboost as xgb
# note: 'sum_sample' is a typo for 'subsample'; the unknown keyword has no effect, so subsample
# stays at its default of 1 here (see the printed model below). xg_reg2 below uses the corrected name.
xg_reg=xgb.XGBRegressor(objective='reg:linear',colsample_bytree=0.6,learning_rate=0.01,max_depth=8,alpha=10,n_estimators=700,sum_sample=0.7)
xg_reg.fit(X_train, y_train)
XGBRegressor(alpha=10, base_score=0.5, booster='gbtree', colsample_bylevel=1,
colsample_bytree=0.6, gamma=0, importance_type='gain',
learning_rate=0.01, max_delta_step=0, max_depth=8,
min_child_weight=1, missing=None, n_estimators=700, n_jobs=1,
nthread=None, objective='reg:linear', random_state=0, reg_alpha=0,
reg_lambda=1, scale_pos_weight=1, seed=None, silent=True,
subsample=1, sum_sample=0.7)
from sklearn.metrics import mean_squared_error
pred=xg_reg.predict(X_test)
rmse=np.sqrt(mean_squared_error(y_test,pred))
rmse
26348.6994479421
logrmse=np.sqrt(mean_squared_error(np.log(y_test),np.log(pred))) # RMSE in log space, so the error is relative to the price scale
logrmse
0.11996319328290858
# second attempt: shallower trees (max_depth=5), more boosting rounds, and the corrected subsample=0.7
xg_reg2=xgb.XGBRegressor(objective='reg:linear',colsample_bytree=0.6,learning_rate=0.01,max_depth=5,alpha=10,n_estimators=3000,subsample=0.7,random_state=123)
xg_reg2.fit(X_train, y_train)
XGBRegressor(alpha=10, base_score=0.5, booster='gbtree', colsample_bylevel=1,
colsample_bytree=0.6, gamma=0, importance_type='gain',
learning_rate=0.01, max_delta_step=0, max_depth=5,
min_child_weight=1, missing=None, n_estimators=3000, n_jobs=1,
nthread=None, objective='reg:linear', random_state=123, reg_alpha=0,
reg_lambda=1, scale_pos_weight=1, seed=None, silent=True,
subsample=0.7)
pred2=xg_reg2.predict(X_test)
logrmse=np.sqrt(mean_squared_error(np.log(y_test),np.log(pred2)))
logrmse
0.10447489682428332
pd.get_dummies(train1).shape
(1460, 303)
# grid-search sketch (the actual scikit-learn class is GridSearchCV):
# gs = GridSearchCV(xg_reg, {
#     "n_estimators": [100, 500, 1000, 3000],
#     "alpha": [0.01, 0.1, 1.0, 10],
#     "lambda": [...]
# })
Grid search evaluates all m*n parameter combinations; randomized search samples candidates instead, so the unpromising points are simply never tried.
# randomized-search sketch (the actual class is RandomizedSearchCV; np.norm does not exist,
# continuous distributions come from scipy.stats, e.g. scipy.stats.norm(1.0, 0.7)):
# rs = RandomizedSearchCV(xg_reg, {
#     "n_estimators": [100, 500, 1000, 3000],
#     "alpha": np.norm(1.0, 0.7),
#     "lambda": [...]
# })
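A runnable version of the randomized-search idea, as a sketch with illustrative (untuned) parameter ranges; reg_alpha is the sklearn-wrapper name for alpha, and scipy is assumed to be installed:
from scipy.stats import randint, uniform
from sklearn.model_selection import RandomizedSearchCV

param_dist = {
    'n_estimators': randint(100, 3000),   # integers sampled uniformly from [100, 3000)
    'max_depth': randint(3, 9),
    'reg_alpha': uniform(0.01, 10),       # L1 regularization sampled from [0.01, 10.01)
    'subsample': uniform(0.5, 0.5),       # sampled from [0.5, 1.0)
}
rs = RandomizedSearchCV(xgb.XGBRegressor(objective='reg:linear', learning_rate=0.01),
                        param_distributions=param_dist,
                        n_iter=20,        # only 20 sampled combinations instead of the full grid
                        scoring='neg_mean_squared_error',
                        cv=5, random_state=123)
rs.fit(X_train, y_train)                   # this can take a few minutes
rs.best_params_, np.sqrt(-rs.best_score_)  # best setting and its cross-validated RMSE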
params = {
"objective":"reg:linear",'colsample_bytree': 0.7,'learning_rate': 0.1,
'max_depth': 5, 'alpha': 10}
# DMatrix is a compressed data structure that XGBoost consumes directly (similar in spirit to formats
# like .h5/.lmdb); it optimizes reading the training data and speeds up training
matrix=xgb.DMatrix(data=X,label=y)
# the dataset itself is small, so use 10-fold cross-validation
# no separate test set is needed: each fold trains and evaluates on the full data in turn, and the fold scores are averaged
X.shape
(1460, 303)
cv_results=xgb.cv(dtrain=matrix,params=params,nfold=10,num_boost_round=500,
metrics='rmse',as_pandas=True, verbose_eval=False)
# decision-tree learning has two phases, splitting and pruning (pre-pruning and post-pruning); both exist to prevent overfitting
# verbose_eval=False turns off the per-round training log
type(cv_results)
pandas.core.frame.DataFrame
cv_results.tail()
| | test-rmse-mean | test-rmse-std | train-rmse-mean | train-rmse-std |
|---|---|---|---|---|
495 | 26897.374219 | 6992.056793 | 1675.321265 | 103.741632 |
496 | 26897.469727 | 6991.416821 | 1667.588147 | 104.618191 |
497 | 26897.067188 | 6991.073168 | 1661.767676 | 105.295643 |
498 | 26897.811133 | 6991.131930 | 1656.616650 | 105.007822 |
499 | 26897.296680 | 6991.362384 | 1650.411279 | 106.048964 |
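By round ~500 the CV test RMSE has flattened while the train RMSE keeps falling, so extra rounds mostly add overfitting. A follow-up sketch (reusing the same matrix and params) reads off the best round and lets xgb.cv stop early on its own:
best_round = cv_results['test-rmse-mean'].idxmin()  # boosting round with the lowest CV RMSE
best_rmse = cv_results['test-rmse-mean'].min()
print(best_round, best_rmse)

# or let xgb.cv stop once the CV RMSE has not improved for 50 consecutive rounds
cv_early = xgb.cv(dtrain=matrix, params=params, nfold=10, num_boost_round=500,
                  metrics='rmse', early_stopping_rounds=50,
                  as_pandas=True, verbose_eval=False)
len(cv_early)   # number of boosting rounds actually kept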