数据分析之房价预测(机器学习,sklearn)

比赛提供的数据集是某个地区的房屋数据,要求运用机器学习的知识对房价(即数据中的月租金)给出合理的预测

首先导入所需要的包

import pandas as pd
import numpy as np
import xgboost as xgb
import seaborn as sns
sns.set_style('whitegrid',{'font.sans-serif':['simhei','Arial']}) # Set the sans-serif font list so Chinese text in plots is rendered instead of garbled glyphs
import matplotlib
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn import preprocessing
from sklearn import linear_model, svm, gaussian_process
from sklearn.ensemble import RandomForestRegressor
from sklearn.cross_validation import train_test_split
import lightgbm as lgb
from sklearn import cross_validation, metrics
from sklearn.linear_model import LinearRegression
c:\program files\python36\lib\site-packages\sklearn\cross_validation.py:41: DeprecationWarning: This module was deprecated in version 0.18 in favor of the model_selection module into which all the refactored classes and functions are moved. Also note that the interface of the new CV iterators are different from that of this module. This module will be removed in 0.20.
  "This module will be removed in 0.20.", DeprecationWarning)
# Load the training and test tables from the working directory.
train_data, test_data = (pd.read_csv(csv_path) for csv_path in ('./train.csv', './test.csv'))
# Preview the first rows of the training table.
train_data.head()
时间 小区名 小区房屋出租数量 楼层 总楼层 房屋面积 房屋朝向 居住状态 卧室数量 厅的数量 卫的数量 出租方式 位置 地铁线路 地铁站点 距离 装修情况 月租金
0 1 3072 0.128906 2 0.236364 0.008628 东南 NaN 1 1 1 NaN 11.0 118.0 2.0 40.0 0.764167 NaN 5.602716
1 1 3152 0.132812 1 0.381818 0.017046 NaN 1 0 0 NaN 10.0 100.0 4.0 58.0 0.709167 NaN 16.977929
2 1 5575 0.042969 0 0.290909 0.010593 东南 NaN 2 1 2 NaN 12.0 130.0 5.0 37.0 0.572500 NaN 8.998302
3 1 3103 0.085938 2 0.581818 0.019199 NaN 3 2 2 NaN 7.0 90.0 2.0 63.0 0.658333 NaN 5.602716
4 1 5182 0.214844 0 0.545455 0.010427 东北 NaN 2 1 1 NaN 3.0 31.0 NaN NaN NaN NaN 7.300509
# Summary statistics (count, mean, std, quartiles, ...) of the training table.
train_data.describe()
时间 小区名 小区房屋出租数量 楼层 总楼层 房屋面积 居住状态 卧室数量 厅的数量 卫的数量 出租方式 位置 地铁线路 地铁站点 距离 装修情况 月租金
count 196539.000000 196539.000000 195538.000000 196539.000000 196539.000000 196539.000000 20138.000000 196539.000000 196539.000000 196539.000000 24230.000000 196508.000000 196508.000000 91778.000000 91778.000000 91778.000000 18492.000000 196539.000000
mean 2.115229 3224.116562 0.124151 0.955449 0.408711 0.013139 2.725196 2.236635 1.299625 1.223818 0.900289 7.905139 67.945982 3.284850 57.493735 0.551202 3.589228 7.949313
std 0.786980 2023.073726 0.133299 0.851511 0.183100 0.008104 0.667763 0.896961 0.613169 0.487234 0.299621 4.025696 43.522394 1.477147 35.191414 0.247268 1.996912 6.310609
min 1.000000 0.000000 0.007812 0.000000 0.000000 0.000000 1.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 1.000000 1.000000 0.001667 1.000000 0.000000
25% 1.000000 1388.000000 0.039062 0.000000 0.290909 0.009268 3.000000 2.000000 1.000000 1.000000 1.000000 4.000000 33.000000 2.000000 23.000000 0.356667 2.000000 4.923599
50% 2.000000 3086.000000 0.082031 1.000000 0.418182 0.012910 3.000000 2.000000 1.000000 1.000000 1.000000 9.000000 61.000000 4.000000 59.000000 0.554167 2.000000 6.621392
75% 3.000000 5199.000000 0.160156 2.000000 0.563636 0.014896 3.000000 3.000000 2.000000 1.000000 1.000000 11.000000 103.000000 5.000000 87.000000 0.745833 6.000000 8.998302
max 3.000000 6627.000000 1.000000 2.000000 1.000000 1.000000 3.000000 11.000000 8.000000 8.000000 1.000000 14.000000 152.000000 5.000000 119.000000 1.000000 6.000000 100.000000
# Print column dtypes and non-null counts to see which columns have missing values.
train_data.info()

RangeIndex: 196539 entries, 0 to 196538
Data columns (total 19 columns):
时间          196539 non-null int64
小区名         196539 non-null int64
小区房屋出租数量    195538 non-null float64
楼层          196539 non-null int64
总楼层         196539 non-null float64
房屋面积        196539 non-null float64
房屋朝向        196539 non-null object
居住状态        20138 non-null float64
卧室数量        196539 non-null int64
厅的数量        196539 non-null int64
卫的数量        196539 non-null int64
出租方式        24230 non-null float64
区           196508 non-null float64
位置          196508 non-null float64
地铁线路        91778 non-null float64
地铁站点        91778 non-null float64
距离          91778 non-null float64
装修情况        18492 non-null float64
月租金         196539 non-null float64
dtypes: float64(12), int64(6), object(1)
memory usage: 28.5+ MB
# Pairwise correlation between the numeric columns, drawn as a heatmap.
# Non-numeric columns (the text column '房屋朝向') are excluded explicitly:
# older pandas dropped them silently inside corr(), newer pandas raises on them.
corrmat = train_data.select_dtypes(include=[np.number]).corr()
f, ax = plt.subplots(figsize=(20, 9))
# Cap the colour scale at 0.8 and draw on the axes created above.
sns.heatmap(corrmat, vmax=0.8, square=True, ax=ax)

# Baseline: ordinary least-squares regression on the raw numeric columns.
train_data=pd.read_csv('./train.csv')
test_data=pd.read_csv('./test.csv')
train_data=pd.DataFrame(train_data).fillna(0)
# Regression target as an (n_samples, 1) column vector.
result=train_data['月租金'].values.reshape(-1,1)
# Use every training column except the text column and the target itself.
# Leaving '月租金' among the features lets the model copy the target, and the
# test table (which has 'id' but no '月租金') would then be column-misaligned.
feature_columns=[col for col in train_data.columns if col not in ('房屋朝向','月租金')]
# Select the same columns, in the same order, from both tables.
train_data=train_data[feature_columns].values
test_data=test_data[feature_columns].fillna(0).values
lr=LinearRegression()
lr.fit(train_data,result)
LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)
# Check the (rows, columns) shape of the training matrix.
train_data.shape
(196539, 18)
# Predict on the test matrix with the fitted linear model and display the result.
prediction=lr.predict(test_data)
prediction
array([[ 6.00000000e+00],
       [-3.05193784e-11],
       [-1.35930288e-11],
       ...,
       [-5.85045654e-11],
       [-1.61906537e-10],
       [-1.27143911e-10]])
# Columns removed before feature engineering.
Del_columns = ['时间', '小区名', '小区房屋出租数量', '居住状态', '月租金', '装修情况']
train_data = pd.read_csv('train.csv')
test_data = pd.read_csv('test.csv')
# Training table: drop every listed column.
train_data = train_data.drop(Del_columns, axis=1)
# Test table: drop the same columns except '月租金', and additionally drop 'id'.
test_data = test_data.drop([name for name in Del_columns if name != '月租金'] + ['id'], axis=1)
train_data.head()
楼层 总楼层 房屋面积 房屋朝向 卧室数量 厅的数量 卫的数量 出租方式 位置 地铁线路 地铁站点 距离
0 2 0.236364 0.008628 东南 1 1 1 NaN 11.0 118.0 2.0 40.0 0.764167
1 1 0.381818 0.017046 1 0 0 NaN 10.0 100.0 4.0 58.0 0.709167
2 0 0.290909 0.010593 东南 2 1 2 NaN 12.0 130.0 5.0 37.0 0.572500
3 2 0.581818 0.019199 3 2 2 NaN 7.0 90.0 2.0 63.0 0.658333
4 0 0.545455 0.010427 东北 2 1 1 NaN 3.0 31.0 NaN NaN NaN
# Preview the first rows of the test table after dropping columns.
test_data.head()
楼层 总楼层 房屋面积 房屋朝向 卧室数量 厅的数量 卫的数量 出租方式 位置 地铁线路 地铁站点 距离
0 1 0.600000 0.007117 2 1 1 1.0 10.0 5.0 NaN NaN NaN
1 1 0.472727 0.007448 2 1 1 NaN 3.0 0.0 NaN NaN NaN
2 2 0.709091 0.014068 东南 3 2 2 NaN 10.0 9.0 4.0 74.0 0.400833
3 0 0.090909 0.008937 2 1 1 NaN 6.0 96.0 5.0 17.0 0.384167
4 1 0.218182 0.008606 东南 2 1 1 NaN 6.0 61.0 3.0 114.0 0.598333
def split_map(str):
    """Trim surrounding whitespace from *str* and split it on single spaces.

    Returns the list of tokens; consecutive inner spaces produce empty
    tokens because the split separator is exactly one space.
    """
    trimmed = str.strip()
    tokens = trimmed.split(' ')
    return tokens
def check_bool(arr,str):
    """Return a list of booleans telling, per element of *arr*, whether it contains *str*.

    Each element is tested with the ``in`` operator, so for list elements this
    is exact-item membership and for string elements it is a substring test.
    The result has one entry per element of *arr*, in the same order.
    """
    return [str in item for item in arr]
def processData(data):
    """Replace the '房屋朝向' (orientation) column with 0/1 indicator columns.

    The orientation text is stripped and split on single spaces; each of the
    eight direction labels gets its own column that is 1 when that exact label
    is one of the tokens and 0 otherwise. Tokens are matched whole, so '东南'
    sets only the '东南' column, not '东' or '南'.

    Fixes the earlier version, which wrote the '东南' matches into the '东北'
    column (and vice versa) and likewise swapped '西南' and '西北'.

    Args:
        data: DataFrame containing a '房屋朝向' column. It is modified in place.

    Returns:
        The same DataFrame, with the eight indicator columns appended and
        '房屋朝向' removed.
    """
    directions = ['东', '南', '西', '北', '东南', '东北', '西南', '西北']
    # Non-string entries (e.g. NaN for a missing orientation) yield no tokens,
    # so such rows simply get 0 in every indicator column.
    tokens = data['房屋朝向'].map(
        lambda text: text.strip().split(' ') if isinstance(text, str) else [])
    # Create all indicator columns first so their order is fixed.
    for direction in directions:
        data[direction] = 0
    # Each mask is written to the column of the same name.
    for direction in directions:
        mask = [direction in parts for parts in tokens]
        data.loc[mask, direction] = 1
    del data['房屋朝向']
    return data
# Replace the orientation column with 0/1 direction columns in both tables,
# then preview the training table.
train_data=processData(train_data)
test_data=processData(test_data)
train_data.head()
楼层 总楼层 房屋面积 卧室数量 厅的数量 卫的数量 出租方式 位置 地铁线路 地铁站点 距离 西 东南 东北 西南 西北
0 2 0.236364 0.008628 1 1 1 NaN 11.0 118.0 2.0 40.0 0.764167 0 0 0 0 0 1 0 0
1 1 0.381818 0.017046 1 0 0 NaN 10.0 100.0 4.0 58.0 0.709167 1 0 0 0 0 0 0 0
2 0 0.290909 0.010593 2 1 2 NaN 12.0 130.0 5.0 37.0 0.572500 0 0 0 0 0 1 0 0
3 2 0.581818 0.019199 3 2 2 NaN 7.0 90.0 2.0 63.0 0.658333 0 1 0 0 0 0 0 0
4 0 0.545455 0.010427 2 1 1 NaN 3.0 31.0 NaN NaN NaN 0 0 0 0 1 0 0 0
# Preview the first rows of the processed test table.
test_data.head()
楼层 总楼层 房屋面积 卧室数量 厅的数量 卫的数量 出租方式 位置 地铁线路 地铁站点 距离 西 东南 东北 西南 西北
0 1 0.600000 0.007117 2 1 1 1.0 10.0 5.0 NaN NaN NaN 1 0 0 0 0 0 0 0
1 1 0.472727 0.007448 2 1 1 NaN 3.0 0.0 NaN NaN NaN 1 0 0 0 0 0 0 0
2 2 0.709091 0.014068 3 2 2 NaN 10.0 9.0 4.0 74.0 0.400833 0 0 0 0 0 1 0 0
3 0 0.090909 0.008937 2 1 1 NaN 6.0 96.0 5.0 17.0 0.384167 0 1 0 0 0 0 0 0
4 1 0.218182 0.008606 2 1 1 NaN 6.0 61.0 3.0 114.0 0.598333 0 0 0 0 0 1 0 0
# Replace the remaining missing values with 0 in both tables.
train_data=train_data.fillna(0)
# Build the test matrix from test_data itself; it was previously assigned from
# train_data, so the "test" predictions were made on the training rows.
test_data=test_data.fillna(0)
# Convert both tables to plain NumPy arrays.
train_data=train_data.values
test_data=test_data.values
# Re-read the target column from disk, since '月租金' is no longer in train_data.
train_result=pd.read_csv('train.csv')['月租金'].values

用 XGBoost 模型进行训练

# XGBoost: hold out 10% of the training rows as a validation set.
# sklearn.cross_validation is deprecated in favour of sklearn.model_selection,
# which provides the same train_test_split function.
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(train_data, train_result, test_size=0.1,
                                                    random_state=2333)

xgb_val = xgb.DMatrix(X_test, label=y_test)
xgb_train = xgb.DMatrix(X_train, label=y_train)
xgb_test = xgb.DMatrix(test_data)
# Booster parameters.
# 'n_estimators' is intentionally not listed: xgb.train takes the number of
# boosting rounds from num_rounds below, so that key had no effect here.
params = {
    'booster': 'gbtree',
    'objective': 'reg:linear',  # regression objective (this is not a classification task)
    'gamma': 0.2,  # controls post-pruning; larger is more conservative, typically around 0.1-0.2
    'max_depth': 10,  # depth of each tree; deeper trees overfit more easily
    "reg_alpha": 3,
    'lambda': 5,  # L2 regularisation weight; larger values make overfitting less likely
    'subsample': 0.9,  # row subsampling of the training samples
    'colsample_bytree': 0.6,  # column subsampling when building each tree
    'colsample_bylevel': 0.7,
    'min_child_weight': 7,
    # Default is 1: the minimum sum of h (second-order gradient) required in a leaf.
    # For unbalanced 0-1 classification, if h is around 0.01 then a value of 1
    # means a leaf must hold at least about 100 samples.
    # This parameter strongly affects the result; the smaller it is, the easier it is to overfit.
    'silent': 1,  # 1 suppresses run-time messages (the original author recommends 0)
    'eta': 0.05,  # acts like a learning rate (0.007 is noted as an alternative value)
    'seed': 2017,

}

plst = list(params.items())
num_rounds = 10000  # maximum number of boosting rounds
# Both sets are evaluated each round; the last one ('val') drives early stopping.
watchlist = [(xgb_train, 'train'), (xgb_val, 'val')]

# Train the model and save it.
# With a large round budget, early_stopping_rounds stops training once the
# validation metric has not improved for that many consecutive rounds.
model = xgb.train(plst, xgb_train, num_rounds, watchlist, early_stopping_rounds=300, verbose_eval=50)
model.save_model('xgb.model')  # persist the trained model
print("模型训练完成")

print("训练完毕,开始预测")
# Predict with the trees up to the best iteration found by early stopping.
test_result = model.predict(xgb_test, ntree_limit=model.best_ntree_limit)

# Write the predictions to a CSV file.
data_df = pd.DataFrame(test_result)
filename = 'result_xgb.csv'
data_df.to_csv(filename, encoding='utf-8')
[0]	train-rmse:9.3703	val-rmse:9.49998
Multiple eval metrics have been passed: 'val-rmse' will be used for early stopping.

Will train until val-rmse hasn't improved in 300 rounds.
[50]	train-rmse:2.77386	val-rmse:2.95125
[100]	train-rmse:2.22017	val-rmse:2.42123
[150]	train-rmse:2.05228	val-rmse:2.26014
[200]	train-rmse:1.95506	val-rmse:2.16923
[250]	train-rmse:1.87808	val-rmse:2.09895
[300]	train-rmse:1.81004	val-rmse:2.03828
[350]	train-rmse:1.75105	val-rmse:1.98529
[400]	train-rmse:1.69493	val-rmse:1.93967
[450]	train-rmse:1.64521	val-rmse:1.90085
[500]	train-rmse:1.59184	val-rmse:1.85845
[550]	train-rmse:1.54979	val-rmse:1.82564
[600]	train-rmse:1.51416	val-rmse:1.79824
[650]	train-rmse:1.476	val-rmse:1.76895
[700]	train-rmse:1.44457	val-rmse:1.74357
[750]	train-rmse:1.41372	val-rmse:1.72097
[800]	train-rmse:1.3867	val-rmse:1.69965
[850]	train-rmse:1.36016	val-rmse:1.67913
[900]	train-rmse:1.3367	val-rmse:1.6619
[950]	train-rmse:1.31308	val-rmse:1.64355
[1000]	train-rmse:1.2914	val-rmse:1.62736
[1050]	train-rmse:1.27203	val-rmse:1.61333
[1100]	train-rmse:1.2507	val-rmse:1.59796
[1150]	train-rmse:1.23164	val-rmse:1.58435
[1200]	train-rmse:1.21557	val-rmse:1.57225
[1250]	train-rmse:1.20111	val-rmse:1.56099
[1300]	train-rmse:1.18301	val-rmse:1.54762
[1350]	train-rmse:1.1699	val-rmse:1.53814
[1400]	train-rmse:1.15637	val-rmse:1.52793
[1450]	train-rmse:1.14323	val-rmse:1.51842
[1500]	train-rmse:1.13196	val-rmse:1.51052
[1550]	train-rmse:1.11936	val-rmse:1.50133
[1600]	train-rmse:1.10928	val-rmse:1.49404
[1650]	train-rmse:1.09593	val-rmse:1.48425
[1700]	train-rmse:1.08563	val-rmse:1.4771
[1750]	train-rmse:1.07649	val-rmse:1.47056
[1800]	train-rmse:1.06619	val-rmse:1.46278
[1850]	train-rmse:1.05751	val-rmse:1.45715
[1900]	train-rmse:1.04786	val-rmse:1.45038
[1950]	train-rmse:1.03764	val-rmse:1.44344
[2000]	train-rmse:1.02981	val-rmse:1.43775
[2050]	train-rmse:1.02199	val-rmse:1.43245
[2100]	train-rmse:1.01467	val-rmse:1.42699
[2150]	train-rmse:1.00615	val-rmse:1.42124
[2200]	train-rmse:0.997202	val-rmse:1.41497
[2250]	train-rmse:0.99015	val-rmse:1.41046
[2300]	train-rmse:0.98291	val-rmse:1.40615
[2350]	train-rmse:0.976352	val-rmse:1.40207
[2400]	train-rmse:0.969466	val-rmse:1.39734
[2450]	train-rmse:0.964016	val-rmse:1.39344
[2500]	train-rmse:0.958046	val-rmse:1.38977
[2550]	train-rmse:0.952188	val-rmse:1.38579
[2600]	train-rmse:0.945662	val-rmse:1.38188
[2650]	train-rmse:0.939815	val-rmse:1.37778
[2700]	train-rmse:0.934951	val-rmse:1.37506
[2750]	train-rmse:0.92956	val-rmse:1.37186
[2800]	train-rmse:0.925032	val-rmse:1.36926
[2850]	train-rmse:0.920306	val-rmse:1.36628
[2900]	train-rmse:0.915933	val-rmse:1.36362
[2950]	train-rmse:0.912063	val-rmse:1.36091
[3000]	train-rmse:0.907578	val-rmse:1.3581
[3050]	train-rmse:0.902738	val-rmse:1.35514
[3100]	train-rmse:0.899267	val-rmse:1.35305
[3150]	train-rmse:0.895539	val-rmse:1.35101
[3200]	train-rmse:0.892583	val-rmse:1.34929
[3250]	train-rmse:0.888493	val-rmse:1.3464
[3300]	train-rmse:0.885463	val-rmse:1.34454
[3350]	train-rmse:0.882028	val-rmse:1.34274
[3400]	train-rmse:0.87818	val-rmse:1.34027
[3450]	train-rmse:0.875323	val-rmse:1.33839
[3500]	train-rmse:0.871904	val-rmse:1.33632
[3550]	train-rmse:0.868129	val-rmse:1.33422
[3600]	train-rmse:0.865097	val-rmse:1.33224
[3650]	train-rmse:0.862143	val-rmse:1.33029
[3700]	train-rmse:0.8593	val-rmse:1.32906
[3750]	train-rmse:0.85648	val-rmse:1.32757
[3800]	train-rmse:0.854131	val-rmse:1.3263
[3850]	train-rmse:0.851365	val-rmse:1.32452
[3900]	train-rmse:0.848457	val-rmse:1.32238
[3950]	train-rmse:0.845943	val-rmse:1.32112
[4000]	train-rmse:0.843617	val-rmse:1.31984
[4050]	train-rmse:0.84074	val-rmse:1.3184
[4100]	train-rmse:0.838503	val-rmse:1.31692
[4150]	train-rmse:0.836258	val-rmse:1.31524
[4200]	train-rmse:0.83361	val-rmse:1.31377
[4250]	train-rmse:0.831158	val-rmse:1.31241
[4300]	train-rmse:0.828668	val-rmse:1.31086
[4350]	train-rmse:0.82635	val-rmse:1.30916
[4400]	train-rmse:0.824288	val-rmse:1.30784
[4450]	train-rmse:0.822547	val-rmse:1.30679
[4500]	train-rmse:0.820341	val-rmse:1.3061
[4550]	train-rmse:0.818466	val-rmse:1.30479
[4600]	train-rmse:0.816483	val-rmse:1.30372
[4650]	train-rmse:0.814547	val-rmse:1.30289
[4700]	train-rmse:0.812584	val-rmse:1.30213
[4750]	train-rmse:0.810988	val-rmse:1.30125
[4800]	train-rmse:0.809294	val-rmse:1.30042
[4850]	train-rmse:0.807515	val-rmse:1.29971
[4900]	train-rmse:0.805787	val-rmse:1.29873
[4950]	train-rmse:0.804336	val-rmse:1.29787
[5000]	train-rmse:0.802841	val-rmse:1.29688
[5050]	train-rmse:0.801239	val-rmse:1.29602
[5100]	train-rmse:0.799495	val-rmse:1.29525
[5150]	train-rmse:0.797696	val-rmse:1.29396
[5200]	train-rmse:0.796141	val-rmse:1.29346
[5250]	train-rmse:0.794911	val-rmse:1.29268
[5300]	train-rmse:0.793145	val-rmse:1.29126
[5350]	train-rmse:0.791934	val-rmse:1.29083
[5400]	train-rmse:0.790233	val-rmse:1.28989
[5450]	train-rmse:0.788694	val-rmse:1.28888
[5500]	train-rmse:0.786967	val-rmse:1.28807
[5550]	train-rmse:0.785497	val-rmse:1.28732
[5600]	train-rmse:0.784078	val-rmse:1.28612
[5650]	train-rmse:0.782551	val-rmse:1.28536
[5700]	train-rmse:0.781073	val-rmse:1.28481
[5750]	train-rmse:0.779708	val-rmse:1.28382
[5800]	train-rmse:0.778309	val-rmse:1.28302
[5850]	train-rmse:0.776918	val-rmse:1.28268
[5900]	train-rmse:0.775676	val-rmse:1.28189
[5950]	train-rmse:0.774513	val-rmse:1.28112
[6000]	train-rmse:0.773367	val-rmse:1.28045
[6050]	train-rmse:0.772218	val-rmse:1.27994
[6100]	train-rmse:0.771067	val-rmse:1.27933
[6150]	train-rmse:0.7696	val-rmse:1.27846
[6200]	train-rmse:0.768351	val-rmse:1.27754
[6250]	train-rmse:0.767365	val-rmse:1.27706
[6300]	train-rmse:0.766297	val-rmse:1.27632
[6350]	train-rmse:0.765223	val-rmse:1.27568
[6400]	train-rmse:0.764237	val-rmse:1.27524
[6450]	train-rmse:0.762684	val-rmse:1.2743
[6500]	train-rmse:0.761282	val-rmse:1.27319
[6550]	train-rmse:0.760169	val-rmse:1.27264
[6600]	train-rmse:0.759052	val-rmse:1.27181
[6650]	train-rmse:0.75798	val-rmse:1.2713
[6700]	train-rmse:0.756919	val-rmse:1.27098
[6750]	train-rmse:0.756039	val-rmse:1.27034



---------------------------------------------------------------------------

KeyboardInterrupt                         Traceback (most recent call last)

 in ()
     34 # 训练模型并保存
     35 # early_stopping_rounds 当设置的迭代次数较大时,early_stopping_rounds 可在一定的迭代次数内准确率没有提升就停止训练
---> 36 model = xgb.train(plst, xgb_train, num_rounds, watchlist, early_stopping_rounds=300, verbose_eval=50, )
     37 model.save_model('xgb.model')  # 用于存储训练出的模型
     38 print("模型训练完成")


c:\program files\python36\lib\site-packages\xgboost\training.py in train(params, dtrain, num_boost_round, evals, obj, feval, maximize, early_stopping_rounds, evals_result, verbose_eval, xgb_model, callbacks, learning_rates)
    202                            evals=evals,
    203                            obj=obj, feval=feval,
--> 204                            xgb_model=xgb_model, callbacks=callbacks)
    205 
    206 


c:\program files\python36\lib\site-packages\xgboost\training.py in _train_internal(params, dtrain, num_boost_round, evals, obj, feval, xgb_model, callbacks)
     72         # Skip the first update if it is a recovery step.
     73         if version % 2 == 0:
---> 74             bst.update(dtrain, i, obj)
     75             bst.save_rabit_checkpoint()
     76             version += 1


c:\program files\python36\lib\site-packages\xgboost\core.py in update(self, dtrain, iteration, fobj)
   1019         if fobj is None:
   1020             _check_call(_LIB.XGBoosterUpdateOneIter(self.handle, ctypes.c_int(iteration),
-> 1021                                                     dtrain.handle))
   1022         else:
   1023             pred = self.predict(dtrain)


KeyboardInterrupt: 
  • 2月12日 补充
    在房屋朝向方面,可以运用更简单的方法来进行处理:
    数据分析之房价预测(机器学习,sklearn)_第1张图片
    数据分析之房价预测(机器学习,sklearn)_第2张图片

你可能感兴趣的:(数据分析,机器学习)