有监督模型
采用决策树回归模型(DecisionTreeRegressor)
import pandas as pd
# Location of the Melbourne housing CSV on the local disk.
melbourne_file_path = r"G:\kaggle\melb_data.csv"
# Read the whole file into a DataFrame.
melbourne_data = pd.read_csv(filepath_or_buffer=melbourne_file_path)
# Show the column labels of the loaded data.
melbourne_data.columns
Index([u'Suburb', u'Address', u'Rooms', u'Type', u'Price', u'Method',
u'SellerG', u'Date', u'Distance', u'Postcode', u'Bedroom2', u'Bathroom',
u'Car', u'Landsize', u'BuildingArea', u'YearBuilt', u'CouncilArea',
u'Lattitude', u'Longtitude', u'Regionname', u'Propertycount'],
dtype='object')
# Preview the first rows of the raw data (head() shows five rows by default).
melbourne_data.head()
Suburb | Address | Rooms | Type | Price | Method | SellerG | Date | Distance | Postcode | ... | Bathroom | Car | Landsize | BuildingArea | YearBuilt | CouncilArea | Lattitude | Longtitude | Regionname | Propertycount | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | Abbotsford | 85 Turner St | 2 | h | 1480000.0 | S | Biggin | 3/12/2016 | 2.5 | 3067.0 | ... | 1.0 | 1.0 | 202.0 | NaN | NaN | Yarra | -37.7996 | 144.9984 | Northern Metropolitan | 4019.0 |
1 | Abbotsford | 25 Bloomburg St | 2 | h | 1035000.0 | S | Biggin | 4/02/2016 | 2.5 | 3067.0 | ... | 1.0 | 0.0 | 156.0 | 79.0 | 1900.0 | Yarra | -37.8079 | 144.9934 | Northern Metropolitan | 4019.0 |
2 | Abbotsford | 5 Charles St | 3 | h | 1465000.0 | SP | Biggin | 4/03/2017 | 2.5 | 3067.0 | ... | 2.0 | 0.0 | 134.0 | 150.0 | 1900.0 | Yarra | -37.8093 | 144.9944 | Northern Metropolitan | 4019.0 |
3 | Abbotsford | 40 Federation La | 3 | h | 850000.0 | PI | Biggin | 4/03/2017 | 2.5 | 3067.0 | ... | 2.0 | 1.0 | 94.0 | NaN | NaN | Yarra | -37.7969 | 144.9969 | Northern Metropolitan | 4019.0 |
4 | Abbotsford | 55a Park St | 4 | h | 1600000.0 | VB | Nelson | 4/06/2016 | 2.5 | 3067.0 | ... | 1.0 | 2.0 | 120.0 | 142.0 | 2014.0 | Yarra | -37.8072 | 144.9941 | Northern Metropolitan | 4019.0 |
5 rows × 21 columns
# Number of records (rows) in the data set.
melbourne_data.shape[0]
13580
melbourne_data.describe()
# The count row reveals many missing values, e.g. BuildingArea has
# 13580 - 7130 = 6450 NaN entries.
Rooms | Price | Distance | Postcode | Bedroom2 | Bathroom | Car | Landsize | BuildingArea | YearBuilt | Lattitude | Longtitude | Propertycount | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|
count | 13580.000000 | 1.358000e+04 | 13580.000000 | 13580.000000 | 13580.000000 | 13580.000000 | 13518.000000 | 13580.000000 | 7130.000000 | 8205.000000 | 13580.000000 | 13580.000000 | 13580.000000 |
mean | 2.937997 | 1.075684e+06 | 10.137776 | 3105.301915 | 2.914728 | 1.534242 | 1.610075 | 558.416127 | 151.967650 | 1964.684217 | -37.809203 | 144.995216 | 7454.417378 |
std | 0.955748 | 6.393107e+05 | 5.868725 | 90.676964 | 0.965921 | 0.691712 | 0.962634 | 3990.669241 | 541.014538 | 37.273762 | 0.079260 | 0.103916 | 4378.581772 |
min | 1.000000 | 8.500000e+04 | 0.000000 | 3000.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 1196.000000 | -38.182550 | 144.431810 | 249.000000 |
25% | 2.000000 | 6.500000e+05 | 6.100000 | 3044.000000 | 2.000000 | 1.000000 | 1.000000 | 177.000000 | 93.000000 | 1940.000000 | -37.856822 | 144.929600 | 4380.000000 |
50% | 3.000000 | 9.030000e+05 | 9.200000 | 3084.000000 | 3.000000 | 1.000000 | 2.000000 | 440.000000 | 126.000000 | 1970.000000 | -37.802355 | 145.000100 | 6555.000000 |
75% | 3.000000 | 1.330000e+06 | 13.000000 | 3148.000000 | 3.000000 | 2.000000 | 2.000000 | 651.000000 | 174.000000 | 1999.000000 | -37.756400 | 145.058305 | 10331.000000 |
max | 10.000000 | 9.000000e+06 | 48.100000 | 3977.000000 | 20.000000 | 8.000000 | 10.000000 | 433014.000000 | 44515.000000 | 2018.000000 | -37.408530 | 145.526350 | 21650.000000 |
缺失值比例:BuildingArea 约 48%,YearBuilt 约 40%,CouncilArea 约 10%。
# Handle missing values by deleting every record (row) that contains a NaN.
# axis=0 removes rows and how='any' removes a row as soon as one value is
# missing (both are the defaults); how='all' would remove only rows whose
# values are all NaN.
melbourne_data = melbourne_data.dropna(axis=0, how='any')
melbourne_data.head(n=5)
# The records with index 0, 3, 5, ... contained NaN values and have been removed.
Suburb | Address | Rooms | Type | Price | Method | SellerG | Date | Distance | Postcode | ... | Bathroom | Car | Landsize | BuildingArea | YearBuilt | CouncilArea | Lattitude | Longtitude | Regionname | Propertycount | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
1 | Abbotsford | 25 Bloomburg St | 2 | h | 1035000.0 | S | Biggin | 4/02/2016 | 2.5 | 3067.0 | ... | 1.0 | 0.0 | 156.0 | 79.0 | 1900.0 | Yarra | -37.8079 | 144.9934 | Northern Metropolitan | 4019.0 |
2 | Abbotsford | 5 Charles St | 3 | h | 1465000.0 | SP | Biggin | 4/03/2017 | 2.5 | 3067.0 | ... | 2.0 | 0.0 | 134.0 | 150.0 | 1900.0 | Yarra | -37.8093 | 144.9944 | Northern Metropolitan | 4019.0 |
4 | Abbotsford | 55a Park St | 4 | h | 1600000.0 | VB | Nelson | 4/06/2016 | 2.5 | 3067.0 | ... | 1.0 | 2.0 | 120.0 | 142.0 | 2014.0 | Yarra | -37.8072 | 144.9941 | Northern Metropolitan | 4019.0 |
6 | Abbotsford | 124 Yarra St | 3 | h | 1876000.0 | S | Nelson | 7/05/2016 | 2.5 | 3067.0 | ... | 2.0 | 0.0 | 245.0 | 210.0 | 1910.0 | Yarra | -37.8024 | 144.9993 | Northern Metropolitan | 4019.0 |
7 | Abbotsford | 98 Charles St | 2 | h | 1636000.0 | S | Nelson | 8/10/2016 | 2.5 | 3067.0 | ... | 1.0 | 2.0 | 256.0 | 107.0 | 1890.0 | Yarra | -37.8060 | 144.9954 | Northern Metropolitan | 4019.0 |
5 rows × 21 columns
# Summary statistics again, now on the data left after dropping rows with NaN;
# every numeric column should report the same count.
melbourne_data.describe()
Rooms | Price | Distance | Postcode | Bedroom2 | Bathroom | Car | Landsize | BuildingArea | YearBuilt | Lattitude | Longtitude | Propertycount | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|
count | 6196.000000 | 6.196000e+03 | 6196.000000 | 6196.000000 | 6196.000000 | 6196.000000 | 6196.000000 | 6196.000000 | 6196.000000 | 6196.000000 | 6196.000000 | 6196.000000 | 6196.000000 |
mean | 2.931407 | 1.068828e+06 | 9.751097 | 3101.947708 | 2.902034 | 1.576340 | 1.573596 | 471.006940 | 141.568645 | 1964.081988 | -37.807904 | 144.990201 | 7435.489509 |
std | 0.971079 | 6.751564e+05 | 5.612065 | 86.421604 | 0.970055 | 0.711362 | 0.929947 | 897.449881 | 90.834824 | 38.105673 | 0.075850 | 0.099165 | 4337.698917 |
min | 1.000000 | 1.310000e+05 | 0.000000 | 3000.000000 | 0.000000 | 1.000000 | 0.000000 | 0.000000 | 0.000000 | 1196.000000 | -38.164920 | 144.542370 | 389.000000 |
25% | 2.000000 | 6.200000e+05 | 5.900000 | 3044.000000 | 2.000000 | 1.000000 | 1.000000 | 152.000000 | 91.000000 | 1940.000000 | -37.855438 | 144.926198 | 4383.750000 |
50% | 3.000000 | 8.800000e+05 | 9.000000 | 3081.000000 | 3.000000 | 1.000000 | 1.000000 | 373.000000 | 124.000000 | 1970.000000 | -37.802250 | 144.995800 | 6567.000000 |
75% | 4.000000 | 1.325000e+06 | 12.400000 | 3147.000000 | 3.000000 | 2.000000 | 2.000000 | 628.000000 | 170.000000 | 2000.000000 | -37.758200 | 145.052700 | 10175.000000 |
max | 8.000000 | 9.000000e+06 | 47.400000 | 3977.000000 | 9.000000 | 8.000000 | 10.000000 | 37000.000000 | 3112.000000 | 2018.000000 | -37.457090 | 145.526350 | 21650.000000 |
以房价为预测目标
# The Price column holds the true values (the prediction target), called y.
y = melbourne_data['Price']
作为模型的输入的那些columns叫做“特征”
哪些列影响着房价呢?
有时候,把除了target那一列外的所有columns作为特征
有时候,可能选更少的一些比较好
# List the available columns to choose the model's features from.
melbourne_data.columns
Index([u'Suburb', u'Address', u'Rooms', u'Type', u'Price', u'Method',
u'SellerG', u'Date', u'Distance', u'Postcode', u'Bedroom2', u'Bathroom',
u'Car', u'Landsize', u'BuildingArea', u'YearBuilt', u'CouncilArea',
u'Lattitude', u'Longtitude', u'Regionname', u'Propertycount'],
dtype='object')
# Example: use these columns as the model's input features.  'Lattitude' and
# 'Longtitude' are spelled exactly as the column labels appear in the data.
melbourne_features = [
    'Rooms',
    'Bathroom',
    'Landsize',
    'Lattitude',
    'Longtitude',
]
# By convention, the feature data is called X.
X = melbourne_data[melbourne_features]
print(X.head())  # X: 6196 rows x 5 columns after dropna
print(y.head())  # y: 6196 values
Rooms Bathroom Landsize Lattitude Longtitude
1 2 1.0 156.0 -37.8079 144.9934
2 3 2.0 134.0 -37.8093 144.9944
4 4 1.0 120.0 -37.8072 144.9941
6 3 2.0 245.0 -37.8024 144.9993
7 2 1.0 256.0 -37.8060 144.9954
1 1035000.0
2 1465000.0
4 1600000.0
6 1876000.0
7 1636000.0
Name: Price, dtype: float64
# Summary statistics of the selected feature columns only.
X.describe()
Rooms | Bathroom | Landsize | Lattitude | Longtitude | |
---|---|---|---|---|---|
count | 6196.000000 | 6196.000000 | 6196.000000 | 6196.000000 | 6196.000000 |
mean | 2.931407 | 1.576340 | 471.006940 | -37.807904 | 144.990201 |
std | 0.971079 | 0.711362 | 897.449881 | 0.075850 | 0.099165 |
min | 1.000000 | 1.000000 | 0.000000 | -38.164920 | 144.542370 |
25% | 2.000000 | 1.000000 | 152.000000 | -37.855438 | 144.926198 |
50% | 3.000000 | 1.000000 | 373.000000 | -37.802250 | 144.995800 |
75% | 4.000000 | 2.000000 | 628.000000 | -37.758200 | 145.052700 |
max | 8.000000 | 8.000000 | 37000.000000 | -37.457090 | 145.526350 |
使用机器学习库scikit-learn, 简称sklearn
**Steps:** 定义模型(Define)→ 训练模型(Fit)→ 预测(Predict)
# Decision-tree regression with scikit-learn's DecisionTreeRegressor class.
# The same estimator can also be reached through the package namespace:
#     from sklearn import tree
#     clf = tree.DecisionTreeRegressor(random_state=1)
from sklearn.tree import DecisionTreeRegressor

# Define the model; random_state=1 makes every run produce the same result.
melbourne_model = DecisionTreeRegressor(random_state=1)

# Fit the model on the feature data X and the target y.
melbourne_model.fit(X, y)

# Use the model to predict the price of one hypothetical house.  The values
# follow the column order of X.  Wrapping them in a one-row DataFrame that
# carries X's column labels keeps the feature names the model was fitted with;
# a bare nested list drops them, which newer scikit-learn releases warn about.
new_house = pd.DataFrame([[3, 1.0, 122.0, -37.8072, 144.9941]], columns=X.columns)
melbourne_model.predict(new_house)
array([1200000.])
# Predict on the first five rows of X.  These rows were part of the data the
# model was fitted on, so this is an in-sample check; the output matching
# y.head() is presumably the tree reproducing its training targets, not
# evidence of accuracy on unseen houses.
melbourne_model.predict(X.head())
array([1035000., 1465000., 1600000., 1876000., 1636000.])