提高成单量
促成售卖,促成租赁
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings('ignore')
# Load the listing table and the per-community description table, then attach
# the community attributes to every listing. A left join keeps listings whose
# community has no row in the description table.
house = pd.read_csv('./二手房数据/house.csv')
community = pd.read_csv('./二手房数据/community_describe.csv')
data = pd.merge(house, community, on='community', how='left')
使用正则提取数字数据
# --- Split the free-text `years` column into separate features -------------
# Raw strings are used for every pattern so that `\d`, `\w`, `\(` reach the
# regex engine unchanged instead of being treated as (invalid) string escapes.
# expand=False makes a single-group extract return a Series.
years = data['years']
# Current floor label: the word characters right before the first "(".
data['当前层'] = years.str.extract(r'(\w*?)\(', expand=False)
# Total number of floors: digits between "共" and "层".
data['总楼层'] = years.str.extract(r'共(\d+)层', expand=False)
# Build year: digits between ")" and "年".
data['建成年份'] = years.str.extract(r'\)(\d+)年', expand=False)
# Building structure: the word characters following "建".
data['建筑结构'] = years.str.extract(r'建(\w+)', expand=False)
data = data.drop(columns=['years', 'floor'])

# --- Split `housetype` into room counts -------------------------------------
# Drop rows whose housetype contains "别" or "车"; copy so the column
# assignments below operate on an independent frame rather than a slice.
data = data[~data['housetype'].str.contains('别|车')].copy()
housetype = data['housetype']
# Bedrooms: digits followed by either "室" or "房". The alternation is grouped
# so the digits are captured in both spellings (the ungrouped `(\d+)室|房`
# matched a bare "房" with an empty capture, yielding NaN).
data['卧室'] = housetype.str.extract(r'(\d+)(?:室|房)', expand=False)
# Living rooms: digits before "厅".
data['客厅'] = housetype.str.extract(r'(\d+)厅', expand=False)
# Bathrooms: digits before "卫".
data['卫生间'] = housetype.str.extract(r'(\d+)卫', expand=False)
data = data.drop(columns=['housetype'])

# --- Remaining text columns --------------------------------------------------
# Strip the literal unit suffix "平米" from the area.
data['square'] = data['square'].str.replace('平米', '', regex=False)
# Subway distance: digits following "站" in `taxtype`.
data['地铁距离'] = data['taxtype'].str.extract(r'站(\d+)', expand=False)
# Ownership-certificate type: the word characters between "满" and "年".
data['房本类型'] = data['taxtype'].str.extract(r'满(\w+)年', expand=False)
data = data.drop(columns=['taxtype'])
# Subway station: the word characters following "线" in `tagList`.
data['地铁站'] = data['tagList'].str.extract(r'线(\w+)', expand=False)
data = data.drop(columns=['tagList'])

# Drop the columns that are not used as features.
data = data.drop(['index_x', 'title', 'totalPrice', 'followInfo', 'index_y', 'id', 'onsale'], axis=1)

# The extracted values are still strings; convert the numeric ones to float
# (float rather than int so that missing values stay representable as NaN).
for numeric_col in ['square', '总楼层', '建成年份', '卧室', '客厅', '卫生间', '地铁距离']:
    data[numeric_col] = data[numeric_col].astype('float64')
使用掩码调整数据
# Keep only listings inside the accepted numeric ranges. Comparisons against
# NaN evaluate to False, so rows missing any of these values are dropped too.
in_range = (
    (data['unitPrice'] >= 30000)
    & (data['总楼层'] < 40)
    & (data['卧室'] < 5)
    & (data['客厅'] <= 2)
)
data = data[in_range]
# The bathroom count is not used as a feature.
del data['卫生间']
# Exclude basement units ('地下室') and single-storey houses ('平房').
not_basement = data['当前层'] != '地下室'
not_bungalow = data['建筑结构'] != '平房'
data = data[not_basement & not_bungalow]
data.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 15255 entries, 0 to 16115
Data columns (total 14 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 community 15255 non-null object
1 square 15255 non-null float64
2 unitPrice 15255 non-null int64
3 district 14188 non-null object
4 bizcircle 14188 non-null object
5 当前层 15255 non-null object
6 总楼层 15255 non-null float64
7 建成年份 15221 non-null float64
8 建筑结构 15145 non-null object
9 卧室 15255 non-null float64
10 客厅 15255 non-null float64
11 地铁距离 10117 non-null float64
12 房本类型 12821 non-null object
13 地铁站 10655 non-null object
dtypes: float64(6), int64(1), object(7)
memory usage: 1.7+ MB
# Drop rows with no district. Take an explicit copy so the fills below modify
# this frame directly; the previous `data.col[mask] = value` form is chained
# assignment, which pandas does not guarantee to write through (and which is
# a no-op under Copy-on-Write).
data = data[data['district'].notna()].copy()
# Missing subway distance: fill with the largest observed distance.
data['地铁距离'] = data['地铁距离'].fillna(data['地铁距离'].max())
# Missing certificate type: treated as "held for less than 2 years", coded 1.
data['房本类型'] = data['房本类型'].fillna(1)
# Missing subway station: fill with the placeholder '无'.
data['地铁站'] = data['地铁站'].fillna('无')
# Missing build year: fill with the most frequent build year of the same
# community; rows that still cannot be filled are dropped.
def getyears(items):
    """Return the modal build year of community `items`, or NaN if none exists.

    Reads the module-level `data` frame. `Series.mode()` ignores NaN, so a
    community whose build years are all missing yields an empty result.
    """
    val = data.loc[data['community'] == items, '建成年份'].mode()
    # Positional access: take the first mode regardless of index labels.
    return val.iloc[0] if val.size > 0 else np.nan


# Assign through .loc so the write lands on `data` itself (chained
# `data.col[mask] = ...` is not guaranteed to write through).
missing_year = data['建成年份'].isna()
data.loc[missing_year, '建成年份'] = data.loc[missing_year, 'community'].apply(getyears)
# Rows whose fill failed are removed.
data = data.dropna(subset=['建成年份'])
# Missing building structure: fill with the most frequent structure of the
# same community; rows that still cannot be filled are dropped.
def getyears(items):
    """Return the modal building structure of community `items`, or NaN.

    The function keeps the name `getyears` used by the original code even
    though it looks up the building structure here. Reads the module-level
    `data` frame; `Series.mode()` ignores NaN, so a community with no known
    structure yields an empty result.
    """
    val = data.loc[data['community'] == items, '建筑结构'].mode()
    # Positional access: take the first mode regardless of index labels.
    return val.iloc[0] if val.size > 0 else np.nan


# Assign through .loc so the write lands on `data` itself (chained
# `data.col[mask] = ...` is not guaranteed to write through).
missing_structure = data['建筑结构'].isna()
data.loc[missing_structure, '建筑结构'] = data.loc[missing_structure, 'community'].apply(getyears)
# Rows whose fill failed are removed.
data = data.dropna(subset=['建筑结构'])
# district: one-hot encode, append the indicator columns, drop the original.
dis_onehot = pd.get_dummies(data['district'])
data = pd.concat([data, dis_onehot], axis=1).drop(columns='district')

# Current floor: map each label to an integer code.
dic_floor = {
    '底层': 1,
    '低楼层': 2,
    '中楼层': 3,
    '顶层': 4,
    '高楼层': 5,
}
data['当前层'] = data['当前层'].map(dic_floor)

# Building structure: map each label to an integer code.
dic_build = {
    '板塔结合': 1,
    '塔楼': 2,
    '板楼': 3,
}
data['建筑结构'] = data['建筑结构'].map(dic_build)

# Certificate type: '五' -> 5, '两' -> 2; the integer 1 used to fill missing
# values maps to itself.
dic_housebook = {
    '五': 5,
    '两': 2,
    1: 1,
}
data['房本类型'] = data['房本类型'].map(dic_housebook)

# bizcircle: replace each business circle by its number of listings.
biz_counts = data['bizcircle'].value_counts()
dic_biz = dict(biz_counts)
data['bizcircle'] = data['bizcircle'].map(dic_biz)

# Subway station: replace each station by the mean unit price of its listings.
# NOTE(review): this mean is computed over all rows, before the train/test
# split that follows, so test-set prices contribute to the feature.
station_mean_price = data.groupby('地铁站')['unitPrice'].mean()
dic_sub = dict(station_mean_price)
data['地铁站'] = data['地铁站'].map(dic_sub)

# community: replace each community by the mean unit price of its listings
# (same caveat as for the subway station encoding).
community_mean_price = data.groupby('community')['unitPrice'].mean()
dic_com = dict(community_mean_price)
data['community'] = data['community'].map(dic_com)

# Move the target to the last column under the name 'y'.
data['y'] = data.pop('unitPrice')
import sklearn.model_selection as ms #模型选择
# Features are every column except the last; the target is the last column.
x, y = data.iloc[:, :-1], data.iloc[:, -1]
# Hold out 10% of the rows for testing; the fixed seed makes the split
# reproducible.
train_x, test_x, train_y, test_y = ms.train_test_split(
    x,
    y,
    test_size=0.1,
    random_state=7,
)
# Regression
def select_model(name, model):
    """Skeleton for the per-model train/evaluate routine; currently a no-op.

    Args:
        name: Display name of the model.
        model: The estimator object to train and evaluate.

    Returns:
        None.
    """
    pass


# Registry of models to evaluate, filled in as {'model name': estimator}.
# It starts empty: the original placeholder value was an undefined name and
# raised NameError as soon as the cell ran.
dic_model = {}
for name, obj in dic_model.items():
    select_model(name, obj)
例子:
# The aliases sm / st / se were used without ever being imported; bring them
# into scope here, next to their only users.
import sklearn.ensemble as se
import sklearn.metrics as sm
import sklearn.tree as st


def select_model(name, model):
    """Fit `model` on the training split and print regression scores.

    The target `y` is a continuous unit price, so regression metrics are
    reported (R^2 and mean absolute error) instead of a classification
    report.

    Args:
        name: Display name printed in the section header.
        model: An estimator exposing `fit(X, y)` and `predict(X)`.

    Reads the module-level `train_x`, `train_y`, `test_x`, `test_y`.
    """
    print('--------', name, '----------')
    model.fit(train_x, train_y)
    pred_test_y = model.predict(test_x)
    print('R2:', sm.r2_score(test_y, pred_test_y))
    print('MAE:', sm.mean_absolute_error(test_y, pred_test_y))


# Regressor counterparts of the tree-based models, keyed by display name.
model_dict = {
    '单颗决策树': st.DecisionTreeRegressor(),
    'Adaboost': se.AdaBoostRegressor(st.DecisionTreeRegressor(),
                                     n_estimators=100),
    'GBDT': se.GradientBoostingRegressor(n_estimators=100),
    '随机森林': se.RandomForestRegressor(n_estimators=100),
}
for name, obj in model_dict.items():
    select_model(name, obj)