-- coding: utf-8 --
"""
Created on Thu Jan 9 11:36:00 2020
@author: QIAOQICHAO258
"""
import pandas as pd
import numpy as np
import time
from tqdm import tqdm
import codecs
import csv
import os
from math import radians, cos, sin, asin, sqrt
import math
from xgboost import XGBRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import minmax_scale
from sklearn.metrics import mean_absolute_error
cols_dict =
{'bazhuayu_case.case_type': '案例类型',
'bazhuayu_case.case_source': '案例来源',
'bazhuayu_case.district': '区域',
'bazhuayu_case.community_sources': '案例源小区名称',
'bazhuayu_case.flr_total_ind': '总楼层',
'bazhuayu_case.checked_time': '看房次数',
'bazhuayu_case.attention_time': '关注量',
'bazhuayu_case.browse_time': '浏览次数',
'bazhuayu_case.list_time': '挂牌时间',
'bazhuayu_case.transaction_time': '成交时间',
'bazhuayu_case.list_totalprice': '挂牌总价(万元)',
'bazhuayu_case.list_unitprice': '挂牌单价(元/㎡)',
'bazhuayu_case.transaction_price': '成交总价(万元)',
'bazhuayu_case.transaction_avg_price': '成交单价(元/㎡)',
'bazhuayu_case.community_price': '案例源小区均价(元/㎡)',
'bazhuayu_case.transaction_cycle': '成交周期',
'bazhuayu_case.price_adjustment_times': '调价次数',
'bazhuayu_case.longitude': '经度',
'bazhuayu_case.latitude': '纬度'}
pi = math.pi
EARTH_REDIUS = 6378.137
经度系数 1米所对应的经度
LONGITUDE_COEF = 0.000011
纬度系数 1米所对应的度度
LATITUDE_COEF = 0.000009
###################################### 读取数据 ###################################################
参数配置
set_parm = pd.read_excel('小区画像配置/小区画像配置.xlsx')
城市
city = set_parm['xh_city'][0]
city = '深圳市'
poi数据文件
path_poi = '高德POI/%s.csv' %city
with open(path_poi, 'r', encoding='utf-8')as f:
poi = pd.read_csv(f)
小区清单数据文件
path_community = '小区清单/八爪鱼小区清单20191230.csv'
community = pd.read_csv(path_community, engine='python',encoding='utf-8')
community = community[community['城市'] == city]
百强开发商物业
developer = pd.read_excel('百强物业开发商/百强开发商.xlsx')
property = pd.read_excel('百强物业开发商/百强物业.xlsx')
映射表
yingshe = pd.read_excel('映射表/%s.xlsx'%city)
八爪鱼挂牌案例
txt_name = os.listdir('城市八爪鱼数据/%s' % city)[0]
with open('城市八爪鱼数据/%s/%s' % (city, txt_name),'r',encoding='utf-8')as f:
case_data = pd.read_csv(f, sep='$', error_bad_lines=False,
usecols=cols_dict.keys(), low_memory=True)
法拍数据
fapai = pd.read_csv('法拍数据/北京重庆大连哈尔滨_京东阿里法拍.csv', engine='python', encoding='utf8')
####################################################################################################
def mape(estimatory, X, y):
y_ = estimatory.predict(X)
return (np.abs(y - y_)/y).mean()
def rad(d):
return d * pi / 180.0
测算两点经纬度之间的距离
def getDistance1(lat1, lng1, lat2, lng2):
radLat1 = rad(lat1)
radLat2 = rad(lat2)
a = radLat1 - radLat2
b = rad(lng1) - rad(lng2)
s = 2 * math.asin(math.sqrt(math.pow(math.sin(a/2), 2) + math.cos(radLat1) * math.cos(radLat2) * math.pow(math.sin(b/2), 2)))
s = s * EARTH_REDIUS
return s * 1000
def getDistance(lon1, lat1, lon2, lat2):
#计算距离
# 将十进制度数转化为弧度
lon1, lat1, lon2, lat2 = map(radians, [lon1, lat1, lon2, lat2])
# haversine公式
dlon = lon2 - lon1
dlat = lat2 - lat1
a = sin(dlat/2)**2 + cos(lat1) * cos(lat2) * sin(dlon/2)**2
c = 2 * asin(sqrt(a))
r = 6378.137 # 地球平均半径,单位为公里
return c * r * 1000
def select_facter_poi(temp_comm_longitude, temp_comm_latitude, poi_data, len_coef):
poi_data_spilt = poi_data[ (temp_comm_longitude + LONGITUDE_COEF * len_coef > poi_data['longitude'] ) &
(poi_data['longitude'] > temp_comm_longitude - LONGITUDE_COEF * len_coef) &
(temp_comm_latitude + LONGITUDE_COEF * len_coef > poi_data['latitude'] ) &
(poi_data['latitude'] > temp_comm_latitude - LONGITUDE_COEF * len_coef) ]
return poi_data_spilt
体育馆最近距离
def get_turth_distance(poi_data_all, read_community_data):
dict_temp = {
'体育馆':["['综合体育馆']"],
'幼儿园':["['幼儿园']"],
'小学':["['小学']","['中学', '小学']", "['小学', '中学']"],
'购物中心':["['购物中心']"],
'三级甲等医院':["['三级甲等医院']"],
'政府机关':["['区县级政府及事业单位']","['省直辖市级政府及事业单位']"],
'火车站':["['火车站']"],
'景区':["['国家级景点']", "['省级景点']"],
'公园':["['公园']"],
'高等院校':["['高等院校']"],
'地铁站': ["['地铁站']"],
'飞机场':["['飞机场']"],
'城市中心':["['城市中心']"],
'城市广场':["['城市广场']"],
}
data1=dict_temp.items()
community_data = read_community_data[['省份', '楼盘名称', '高德经度', '高德纬度']].values
list_distance_min=[]
for key,value in data1:
poi_data=poi_data_all.copy()
if key=='政府机关':
poi_data = poi_data[((poi_data['sub_category']=="['区县级政府及事业单位']")|(poi_data['sub_category']=="['省直辖市级政府及事业单位']"))&(poi_data['name'].str.endswith('人民政府')) ]
elif key=='小学':
poi_data=poi_data[((poi_data['sub_category']=="['小学']")|(poi_data['sub_category']=="['中学', '小学']")|(poi_data['sub_category']=="['小学', '中学']"))]
elif key=='景区':
poi_data=poi_data[((poi_data['sub_category']=="['国家级景点']")|(poi_data['sub_category']=="['省级景点']"))]
else:
print(value)
poi_data =poi_data[(poi_data['sub_category']==value[0])]
# 小区清单列表
list_distance_min=[]
print(value, 'poi数据:', len(poi_data), '小区清单列表:', len(read_community_data))
# 遍历小区清单
for temp_comm in tqdm(community_data):
# 经度
temp_comm_longitude = float(temp_comm[2])
# 纬度
temp_comm_latitude = float(temp_comm[3])
list_distance_temp=[]
# 过滤经纬度
temp_poi_data = poi_data[['mid_category', 'sub_category', 'longitude', 'latitude', 'name']].values
for temp_poi in temp_poi_data:
# 经度
temp_poi_longitude = float(temp_poi[2])
# 纬度
temp_poi_latitude = float(temp_poi[3])
# 距离
distance = getDistance(temp_comm_longitude, temp_comm_latitude, temp_poi_longitude, temp_poi_latitude)
list_distance_temp.append(distance)
distance_min=min(list_distance_temp)if len(list_distance_temp)!=0 else 0
list_distance_min.append(distance_min)
read_community_data['%s'%(key)]=list_distance_min
return read_community_data
购物中心标签
def get_turth_number(poi_data_all, read_community_data):
dict_temp = {
'商务写字楼':["['商务写字楼']"],
'丧葬设施':["['丧葬设施']"],
'商场':["['商场']"],
'路口名':["['路口名']"],
'公交车站相关':["['公交车站相关']"],
'工厂':["['工厂']"],
}
data1=dict_temp.items()
community_data = read_community_data[['省份', '楼盘名称', '高德经度', '高德纬度']].values
for key,value in data1:
poi_data=poi_data_all.copy()
print(value)
poi_data =poi_data[(poi_data['sub_category']==value[0])]
# 小区清单列表
list_number=[]
print(value, 'poi数据:', len(poi_data), '小区清单列表:', len(read_community_data))
# 遍历小区清单
for temp_comm in tqdm(community_data):
# 经度
temp_comm_longitude = float(temp_comm[2])
# 纬度
temp_comm_latitude = float(temp_comm[3])
list_number_temp=[]
# 过滤经纬度
poi_data_spilt = select_facter_poi(temp_comm_longitude, temp_comm_latitude, poi_data, 2000)
temp_poi_data = poi_data_spilt[['mid_category', 'sub_category', 'longitude', 'latitude', 'name']].values
for temp_poi in temp_poi_data:
# 经度
temp_poi_longitude = float(temp_poi[2])
# 纬度
temp_poi_latitude = float(temp_poi[3])
# 距离
distance = getDistance(temp_comm_longitude, temp_comm_latitude, temp_poi_longitude, temp_poi_latitude)
if distance<=1000:
list_number_temp.append(distance)
number=len(list_number_temp)
list_number.append(number)
read_community_data['%s'%(key)]=list_number
return read_community_data
class CaseValue():
def __init__(self, city_name):
self.city_name = city_name
self.df = self.read_and_process_data()
self.data = self.get_info(self.df)
self.result = self.match_c()
self.result = self.fill_nan(self.result)
def read_and_process_data(self):
year, month = 2019, 1
cols_dict = \
{'bazhuayu_case.case_type': '案例类型',
'bazhuayu_case.case_source': '案例来源',
'bazhuayu_case.district': '区域',
'bazhuayu_case.community_sources': '案例源小区名称',
'bazhuayu_case.flr_total_ind': '总楼层',
'bazhuayu_case.checked_time': '看房次数',
'bazhuayu_case.attention_time': '关注量',
'bazhuayu_case.browse_time': '浏览次数',
'bazhuayu_case.list_time': '挂牌时间',
'bazhuayu_case.transaction_time':'成交时间',
'bazhuayu_case.list_totalprice': '挂牌总价(万元)',
'bazhuayu_case.list_unitprice': '挂牌单价(元/㎡)',
'bazhuayu_case.transaction_price': '成交总价(万元)',
'bazhuayu_case.transaction_avg_price': '成交单价(元/㎡)',
'bazhuayu_case.community_price': '案例源小区均价(元/㎡)',
'bazhuayu_case.transaction_cycle': '成交周期',
'bazhuayu_case.price_adjustment_times': '调价次数',
'bazhuayu_case.longitude': '经度',
'bazhuayu_case.latitude': '纬度'}
# TODO: 读取数据
df = case_data.copy()
df.columns = df.columns.map(cols_dict)
# 逗号过滤
df['案例源小区名称'] = df['案例源小区名称'].astype(str)
df['案例源小区名称'] = df['案例源小区名称'].str.replace(',', '')
# TODO: 八爪鱼小区名称映射
df_community = yingshe
df_community['映射字段'] = df_community['district'] + df_community['community_sources']
d = dict(zip(df_community['映射字段'], df_community['man_community_id']))
df['映射字段'] = df['区域'] + df['案例源小区名称']
df['匹配ID'] = df['映射字段'].map(d)
n = df.shape[0]
n_nan = df[df['匹配ID'].isna()].shape[0]
print('总数据量:%s, 可映射数据%s, 不可映射数据%s'%(n, n-n_nan, n_nan))
df = df[df['匹配ID'].notna()]
def func(x):
if pd.isna(x['挂牌时间']):
if pd.isna(x['成交时间']):
return np.nan
else:
return x['成交时间']
else:
return x['挂牌时间']
def func2(x):
if pd.isna(x['成交单价(元/㎡)']) :
return x['挂牌单价(元/㎡)'] * 0.95
else:
return x['成交单价(元/㎡)']
df['时间'] = df.apply(func, axis=1)
df['时间'] = df['时间'].map(lambda x: np.nan if len(str(x)) > 10 else x)
df['时间']= pd.to_datetime(df['时间'], errors='coerce')
df = df[df['时间'].notna()]
df = df[df['时间']>pd.to_datetime('%s-%s-01'%(year, month))]
df['时间'] = df['时间'].map(lambda x: (x.year, x.month))
cols = ['案例源小区均价(元/㎡)', '挂牌总价(万元)', '挂牌单价(元/㎡)', '成交总价(万元)',
'成交单价(元/㎡)', '总楼层']
for col in cols:
df[col] = pd.to_numeric(df[col], errors='coerce')
df['价格'] = df[['挂牌单价(元/㎡)', '成交单价(元/㎡)']].apply(func2, axis=1)
df = df[['匹配ID', '案例源小区均价(元/㎡)', '挂牌总价(万元)', '挂牌单价(元/㎡)', '成交总价(万元)',
'成交单价(元/㎡)', '总楼层','时间','价格']]
df.drop_duplicates(subset=['匹配ID', '价格'], inplace=True)
return df
def get_info(self, df):
data = []
for c_id, group in tqdm(df.groupby(['匹配ID']), desc='挂牌数据计算中'):
n_chengjiao = group[group['成交单价(元/㎡)'].notna()].shape[0]
n_guapai = group[group['挂牌单价(元/㎡)'].notna()].shape[0]
n_floor = group['总楼层'].median()
avg_price = group['价格'].median()
case_price = group['案例源小区均价(元/㎡)'].median()
var_price = group['案例源小区均价(元/㎡)'].var()
if pd.isna(case_price):
price = avg_price
if price < 1000:
price = np.nan
else:
price = case_price
if price < 1000:
price = np.nan
a = group.groupby('时间').mean()
rate = (a['价格'].diff() / a['价格']).mean()
data.append([c_id, n_chengjiao, n_guapai, n_floor, price, rate, var_price])
data = pd.DataFrame(data, columns=['楼盘ID','成交数量','挂牌数量','总楼层','小区均价','抗跌率', '价格方差'])
return data
def match_c(self):
# TODO: 读取小区清单
result = pd.merge(community[['楼盘ID']], self.data,
on='楼盘ID', how='left')
return result
def fill_nan(self, result):
result['成交数量'].fillna(0, inplace=True)
result['挂牌数量'].fillna(0, inplace=True)
result['总楼层'].fillna(result['总楼层'].quantile(0.4), inplace=True)
result['抗跌率'].fillna(result['抗跌率'].quantile(0.4), inplace=True)
return result
处理填充
def community_data(df):
# 填充房屋类型的空值
housing_type_fill=str(df['房屋类型(小类)'].mode())
df.fillna({'房屋类型(小类)':housing_type_fill})
# 填充绿化率和容积率的空值,根据房屋类型分组填充
cols=[col for col in df.columns if col in ['绿化率','容积率'] ]
gp_col='房屋类型(小类)'
df_na=df[cols].isna()
df_mean=df.groupby(gp_col)[cols].mean()
for col in cols:
na_series=df_na[col]
names=list(df.loc[na_series,gp_col])
t=df_mean.loc[names,col]
t.index=df.loc[na_series,col].index
df.loc[na_series,col]=t
# 填充其他字段的空值
df['占地面积'] = pd.to_numeric(df['占地面积'], errors='coerce')
df['地上车位'] = pd.to_numeric(df['地上车位'], errors='coerce')
df['地下车位'] = pd.to_numeric(df['地下车位'], errors='coerce')
build_date_fill=int(df['建成年份'].mode())
green_rate_fill=df['绿化率'].mean()
plot_rate_fill=df['容积率'].mean()
floor_area_fill=df['占地面积'].mean()
house_num_fill=df['总户数'].mean()
manage_type_fill=str(df['管理形式'].mode())
column_list=df.fillna({'管理形式':manage_type_fill,'占地面积':floor_area_fill,'总户数':house_num_fill,'绿化率':green_rate_fill,
'容积率':plot_rate_fill,'建成年份':build_date_fill,'地上车位':0,'地下车位':0})
result=column_list[['楼盘ID','楼盘名称','行政区','绿化率','容积率','管理形式','地上车位','地下车位','占地面积','建成年份','总户数']]
column_list['车位比']=column_list.apply(lambda x:(x['地上车位']+x['地下车位'])/x['总户数'],axis=1)
return column_list
def value2score(data):
# 1:值越小分数越高 0:值越大分数越高
dict_parm = {}
[dict_parm.update({parm:'宜居程度'}) for parm in set_parm['xh_livable'].dropna().values]
[dict_parm.update({parm:'生活配套'}) for parm in set_parm['xh_life'].dropna().values]
[dict_parm.update({parm:'区位状况'}) for parm in set_parm['xh_location'].dropna().values]
[dict_parm.update({parm:'楼盘品质'}) for parm in set_parm['xh_house'].dropna().values]
[dict_parm.update({parm:'活跃程度'}) for parm in set_parm['xh_active'].dropna().values]
[dict_parm.update({parm:'不利因素'}) for parm in set_parm['xh_disadvantage'].dropna().values]
usecols = {
'地铁站': (1, dict_parm['地铁站']),
'建成年份': (0, dict_parm['建成年份']),
'绿化率': (0, dict_parm['绿化率']),
'容积率': (0, dict_parm['容积率']),
'体育馆': (1, dict_parm['体育馆']),
'幼儿园': (1, dict_parm['幼儿园']),
'总户数': (0, dict_parm['总户数']),
'占地面积': (0, '楼盘品质'),
'小学': (1, dict_parm['小学']),
'购物中心': (1, dict_parm['购物中心']),
'三级甲等医院': (1, dict_parm['三级甲等医院']),
'政府机关': (0, dict_parm['政府机关']),
'景区': (1, dict_parm['景区']),
'公园': (1, dict_parm['公园']),
'高等院校': (1, dict_parm['高等院校']),
'城市中心': (1, '区位状况'),
'城市广场': (0, dict_parm['城市广场']),
'商务写字楼': (0,dict_parm['商务写字楼']),
'丧葬设施': (1, dict_parm['丧葬设施']),
'商场': (0, dict_parm['商场']),
'路口名': (0, dict_parm['路口名']),
'公交车站相关': (0, dict_parm['公交车站相关']),
'工厂': (0, dict_parm['工厂']),
'成交数量': (0, dict_parm['成交数量']),
'挂牌数量': (0, dict_parm['挂牌数量']),
'总楼层': (0, dict_parm['总楼层']),
'抗跌率': (0, '活跃程度'),
'车位比': (0,dict_parm['车位比']),
'加油站': (0, dict_parm['加油站']),
'火车站': (0,dict_parm['火车站']),
'机场': (0, dict_parm['机场']),
'行政区分数':(0, dict_parm['行政区分数']),
}
data['建成年份'] = pd.to_numeric(data['建成年份'],errors='coerce')
data['建成年份'].fillna(data['建成年份'].quantile(0.5),inplace=True)
# 处理行政区分数
distruct_value = data[['小区均价','行政区']].groupby('行政区').mean().fillna(0)
dv_dict = dict(zip(distruct_value.index, distruct_value['小区均价']))
data['行政区分数'] = data['行政区'].map(dv_dict)
for col_name, col in data.iteritems():
if col_name in usecols.keys():
cat = usecols[col_name][1]
false = False
if false:
pass
elif false:
pass
else:
t = usecols[col_name][0]
data.loc[data[col_name] >= data[col_name].quantile(0.75), col_name + '分数'] = t*-4 + 5
data.loc[(data[col_name] < data[col_name].quantile(0.75)) & (
data[col_name] >= data[col_name].quantile(0.50)), col_name + '分数'] = t*-2 + 4
data.loc[(data[col_name] < data[col_name].quantile(0.50)) & (
data[col_name] >= data[col_name].quantile(0.30)), col_name + '分数'] = 3
data.loc[(data[col_name] < data[col_name].quantile(0.30)) & (
data[col_name] >= data[col_name].quantile(0.15)), col_name + '分数'] = t*2 + 2
data.loc[data[col_name] < data[col_name].quantile(0.15), col_name + '分数'] = t*4 + 1
if cat in data.columns:
data[cat] = pd.to_numeric(data[cat], errors='coerce')
data[cat] += data[col_name + '分数']
else:
data[cat] = data[col_name + '分数']
return data
def get_label(data):
developers = developer['公司名称'].values
propertys = property['公司名称'].values
data_temp = data
# 景点
xh_viewpoint = set_parm['xh_viewpoint'][0]
# 公园
xh_park = set_parm['xh_park'][0]
# 医院
xh_hospital = set_parm['xh_hospital'][0]
# 商务区
xh_business = set_parm['xh_business'][0]
# 地铁站
xh_subway = set_parm['xh_subway'][0]
# 政府机关
xh_government = set_parm['xh_government'][0]
# 购物中心
xh_shopping = set_parm['xh_shopping'][0]
# 总户数
xh_number = set_parm['xh_number'][0]
# 车位
xh_car_first = set_parm['xh_car'][0]
xh_car_second = set_parm['xh_car'][1]
# 容积率
xh_plot_first = set_parm['xh_plot'][0]
xh_plot_second = set_parm['xh_plot'][1]
# 绿化率
xh_green_first = set_parm['xh_green'][0]
xh_green_second = set_parm['xh_green'][1]
# 火车站
xh_trian = set_parm['xh_trian'][0]
# 机场
xh_airport = set_parm['xh_airport'][0]
# 丧葬设施
xh_funeal = set_parm['xh_funeal'][0]
# 工厂
xh_factory = set_parm['xh_factory'][0]
# 法拍
xh_vendue_first = set_parm['xh_vendue'][0]
xh_vendue_second = set_parm['xh_vendue'][1]
# 输出标签设置
show_cols = list(set_parm['xh_label'].values)
dffb = fapai
data_temp['法拍数量'] = [len(dffb[dffb['匹配ID'] == ID].drop_duplicates(subset=['匹配ID', 'title'])) for ID in data_temp['楼盘ID'].values]
data_temp.loc[(data['法拍数量'] > 0) & (data_temp['法拍数量'] <= xh_vendue_first), '法拍标签'] = '含法拍'
data_temp.loc[(data['法拍数量'] > xh_vendue_first) & (data_temp['法拍数量'] < xh_vendue_second), '法拍标签'] = '法拍数量多'
data_temp.loc[data['法拍数量'] >= xh_vendue_second, '法拍标签'] = '法拍数量极多'
data_temp['容积率'] = pd.to_numeric(data_temp['容积率'], errors='coerce')
data_temp.loc[data_temp['容积率'] <= xh_plot_first, '容积率标签'] = '容积率低'
data_temp.loc[(data_temp['容积率'] > xh_plot_first ) & (data_temp['容积率'] < xh_plot_second ), '容积率标签'] = '容积率中'
data_temp.loc[data_temp['容积率'] >= xh_plot_second, '容积率标签'] = '容积率高'
data_temp.loc[data_temp['容积率'] >= xh_plot_second, '容积率标签'] = '容积率高'
data_temp['绿化率'] = pd.to_numeric(data_temp['绿化率'], errors='coerce')
data_temp.loc[data_temp['绿化率'] <= xh_green_first, '绿化率标签'] = '绿化率低'
data_temp.loc[(data_temp['绿化率'] > xh_green_first ) & (data_temp['绿化率'] < xh_green_second ), '绿化率标签'] = '绿化率中'
data_temp.loc[data_temp['绿化率'] >= xh_green_second, '绿化率标签'] = '绿化率高'
data_temp['总户数'] = pd.to_numeric(data_temp['总户数'], errors='coerce')
data_temp.loc[(data_temp['总户数'] >= data_temp['总户数'].quantile(0.85)) & (data_temp['总户数'] > xh_number), '小区规模标签'] = '大型社区'
data_temp.loc[data_temp['总户数'] <= data_temp['总户数'].quantile(0.15), '小区规模标签'] = '小型社区'
data_temp.loc[(data_temp['车位比'] <= xh_car_first),'停车位标签'] = '停车位紧缺'
data_temp.loc[(data_temp['车位比'] > xh_car_first) & (data_temp['车位比'] < xh_car_second),'停车位标签'] = '停车位一般'
data_temp.loc[data_temp['车位比'] >= xh_car_second,'停车位标签'] = '停车位充裕'
data_temp['建成年份'] = pd.to_numeric(data_temp['建成年份'], errors='coerce')
data_temp.loc[data_temp['建成年份'] >= 2015, '楼龄标签'] = '次新房'
data_temp.loc[(data_temp['建成年份'] >= 2010) & (data_temp['建成年份'] < 2015), '楼龄标签'] = '6-10年楼龄'
data_temp.loc[(data_temp['建成年份'] >= 2005) & (data_temp['建成年份'] < 2010), '楼龄标签'] = '10-15年楼龄'
data_temp.loc[(data_temp['建成年份'] >= 2000) & (data_temp['建成年份'] < 2005), '楼龄标签'] = '15-20年楼龄'
data_temp.loc[data_temp['建成年份'] < 2000, '楼龄标签'] = '老旧小区'
data_temp.loc[data_temp['开发商'].isin(developers), '百强开发商标签'] = '百强开发商'
data_temp.loc[data_temp['物业公司'].isin(propertys), '百强物业标签'] = '百强物业'
data_temp.loc[data_temp['挂牌数量'] >= data_temp['挂牌数量'].quantile(0.85), '活跃度标签'] = '挂盘活跃'
data_temp.loc[data_temp['挂牌数量'] <= data_temp['挂牌数量'].quantile(0.15), '停车位标签'] = '挂盘不活跃'
data_temp.loc[data_temp['购物中心'] < xh_shopping, '购物中心标签'] = '近购物中心'
data_temp.loc[data_temp['三级甲等医院'] < xh_hospital, '三级甲等医院标签'] = '近三甲医院'
data_temp.loc[data_temp['政府机关'] < xh_government, '政府机关标签'] = '近政府机关'
data_temp.loc[data_temp['火车站'] < xh_trian, '火车站标签'] = '近火车站'
data_temp.loc[data_temp['景区'] < xh_viewpoint, '景区标签'] = '近景区'
data_temp.loc[data_temp['公园'] < xh_park, '公园标签'] = '近公园'
data_temp.loc[data_temp['地铁站'] < xh_subway, '地铁标签'] = '近地铁站'
data_temp.loc[data_temp['飞机场'] < xh_airport, '机场标签'] = '距离机场过近'
data_temp.loc[data_temp['丧葬设施'] > xh_funeal, '丧葬设施标签'] = '距离丧葬设施过近'
data_temp.loc[data_temp['工厂'] > xh_factory, '工厂标签'] = '距离工厂过近'
data_temp.loc[(data_temp['商务写字楼'] >= data_temp['商务写字楼'].quantile(0.85)) & (data_temp['商务写字楼'] > xh_business), '商务区标签'] = '商务区'
#配套设施
data_temp['配套设施分数'] = data_temp['公交车站相关分数'] + data_temp['三级甲等医院分数'] + data_temp['商场分数'] + data_temp['体育馆分数']
data_temp.loc[(data_temp['配套设施分数'] >= data_temp['配套设施分数'].quantile(0.85)), '配套设施标签'] = '配套设施完善'
# 交易活跃度
data_temp['交易活跃度分数'] = data_temp['成交数量分数'] + data_temp['挂牌数量分数']
transa_temp = data_temp[data_temp['交易活跃度分数'] > 0]['交易活跃度分数']
transa_first = transa_temp.quantile(0.8)
transa_second = transa_temp.quantile(0.3)
data_temp.loc[(data_temp['交易活跃度分数'] >= transa_first), '交易活跃度标签'] = '交易活跃'
data_temp.loc[(data_temp['交易活跃度分数'] >= transa_second) & (data_temp['交易活跃度分数'] < transa_first), '交易活跃度标签'] = '交易一般活跃'
data_temp.loc[(data_temp['交易活跃度分数'] < transa_second), '交易活跃度标签'] = '交易不活跃'
data_temp.loc[(data_temp['交易活跃度分数'] == 0), '交易活跃度标签'] = '无交易'
# 价格波动率
data_temp.loc[abs(data_temp['抗跌率']) >= 0.1, '价格波动率标签'] = '价格波动大'
data_temp.loc[(abs(data_temp['抗跌率']) > 0.05) & (abs(data_temp['抗跌率']) < 0.1) , '价格波动率标签'] = '价格波动一般'
data_temp.loc[abs(data_temp['抗跌率']) <= 0.05, '价格波动率标签'] = '价格稳定'
# 性价比
data_temp['性价比'] = data_temp['综合评分'] / data_temp['小区均价']
show_cols = data_temp.columns[data_temp.columns.str.contains('标签')]
# 筛选标签
show_cols = xh_label
# 计算展示标签
for idx, row in data_temp.iterrows():
labels = []
for col in show_cols:
if pd.isna(row[col]):
pass
else:
labels.append(row[col])
data_temp.loc[idx, '展示标签'] = ','.join(labels)
return data_temp
def train(data):
use_cols = [ '容积率',
'绿化率',
'占地面积',
'建成年份',
'总楼层',
'城市中心',
'抗跌率',
'购物中心',
'三级甲等医院',
'高等院校',
'商务写字楼',
'公交车站相关',
'挂牌数量',
'路口名',
'商场',
'工厂',
'政府机关',
'丧葬设施',
'体育馆',
'公园',
'地铁站',
'小学',
'幼儿园',
'火车站',
'小区均价']
train_data = data[use_cols]
X = train_data[train_data['小区均价'].notna()].iloc[:, :-1]
y = train_data[train_data['小区均价'].notna()].iloc[:, -1]
model = XGBRegressor(n_jobs=-1)
param_grid = {'max_depth': np.arange(2, 8, 1),
'gamma': np.arange(0.5, 0.8, 0.1),
'colsample_bytree': np.arange(0.5, 0.8, 0.1)}
gs = GridSearchCV(model, param_grid=param_grid, n_jobs=-1, verbose=1, cv=5, scoring=mape)
gs.fit(X, y)
print(gs.best_params_)
print(gs.best_score_)
X_ = train_data.iloc[:, :-1]
y_ = gs.predict(X_)
data['综合评分'] = y_
return data
def change_poi(poi_data):
location = poi_data['location'].str.split(',')
list_longitude = []
list_latitude = []
[(list_longitude.append(x[0]), list_latitude.append(x[1])) for x in location]
poi_data['longitude'] = list_longitude
poi_data['latitude'] = list_latitude
poi_data['longitude'] =poi_data['longitude'].astype(float)
poi_data['latitude'] =poi_data['latitude'].astype(float)
return poi_data
def get_level(result):
result.loc[result['综合评分'] >= result['综合评分'].quantile(0.8), '综合评级'] = 'A'
result.loc[(result['综合评分'] < result['综合评分'].quantile(0.8)) &
(result['综合评分'] >= result['综合评分'].quantile(0.6)), '综合评级'] = 'B'
result.loc[(result['综合评分'] < result['综合评分'].quantile(0.6)) &
(result['综合评分'] >= result['综合评分'].quantile(0.4)), '综合评级'] = 'C'
result.loc[(result['综合评分'] < result['综合评分'].quantile(0.4)) &
(result['综合评分'] >= result['综合评分'].quantile(0.2)), '综合评级'] = 'D'
result.loc[result['综合评分'] < result['综合评分'].quantile(0.2), '综合评级'] = 'E'
# TODO: 小区均价 --> 小区价格评级
result.loc[result['小区均价'] >= result['小区均价'].quantile(0.8), '小区价格评级'] = 'A'
result.loc[(result['小区均价'] < result['小区均价'].quantile(0.8)) &
(result['小区均价'] >= result['小区均价'].quantile(0.6)), '小区价格评级'] = 'B'
result.loc[(result['小区均价'] < result['小区均价'].quantile(0.6)) &
(result['小区均价'] >= result['小区均价'].quantile(0.4)), '小区价格评级'] = 'C'
result.loc[(result['小区均价'] < result['小区均价'].quantile(0.4)) &
(result['小区均价'] >= result['小区均价'].quantile(0.2)), '小区价格评级'] = 'D'
result.loc[result['小区均价'] < result['小区均价'].quantile(0.2), '小区价格评级'] = 'E'
# TODO: 填充评级
for idx, row in result.iterrows():
if pd.isna(row['小区价格评级']):
result.loc[idx, '小区价格评级'] = row['综合评级']
# TODO: 数据重排
n = result.shape[0]
for col in ['楼盘品质', '宜居程度', '区位状况',
'生活配套', '活跃程度', '不利因素','综合评分']:
result.sort_values(col, inplace=True)
result[col] = np.linspace(5, 10, n)
return result
def get_risk(all_community_data):
list_one = ['A'] * 5 + ['B'] * 5 + ['C'] * 5 + ['D'] * 5 + ['E'] * 5
list_two = ['A', 'B', 'C', 'D', 'E'] * 5
list_str = ['正常', '关注', '谨慎', '谨慎', '谨慎', '正常', '正常', '关注', '谨慎', '谨慎', '正常', '正常', '正常', '关注', '谨慎', '正常', '正常',
'关注', '谨慎', '谨慎', '正常', '关注', '谨慎', '谨慎', '谨慎', ]
risk_data = pd.DataFrame({'小区价格评级': list_one, '综合评级': list_two, '策略': list_str})
list_risk = []
for price, score in all_community_data[['小区价格评级', '综合评级']].values:
risk_temp = risk_data[(risk_data['小区价格评级'] == price) & (risk_data['综合评级'] == score)]['策略'].values[0]
list_risk.append(risk_temp)
all_community_data['风险策略'] = list_risk
return all_community_data
def high_quality_community(data):
df = data.copy()
df = df[df['综合评级'].isin(['C', 'B', 'A'])]
print('综合评级:',len(df))
df = df[df['小区价格评级'].isin(['A', 'B', 'C', 'D'])]
print('小区价格评级:',len(df))
df = df[df['风险策略'].isin(['正常', '关注'])]
print('风险策略:',len(df))
df = df[df['房屋类型(小类)'] == '普通住宅']
print('房屋类型:',len(df))
df = df[df['建成年份'] > 2000]
print('建成年份:',len(df))
up = 0.05
down = 0
df = df[(np.abs(df['抗跌率']) >= down) & (np.abs(df['抗跌率']) <=up)]
df = df[df['价格波动率标签'] == '价格稳定']
print('价格波动率:',len(df))
# 变现能力
df = df[df['法拍数量'] == 0]
print('法拍数量:',len(df))
# 性价比
df = df[df['性价比'] > df['性价比'].quantile(0.6) ]
print('性价比:',len(df))
df = df[df['价格方差'] < df['价格方差'].quantile(0.4) ]
print('价格集中度:',len(df))
data.loc[df.index, '优质小区'] = '是'
return data
if name=='main':
print('城市: ', city)
start_time1 = time.time()
# 获取poi数据
poi_data = change_poi(poi)
# 获取小区清单数据
read_community_data = community
read_community_data.dropna(subset=['高德经度'],inplace=True)
# TODO: 解析POI数据
start_time = time.time()
read_community_data = get_turth_distance(poi_data, read_community_data)
read_community_data = get_turth_number(poi_data, read_community_data)
print('poi数据运行时间:',time.time() - start_time)
# TODO: 解析案例数据
start_time = time.time()
case_value = CaseValue(city)
result = case_value.result
read_community_data = pd.merge(read_community_data, result, on='楼盘ID', how='left')
print('案例数据运行时间:',time.time() - start_time)
# TODO: 处理小区清单数据
start_time = time.time()
read_community_data=community_data(read_community_data)
print('全部运行时间:',time.time() - start_time1)
# TODO: 原始值出分
read_community_data = value2score(read_community_data)
# read_community_data.to_excel('训练数据_%s.xlsx'%city)
# TODO: 训练模型
read_community_data = train(read_community_data)
# TODO: 出标签
start_time = time.time()
read_community_data = get_label(read_community_data, )
print('出标签时间:',time.time() - start_time)
# TODO: 小区评级
result = get_level(read_community_data)
# TODO: 风险策略
result = get_risk(result)
# TODO: 添加优质小区
result = high_quality_community(result)
result.to_excel('小区画像_%s.xlsx'%city, index=False)
result_cols = [
'省份','城市','城市代号','行政区','行政区代号','楼盘名称','楼盘ID','地址',
'小区均价','小区价格评级','综合评级','综合评分','风险策略','展示标签','楼盘品质',
'宜居程度','区位状况','生活配套','活跃程度','不利因素','优质小区']
result = result[result_cols]
result.to_excel('小区评级结果表_%s.xlsx'%city,index=False)