2020-01-13

# -*- coding: utf-8 -*-
"""
Created on Thu Jan 9 11:36:00 2020

@author: QIAOQICHAO258
"""

import pandas as pd
import numpy as np
import time
from tqdm import tqdm
import codecs
import csv
import os
from math import radians, cos, sin, asin, sqrt
import math
from casevalue import CaseValue
from xgboost import XGBRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import minmax_scale

pi = math.pi
EARTH_REDIUS = 6378.137  # earth radius, km

# Degrees of longitude corresponding to 1 meter
LONGITUDE_COEF = 0.000011
# Degrees of latitude corresponding to 1 meter
LATITUDE_COEF = 0.000009

# City
city = '深圳市'
# POI data file (AMap/Gaode POI)
path_poi = '高德POI/%s.xlsx' % city
# Community list data file
path_community = '小区清单/小区清单_高德地址_%s.xlsx' % city


def rad(d):
    """Convert degrees to radians."""
    return d * pi / 180.0


# Distance between two lat/lng points, in meters
def getDistance1(lat1, lng1, lat2, lng2):
    radLat1 = rad(lat1)
    radLat2 = rad(lat2)
    a = radLat1 - radLat2
    b = rad(lng1) - rad(lng2)
    s = 2 * math.asin(math.sqrt(math.pow(math.sin(a / 2), 2) +
                                math.cos(radLat1) * math.cos(radLat2) * math.pow(math.sin(b / 2), 2)))
    s = s * EARTH_REDIUS
    return s * 1000


def getDistance(lon1, lat1, lon2, lat2):
    """Great-circle distance between two lon/lat points, in meters."""
    # Convert decimal degrees to radians
    lon1, lat1, lon2, lat2 = map(radians, [lon1, lat1, lon2, lat2])
    # Haversine formula
    dlon = lon2 - lon1
    dlat = lat2 - lat1
    a = sin(dlat / 2) ** 2 + cos(lat1) * cos(lat2) * sin(dlon / 2) ** 2
    c = 2 * asin(sqrt(a))
    r = 6378.137  # mean earth radius, km
    return c * r * 1000


# Load POI data
def get_poi_data(poi_path):
    read_poi_data = pd.read_excel(poi_path)
    # read_poi_data = pd.read_csv(poi_path, encoding='utf-8', engine='python')
    return read_poi_data


# Load the community list
def get_community_data(comm_path):
    read_community_data = pd.read_excel(comm_path)
    return read_community_data


# Listing / transaction case data
def get_shilian_data(path_data):
    # pd.read_table cannot handle a path containing Chinese characters:
    # shilian_data = pd.read_table(shilian_path, sep='$', error_bad_lines=False,
    #                              encoding='utf-8', low_memory=False, quoting=csv.QUOTE_NONE)

    # codecs.open works with a Chinese path
    with codecs.open(path_data, 'r', encoding='utf-8', errors='ignore') as f:
        shilian_data = pd.read_csv(f, sep='$', error_bad_lines=False,
                                   low_memory=False, quoting=csv.QUOTE_NONE)
    # Read the city mapping table
    bazhuayu = pd.read_excel(r'D:\Users\QIAOQICHAO258\Desktop\合作小区画像代码\各类数据\新映射表\九月八爪鱼案例-深圳交付.xlsx')
    # Case community name -> community ID
    dict_temp = dict(zip(bazhuayu['community_sources'], bazhuayu['匹配ID-汇总']))
    # Map the IDs onto the case data
    shilian_data['fang_case.community_id'] = shilian_data['bazhuayu_case.community_sources'].map(dict_temp)
    return shilian_data


# Pre-filter POIs to a rectangular window of roughly len_coef meters around a community
def select_facter_poi(temp_comm_longitude, temp_comm_latitude, poi_data, len_coef):
    poi_data_spilt = poi_data[(temp_comm_longitude + LONGITUDE_COEF * len_coef > poi_data['longitude']) &
                              (poi_data['longitude'] > temp_comm_longitude - LONGITUDE_COEF * len_coef) &
                              (temp_comm_latitude + LONGITUDE_COEF * len_coef > poi_data['latitude']) &
                              (poi_data['latitude'] > temp_comm_latitude - LONGITUDE_COEF * len_coef)]
    return poi_data_spilt
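As a quick sanity check (not part of the original script), the two distance helpers can be compared on a pair of made-up coordinates near Shenzhen. Note the argument order differs: getDistance takes (lon, lat, lon, lat) while getDistance1 takes (lat, lng, lat, lng).

# Illustration only -- made-up coordinates, meant to be run separately (e.g. in a REPL)
lon_a, lat_a = 114.0579, 22.5431   # point near the Shenzhen city centre (hypothetical)
lon_b, lat_b = 114.1095, 22.5446   # point a few kilometres to the east (hypothetical)
print(getDistance(lon_a, lat_a, lon_b, lat_b))    # ~5300 m, lon/lat order
print(getDistance1(lat_a, lon_a, lat_b, lon_b))   # ~5300 m, lat/lng order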
# Minimum distance from each community to each distance-type POI (e.g. nearest stadium)
def get_turth_distance(poi_data_all, read_community_data):
    dict_temp = {
        '体育馆': ["['综合体育馆']"],
        '幼儿园': ["['幼儿园']"],
        '小学': ["['小学']", "['中学', '小学']", "['小学', '中学']"],
        '购物中心': ["['购物中心']"],
        '三级甲等医院': ["['三级甲等医院']"],
        '政府机关': ["['区县级政府及事业单位']", "['省直辖市级政府及事业单位']"],
        '火车站': ["['火车站']"],
        '景区': ["['国家级景点']", "['省级景点']"],
        '公园': ["['公园']"],
        '高等院校': ["['高等院校']"],
        '地铁站': ["['地铁站']"],
        '飞机场': ["['飞机场']"],
        '城市中心': ["['城市中心']"],
        '城市广场': ["['城市广场']"],
    }
    data1 = dict_temp.items()
    community_data = read_community_data[['省份', '楼盘名称', '高德经度', '高德纬度']].values
    for key, value in data1:
        poi_data = poi_data_all.copy()
        # Categories matched by several sub_category values need explicit filters
        if key == '政府机关':
            poi_data = poi_data[((poi_data['sub_category'] == "['区县级政府及事业单位']") |
                                 (poi_data['sub_category'] == "['省直辖市级政府及事业单位']")) &
                                (poi_data['name'].str.endswith('人民政府'))]
        elif key == '小学':
            poi_data = poi_data[(poi_data['sub_category'] == "['小学']") |
                                (poi_data['sub_category'] == "['中学', '小学']") |
                                (poi_data['sub_category'] == "['小学', '中学']")]
        elif key == '景区':
            poi_data = poi_data[(poi_data['sub_category'] == "['国家级景点']") |
                                (poi_data['sub_category'] == "['省级景点']")]
        else:
            print(value)
            poi_data = poi_data[poi_data['sub_category'] == value[0]]
        # One minimum distance per community
        list_distance_min = []
        print(value, 'poi数据:', len(poi_data), '小区清单列表:', len(read_community_data))

        temp_poi_data = poi_data[['mid_category', 'sub_category', 'longitude', 'latitude', 'name']].values

        # Iterate over the community list
        for temp_comm in tqdm(community_data):
            # Longitude
            temp_comm_longitude = float(temp_comm[2])
            # Latitude
            temp_comm_latitude = float(temp_comm[3])
            list_distance_temp = []

            for temp_poi in temp_poi_data:
                # Longitude
                temp_poi_longitude = float(temp_poi[2])
                # Latitude
                temp_poi_latitude = float(temp_poi[3])
                # Distance in meters
                distance = getDistance(temp_comm_longitude, temp_comm_latitude,
                                       temp_poi_longitude, temp_poi_latitude)
                list_distance_temp.append(distance)
            distance_min = min(list_distance_temp) if len(list_distance_temp) != 0 else 0
            list_distance_min.append(distance_min)
        read_community_data['%s' % key] = list_distance_min
    return read_community_data


# Count of each count-type POI within 1000 m of each community (e.g. shopping malls)
def get_turth_number(poi_data_all, read_community_data):
    dict_temp = {
        '商务写字楼': ["['商务写字楼']"],
        '丧葬设施': ["['丧葬设施']"],
        '商场': ["['商场']"],
        '路口名': ["['路口名']"],
        '公交车站相关': ["['公交车站相关']"],
        '工厂': ["['工厂']"],
    }
    data1 = dict_temp.items()
    community_data = read_community_data[['省份', '楼盘名称', '高德经度', '高德纬度']].values
    for key, value in data1:
        poi_data = poi_data_all.copy()
        print(value)
        poi_data = poi_data[poi_data['sub_category'] == value[0]]
        # One count per community
        list_number = []
        print(value, 'poi数据:', len(poi_data), '小区清单列表:', len(read_community_data))

        # Iterate over the community list
        for temp_comm in tqdm(community_data):
            # Longitude
            temp_comm_longitude = float(temp_comm[2])
            # Latitude
            temp_comm_latitude = float(temp_comm[3])
            list_number_temp = []
            # Pre-filter POIs to a ~2000 m bounding box before the exact distance check
            poi_data_spilt = select_facter_poi(temp_comm_longitude, temp_comm_latitude, poi_data, 2000)
            temp_poi_data = poi_data_spilt[['mid_category', 'sub_category', 'longitude', 'latitude', 'name']].values

            for temp_poi in temp_poi_data:
                # Longitude
                temp_poi_longitude = float(temp_poi[2])
                # Latitude
                temp_poi_latitude = float(temp_poi[3])
                # Distance in meters
                distance = getDistance(temp_comm_longitude, temp_comm_latitude,
                                       temp_poi_longitude, temp_poi_latitude)
                if distance <= 1000:
                    list_number_temp.append(distance)
            number = len(list_number_temp)
            list_number.append(number)
        read_community_data['%s' % key] = list_number
    return read_community_data
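The nested Python loops above cost O(communities x POIs) per category. Below is a hedged sketch (my own helper, not part of the original script) of a NumPy-vectorized alternative that evaluates the same haversine formula against a whole POI array in one call.

# Hypothetical vectorized helper -- same haversine formula as getDistance,
# applied to numpy arrays instead of one POI per loop iteration.
import numpy as np

def getDistance_vec(lon1, lat1, lon_arr, lat_arr):
    lon1, lat1 = np.radians(lon1), np.radians(lat1)
    lon_arr, lat_arr = np.radians(lon_arr), np.radians(lat_arr)
    dlon = lon_arr - lon1
    dlat = lat_arr - lat1
    a = np.sin(dlat / 2) ** 2 + np.cos(lat1) * np.cos(lat_arr) * np.sin(dlon / 2) ** 2
    return 2 * np.arcsin(np.sqrt(a)) * 6378.137 * 1000  # meters

# Usage sketch: nearest distance and <=1000 m count for one community
# d = getDistance_vec(comm_lon, comm_lat, poi_data['longitude'].values, poi_data['latitude'].values)
# nearest = d.min() if d.size else 0
# n_within_1km = int((d <= 1000).sum())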
# Fill missing values in the community list
def community_data(df):
    # Fill missing housing type with the mode
    housing_type_fill = df['房屋类型(小类)'].mode().iloc[0]
    df = df.fillna({'房屋类型(小类)': housing_type_fill})
    # Fill missing greening rate / plot ratio with the group mean by housing type
    cols = [col for col in df.columns if col in ['绿化率', '容积率']]
    gp_col = '房屋类型(小类)'
    df_na = df[cols].isna()
    df_mean = df.groupby(gp_col)[cols].mean()
    for col in cols:
        na_series = df_na[col]
        names = list(df.loc[na_series, gp_col])
        t = df_mean.loc[names, col]
        t.index = df.loc[na_series, col].index
        df.loc[na_series, col] = t

    # Fill the remaining fields
    build_date_fill = int(df['建成年份'].mode().iloc[0])
    green_rate_fill = df['绿化率'].mean()
    plot_rate_fill = df['容积率'].mean()
    floor_area_fill = df['占地面积'].mean()
    house_num_fill = df['总户数'].mean()
    manage_type_fill = df['管理形式'].mode().iloc[0]

    column_list = df.fillna({'管理形式': manage_type_fill, '占地面积': floor_area_fill, '总户数': house_num_fill,
                             '绿化率': green_rate_fill, '容积率': plot_rate_fill, '建成年份': build_date_fill,
                             '地上车位': 0, '地下车位': 0})

    # Parking ratio = (ground + underground parking spots) / total households
    column_list['车位比'] = column_list.apply(lambda x: (x['地上车位'] + x['地下车位']) / x['总户数'], axis=1)
    return column_list


# Convert raw feature values into 1-5 scores by quantile bucket and
# accumulate them into category scores (楼盘品质 / 生活配套 / ...)
def value2score(data):
    # column -> (direction, category); direction 1 marks columns where smaller values score higher
    usecols = {
        '地铁站': (1, '生活配套'),
        '建成年份': (0, '楼盘品质'),
        '绿化率': (0, '楼盘品质'),
        '容积率': (0, '楼盘品质'),
        '体育馆': (1, '宜居程度'),
        '幼儿园': (1, '宜居程度'),
        '总户数': (0, '楼盘品质'),
        '占地面积': (0, '楼盘品质'),
        '小学': (1, '生活配套'),
        '购物中心': (1, '生活配套'),
        '三级甲等医院': (1, '生活配套'),
        '政府机关': (0, '区位状况'),
        '景区': (1, '宜居程度'),
        '公园': (1, '宜居程度'),
        '高等院校': (1, '区位状况'),
        '城市中心': (1, '区位状况'),
        '城市广场': (0, '区位状况'),
        '商务写字楼': (0, '区位状况'),
        '丧葬设施': (1, '不利因素'),
        '商场': (0, '生活配套'),
        '路口名': (0, '区位状况'),
        '公交车站相关': (0, '区位状况'),
        '工厂': (0, '不利因素'),
        '成交数量': (0, '活跃程度'),
        '挂牌数量': (0, '活跃程度'),
        '总楼层': (0, '楼盘品质'),
        '抗跌率': (0, '活跃程度'),
        '车位比': (0, '楼盘品质')}
    data['建成年份'] = pd.to_numeric(data['建成年份'], errors='coerce')
    data['建成年份'].fillna(data['建成年份'].quantile(0.5), inplace=True)
    for col_name, col in data.iteritems():
        if col_name in usecols.keys():
            cat = usecols[col_name][1]
            t = usecols[col_name][0]
            # Quantile buckets map to scores 5/4/3/2/1; t flips the direction
            data.loc[data[col_name] >= data[col_name].quantile(0.85), col_name + '分数'] = t * -4 + 5
            data.loc[(data[col_name] < data[col_name].quantile(0.85)) &
                     (data[col_name] >= data[col_name].quantile(0.65)), col_name + '分数'] = t * -2 + 4
            data.loc[(data[col_name] < data[col_name].quantile(0.65)) &
                     (data[col_name] >= data[col_name].quantile(0.45)), col_name + '分数'] = 3
            data.loc[(data[col_name] < data[col_name].quantile(0.45)) &
                     (data[col_name] >= data[col_name].quantile(0.25)), col_name + '分数'] = t * 2 + 2
            data.loc[data[col_name] < data[col_name].quantile(0.25), col_name + '分数'] = t * 4 + 1
            if cat in data.columns:
                data[cat] += data[col_name + '分数']
            else:
                data[cat] = data[col_name + '分数']
    return data
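A small worked illustration of the quantile rule in value2score (toy numbers, not from the real data): a community at or above the 85th percentile of 绿化率 gets 5 points because t=0 for that column, whereas the same bucket for a t=1 column such as the 地铁站 distance would get t*-4+5 = 1.

# Standalone toy example of the quantile scoring rule -- run separately.
import pandas as pd

toy = pd.DataFrame({'绿化率': [0.20, 0.25, 0.30, 0.35, 0.40, 0.45, 0.50, 0.55, 0.60, 0.65]})
t = 0                                  # 0 = higher value is better
q85 = toy['绿化率'].quantile(0.85)      # about 0.58 for this toy column
toy.loc[toy['绿化率'] >= q85, '绿化率分数'] = t * -4 + 5   # top bucket -> 5 points
print(toy.tail(3))                     # only the last two rows receive a score of 5.0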
# Assign descriptive labels to each community
def get_label(data):
    list_developers = pd.read_excel('百强物业开发商/百强开发商.xlsx')
    list_property = pd.read_excel('百强物业开发商/百强物业.xlsx')
    developers = list_developers['公司名称'].values
    propertys = list_property['公司名称'].values

    data_temp = data.copy()

    data_temp.loc[data_temp['容积率'] >= data_temp['容积率'].quantile(0.85), '容积率标签'] = '容积率高'
    data_temp.loc[data_temp['容积率'] <= data_temp['容积率'].quantile(0.15), '容积率标签'] = '容积率低'

    data_temp.loc[data_temp['绿化率'] >= data_temp['绿化率'].quantile(0.85), '绿化率标签'] = '绿化率高'
    data_temp.loc[data_temp['绿化率'] <= data_temp['绿化率'].quantile(0.15), '绿化率标签'] = '绿化率低'

    data_temp.loc[data_temp['总户数'] >= data_temp['总户数'].quantile(0.85), '小区规模标签'] = '大型社区'
    data_temp.loc[data_temp['总户数'] <= data_temp['总户数'].quantile(0.15), '小区规模标签'] = '小型社区'

    # data_temp.loc[data_temp['车位比'] >= data_temp['车位比'].quantile(0.85), '停车位标签'] = '停车位充裕'
    # data_temp.loc[data_temp['车位比'] <= data_temp['车位比'].quantile(0.15), '停车位标签'] = '停车位紧缺'

    data_temp.loc[data_temp['建成年份'] >= 2015, '楼龄标签'] = '次新房'
    data_temp.loc[(data_temp['建成年份'] >= 2010) & (data_temp['建成年份'] < 2015), '楼龄标签'] = '6-10年楼龄'
    data_temp.loc[(data_temp['建成年份'] >= 2005) & (data_temp['建成年份'] < 2010), '楼龄标签'] = '10-15年楼龄'
    data_temp.loc[(data_temp['建成年份'] >= 2000) & (data_temp['建成年份'] < 2005), '楼龄标签'] = '15-20年楼龄'
    data_temp.loc[data_temp['建成年份'] < 2000, '楼龄标签'] = '老旧小区'

    data_temp.loc[data_temp['开发商'].isin(developers), '百强开发商标签'] = '百强开发商'
    data_temp.loc[data_temp['物业公司'].isin(propertys), '百强物业标签'] = '百强物业'

    data_temp.loc[data_temp['挂牌数量'] >= data_temp['挂牌数量'].quantile(0.85), '活跃度标签'] = '挂盘活跃'
    data_temp.loc[data_temp['挂牌数量'] <= data_temp['挂牌数量'].quantile(0.15), '活跃度标签'] = '挂盘不活跃'

    data_temp.loc[data_temp['购物中心'] < 1000, '购物中心标签'] = '近购物中心'
    data_temp.loc[data_temp['三级甲等医院'] < 1000, '三级甲等医院标签'] = '近三甲医院'
    data_temp.loc[data_temp['政府机关'] < 1000, '政府机关标签'] = '近政府机关'
    data_temp.loc[data_temp['火车站'] < 1000, '火车站标签'] = '近火车站'
    data_temp.loc[data_temp['景区'] < 1000, '景区标签'] = '近景区'
    # data_temp.loc[data_temp['公园'] < 1000, '公园标签'] = '近公园'
    # data_temp.loc[data_temp['地铁站'] < 1000, '地铁标签'] = '近地铁站'
    data_temp.loc[data_temp['飞机场'] < 1000, '机场标签'] = '距离机场过近'
    data_temp.loc[data_temp['丧葬设施'] > 5, '丧葬设施标签'] = '距离丧葬设施过近'
    data_temp.loc[data_temp['工厂'] > 3, '工厂标签'] = '距离工厂过近'
    data_temp.loc[data_temp['商务写字楼'] >= data_temp['商务写字楼'].quantile(0.85), '商务区标签'] = '商务区'

    # Concatenate the individual labels into one display label
    show_cols = data_temp.columns[data_temp.columns.str.contains('标签')]
    for idx, row in data_temp.iterrows():
        labels = []
        for col in show_cols:
            if pd.isna(row[col]):
                pass
            else:
                labels.append(row[col])
        data_temp.loc[idx, '展示标签'] = ','.join(labels)

    return data_temp


# Train an XGBoost price model and map the ranking onto a 5-10 composite score
def train(data):
    use_cols = ['容积率',
                '绿化率',
                '占地面积',
                '建成年份',
                '总楼层',
                '城市中心',
                '抗跌率',
                '购物中心',
                '三级甲等医院',
                '高等院校',
                '商务写字楼',
                '公交车站相关',
                '挂牌数量',
                '路口名',
                '商场',
                '工厂',
                '政府机关',
                '丧葬设施',
                '体育馆',
                '公园',
                '地铁站',
                '小学',
                '幼儿园',
                '火车站',
                '小区均价']

    train_data = data[use_cols]
    X = train_data[train_data['小区均价'].notna()].iloc[:, :-1]
    y = train_data[train_data['小区均价'].notna()].iloc[:, -1]
    model = XGBRegressor(n_jobs=-1)
    param_grid = {'max_depth': np.arange(2, 8, 1),
                  'gamma': np.arange(0.5, 0.8, 0.1),
                  'colsample_bytree': np.arange(0.5, 0.8, 0.1)}
    gs = GridSearchCV(model, param_grid=param_grid, n_jobs=-1, verbose=1, cv=5,
                      scoring='neg_mean_absolute_error')
    gs.fit(X, y)
    print(gs.best_params_)
    print(gs.best_score_)
    # Predict for every community, including those without a listed average price
    X_ = train_data.iloc[:, :-1]
    y_ = gs.predict(X_)
    # NOTE: the original sorts by '综合评分' without defining it first; the model
    # prediction is assumed here to be the intended composite score.
    data['综合评分'] = y_
    data.sort_values('综合评分', inplace=True)
    # Map the ranking onto an evenly spaced 5-10 score
    data['综合评分'] = np.linspace(5, 10, data.shape[0])
    return data


if __name__ == '__main__':

    print('城市: ', city)
    start_time1 = time.time()
    # Load POI data
    poi_data = get_poi_data(path_poi)
    # Load the community list
    read_community_data = get_community_data(path_community)
    read_community_data.dropna(subset=['高德经度'], inplace=True)

    start_time = time.time()
    read_community_data = get_turth_distance(poi_data, read_community_data)
    read_community_data = get_turth_number(poi_data, read_community_data)
    print('poi数据运行时间:', time.time() - start_time)

    # Case / listing statistics
    start_time = time.time()
    case_value = CaseValue(city)
    result = case_value.result
    read_community_data = pd.merge(read_community_data, result, on='楼盘ID', how='left')
    print('案例数据运行时间:', time.time() - start_time)

    # Generate labels
    start_time = time.time()
    read_community_data = get_label(read_community_data)
    print('出标签时间:', time.time() - start_time)

    # Fill missing values in the community list
    start_time = time.time()
    read_community_data = community_data(read_community_data)
    print('全部运行时间:', time.time() - start_time1)

    # Score the raw values
    read_community_data = value2score(read_community_data)
    read_community_data.to_excel('训练数据_%s.xlsx' % city)

    # Train the model and write the final profile
    read_community_data = train(read_community_data)
    read_community_data.to_excel('小区画像_%s.xlsx' % city, index=False)
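A toy illustration (made-up predicted prices) of the final step in train: rows are sorted by the assumed composite score and the rank order is mapped onto an even 5-10 scale, so only the ordering matters, not the price gaps.

# Standalone toy example -- run separately.
import numpy as np
import pandas as pd

toy = pd.DataFrame({'楼盘名称': ['A', 'B', 'C', 'D'],
                    '综合评分': [52000, 38000, 61000, 45000]})   # made-up predicted prices
toy.sort_values('综合评分', inplace=True)
toy['综合评分'] = np.linspace(5, 10, toy.shape[0])
print(toy)   # B gets 5.0, D ~6.67, A ~8.33, C gets 10.0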
# ---------------------------------------------------------------------------
# casevalue.py -- imported above via `from casevalue import CaseValue`
# ---------------------------------------------------------------------------
import pandas as pd
import numpy as np
from tqdm import tqdm
import os


class CaseValue():

    def __init__(self, city_name):
        self.city_name = city_name
        self.df = self.read_and_process_data()
        self.data = self.get_info(self.df)
        self.result = self.match_c()
        self.result = self.fill_nan(self.result)

    def read_and_process_data(self):
        year, month = 2019, 1
        cols_dict = \
            {'bazhuayu_case.case_type': '案例类型',
             'bazhuayu_case.case_source': '案例来源',
             'bazhuayu_case.district': '区域',
             'bazhuayu_case.community_sources': '案例源小区名称',
             'bazhuayu_case.flr_total_ind': '总楼层',
             'bazhuayu_case.checked_time': '看房次数',
             'bazhuayu_case.attention_time': '关注量',
             'bazhuayu_case.browse_time': '浏览次数',
             'bazhuayu_case.list_time': '挂牌时间',
             'bazhuayu_case.transaction_time': '成交时间',
             'bazhuayu_case.list_totalprice': '挂牌总价(万元)',
             'bazhuayu_case.list_unitprice': '挂牌单价(元/㎡)',
             'bazhuayu_case.transaction_price': '成交总价(万元)',
             'bazhuayu_case.transaction_avg_price': '成交单价(元/㎡)',
             'bazhuayu_case.community_price': '案例源小区均价(元/㎡)',
             'bazhuayu_case.transaction_cycle': '成交周期',
             'bazhuayu_case.price_adjustment_times': '调价次数',
             'bazhuayu_case.longitude': '经度',
             'bazhuayu_case.latitude': '纬度'}

        # Read the raw case data
        txt_name = os.listdir('城市八爪鱼数据/%s' % self.city_name)[0]
        df = pd.read_csv('城市八爪鱼数据/%s/%s' % (self.city_name, txt_name), engine='python', encoding='utf-8',
                         sep='$', error_bad_lines=False, usecols=cols_dict.keys())
        df.columns = df.columns.map(cols_dict)

        # Strip commas from the community names
        df['案例源小区名称'] = df['案例源小区名称'].astype(str)
        df['案例源小区名称'] = df['案例源小区名称'].str.replace(',', '')

        # Map the scraped community names to community IDs
        df_community = pd.read_excel('映射表/%s.xlsx' % self.city_name)
        df_community['映射字段'] = df_community['district'] + df_community['community_sources']

        d = dict(zip(df_community['映射字段'], df_community['man_community_id']))
        df['映射字段'] = df['区域'] + df['案例源小区名称']
        df['匹配ID'] = df['映射字段'].map(d)
        n = df.shape[0]
        n_nan = df[df['匹配ID'].isna()].shape[0]
        print('总数据量:%s, 可映射数据%s, 不可映射数据%s' % (n, n - n_nan, n_nan))
        df = df[df['匹配ID'].notna()]

        def func(x):
            # Prefer the listing time, fall back to the transaction time
            if pd.isna(x['挂牌时间']):
                if pd.isna(x['成交时间']):
                    return np.nan
                else:
                    return x['成交时间']
            else:
                return x['挂牌时间']

        def func2(x):
            # Prefer the transaction unit price; otherwise discount the listing price by 5%
            if pd.isna(x['成交单价(元/㎡)']):
                return x['挂牌单价(元/㎡)'] * 0.95
            else:
                return x['成交单价(元/㎡)']

        df['时间'] = df.apply(func, axis=1)
        df['时间'] = df['时间'].map(lambda x: np.nan if len(str(x)) > 10 else x)
        df['时间'] = pd.to_datetime(df['时间'], errors='coerce')
        df = df[df['时间'].notna()]
        df = df[df['时间'] > pd.to_datetime('%s-%s-01' % (year, month))]
        df['时间'] = df['时间'].map(lambda x: (x.year, x.month))

        cols = ['案例源小区均价(元/㎡)', '挂牌总价(万元)', '挂牌单价(元/㎡)', '成交总价(万元)',
                '成交单价(元/㎡)', '总楼层']
        for col in cols:
            df[col] = pd.to_numeric(df[col], errors='coerce')
        df['价格'] = df[['挂牌单价(元/㎡)', '成交单价(元/㎡)']].apply(func2, axis=1)
        df = df[['匹配ID', '案例源小区均价(元/㎡)', '挂牌总价(万元)', '挂牌单价(元/㎡)', '成交总价(万元)',
                 '成交单价(元/㎡)', '总楼层', '时间', '价格']]
        df.drop_duplicates(subset=['匹配ID', '价格'], inplace=True)
        return df

    def get_info(self, df):
        data = []
        for c_id, group in tqdm(df.groupby(['匹配ID']), desc='挂牌数据计算中'):

            n_chengjiao = group[group['成交单价(元/㎡)'].notna()].shape[0]   # transactions
            n_guapai = group[group['挂牌单价(元/㎡)'].notna()].shape[0]      # listings

            n_floor = group['总楼层'].median()
            avg_price = group['价格'].median()
            case_price = group['案例源小区均价(元/㎡)'].median()
            if pd.isna(case_price):
                price = avg_price
            else:
                price = case_price
            # 抗跌率: mean month-over-month relative price change
            a = group.groupby('时间').mean()
            rate = (a['价格'].diff() / a['价格']).mean()
            data.append([c_id, n_chengjiao, n_guapai, n_floor, price, rate])
        data = pd.DataFrame(data, columns=['楼盘ID', '成交数量', '挂牌数量', '总楼层', '小区均价', '抗跌率'])
        return data
    def match_c(self):
        # Merge the case statistics onto the community list
        c_list = pd.read_excel('小区清单/小区清单_高德地址_%s.xlsx' % self.city_name)
        result = pd.merge(c_list[['楼盘ID']], self.data,
                          on='楼盘ID', how='left')
        return result

    def fill_nan(self, result):
        result['成交数量'].fillna(0, inplace=True)
        result['挂牌数量'].fillna(0, inplace=True)
        result['总楼层'].fillna(result['总楼层'].quantile(0.4), inplace=True)
        result['抗跌率'].fillna(result['抗跌率'].quantile(0.4), inplace=True)
        return result


if __name__ == '__main__':
    model = CaseValue('深圳市')
    result = model.result
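As a hedged illustration of the 抗跌率 statistic computed in CaseValue.get_info (toy monthly prices, not real data): prices are averaged per (year, month), and the statistic is the mean of the month-over-month price difference divided by the current month's price, so positive values indicate rising prices.

# Standalone toy example of the 抗跌率 calculation -- run separately.
import pandas as pd

monthly = pd.DataFrame({'时间': [(2019, 1), (2019, 2), (2019, 3)],
                        '价格': [50000.0, 51000.0, 49500.0]})   # made-up prices
a = monthly.groupby('时间').mean()
rate = (a['价格'].diff() / a['价格']).mean()
print(rate)   # about -0.005 here: prices rose slightly, then fell a bit more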
