# 2020-01-19-linzilu248

# -*- coding: utf-8 -*-
"""
Created on Thu Jan  9 11:36:00 2020

@author: QIAOQICHAO258
"""


import pandas as pd
import numpy as np
import time
from tqdm import tqdm
import codecs
import csv
import os
from math import radians, cos, sin, asin, sqrt
import math
from xgboost import XGBRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import minmax_scale
from sklearn.metrics import mean_absolute_error

# Maps the raw ``bazhuayu_case.*`` export columns to the Chinese column names
# used by the rest of the pipeline.  The key order doubles as the ``usecols``
# list when the case file is read.
cols_dict = {
    'bazhuayu_case.case_type': '案例类型',
    'bazhuayu_case.case_source': '案例来源',
    'bazhuayu_case.district': '区域',
    'bazhuayu_case.community_sources': '案例源小区名称',
    'bazhuayu_case.flr_total_ind': '总楼层',
    'bazhuayu_case.checked_time': '看房次数',
    'bazhuayu_case.attention_time': '关注量',
    'bazhuayu_case.browse_time': '浏览次数',
    'bazhuayu_case.list_time': '挂牌时间',
    'bazhuayu_case.transaction_time': '成交时间',
    'bazhuayu_case.list_totalprice': '挂牌总价(万元)',
    'bazhuayu_case.list_unitprice': '挂牌单价(元/㎡)',
    'bazhuayu_case.transaction_price': '成交总价(万元)',
    'bazhuayu_case.transaction_avg_price': '成交单价(元/㎡)',
    'bazhuayu_case.community_price': '案例源小区均价(元/㎡)',
    'bazhuayu_case.transaction_cycle': '成交周期',
    'bazhuayu_case.price_adjustment_times': '调价次数',
    'bazhuayu_case.longitude': '经度',
    'bazhuayu_case.latitude': '纬度',
}

pi = math.pi
# Earth radius in kilometres (distance helpers multiply by 1000 to get metres).
EARTH_REDIUS = 6378.137
# Longitude coefficient: degrees of longitude corresponding to 1 metre.
LONGITUDE_COEF = 0.000011
# Latitude coefficient: degrees of latitude corresponding to 1 metre.
LATITUDE_COEF = 0.000009


######################################  Load data ###################################################
# All input files are read at import time; every path below is relative to the
# current working directory.

# City to process
city = '深圳市'
# POI data file (one CSV per city)
path_poi = '高德POI/%s.csv' %city
with open(path_poi, 'r', encoding='utf-8')as f:
    poi = pd.read_csv(f)
# Community (housing estate) list file, restricted to the selected city
path_community = '小区清单/八爪鱼小区清单20191230.csv'
community = pd.read_csv(path_community, engine='python',encoding='utf-8')
community = community[community['城市'] == city]
# Top-100 developers and top-100 property-management companies
# NOTE(review): `property` shadows the Python builtin of the same name; it is
# read by get_label(), so renaming it must be done in both places.
developer = pd.read_excel('百强物业开发商/百强开发商.xlsx')
property = pd.read_excel('百强物业开发商/百强物业.xlsx')
# Mapping table (crawled community names -> community IDs) for the city
yingshe = pd.read_excel('映射表/%s.xlsx'%city)
# Crawled listing cases: the first file found in the city's directory is used
txt_name = os.listdir('城市八爪鱼数据/%s' % city)[0]
with open('城市八爪鱼数据/%s/%s' % (city, txt_name),'r',encoding='utf-8')as f:
    # '$'-separated file; malformed lines are skipped.
    # NOTE(review): `error_bad_lines` was deprecated in pandas 1.3 and removed
    # in 2.0 (replaced by `on_bad_lines='skip'`) — confirm the pandas version.
    case_data = pd.read_csv(f,  sep='$', error_bad_lines=False,
                            usecols=cols_dict.keys(), low_memory=True)
# Judicial-auction data
# NOTE(review): the file name lists other cities than `city` — confirm this is
# the intended file for the selected city.
fapai = pd.read_csv('法拍数据/北京重庆大连哈尔滨_京东阿里法拍.csv', engine='python', encoding='utf8')
####################################################################################################

def mape(estimatory, X, y):
    """Return the mean absolute percentage error of ``estimatory`` on ``(X, y)``.

    The signature ``(estimator, X, y)`` matches what scikit-learn accepts as a
    callable scorer.  ``y`` must not contain zeros, since each absolute error
    is divided by the true value.
    """
    predicted = estimatory.predict(X)
    relative_error = np.abs(y - predicted) / y
    return relative_error.mean()

def rad(d):
    """Convert an angle ``d`` from degrees to radians using the module-level ``pi``."""
    scaled = d * pi
    return scaled / 180.0

# Distance between two latitude/longitude points
def getDistance1(lat1, lng1, lat2, lng2):
    """Return the haversine distance in metres between two points.

    Note the argument order is (latitude, longitude) for each point, the
    opposite of ``getDistance``.  All angles are given in decimal degrees.
    """
    lat1_rad = rad(lat1)
    lat2_rad = rad(lat2)
    lat_delta = lat1_rad - lat2_rad
    lng_delta = rad(lng1) - rad(lng2)
    haversine = (math.pow(math.sin(lat_delta/2), 2)
                 + math.cos(lat1_rad) * math.cos(lat2_rad) * math.pow(math.sin(lng_delta/2), 2))
    arc = 2 * math.asin(math.sqrt(haversine))
    # EARTH_REDIUS is in kilometres; scale to metres.
    return arc * EARTH_REDIUS * 1000

def getDistance(lon1, lat1, lon2, lat2):
    """Return the haversine distance in metres between two points.

    Arguments are (longitude, latitude) pairs in decimal degrees.
    """
    # Decimal degrees -> radians
    lon1_r, lat1_r, lon2_r, lat2_r = [radians(v) for v in (lon1, lat1, lon2, lat2)]

    # Haversine formula
    delta_lon = lon2_r - lon1_r
    delta_lat = lat2_r - lat1_r
    hav = sin(delta_lat/2)**2 + cos(lat1_r) * cos(lat2_r) * sin(delta_lon/2)**2
    central_angle = 2 * asin(sqrt(hav))

    earth_radius_km = 6378.137  # Earth radius in kilometres
    return central_angle * earth_radius_km * 1000


def select_facter_poi(temp_comm_longitude, temp_comm_latitude, poi_data, len_coef):
    """Return the POIs inside a square bounding box centred on a community.

    This is a cheap pre-filter applied before the exact distance calculation.

    Parameters
    ----------
    temp_comm_longitude, temp_comm_latitude : float
        Community coordinates in decimal degrees.
    poi_data : pandas.DataFrame
        POI table with numeric ``longitude`` and ``latitude`` columns.
    len_coef : float
        Half side length of the box in metres.

    Returns
    -------
    pandas.DataFrame
        Rows of ``poi_data`` strictly inside the box (both bounds exclusive).
    """
    # Convert the half side length from metres to degrees on each axis.  The
    # latitude margin uses LATITUDE_COEF; the previous code applied
    # LONGITUDE_COEF to both axes.
    lon_margin = LONGITUDE_COEF * len_coef
    lat_margin = LATITUDE_COEF * len_coef

    lon = poi_data['longitude']
    lat = poi_data['latitude']
    inside_box = (
        (lon < temp_comm_longitude + lon_margin)
        & (lon > temp_comm_longitude - lon_margin)
        & (lat < temp_comm_latitude + lat_margin)
        & (lat > temp_comm_latitude - lat_margin)
    )
    return poi_data[inside_box]


# Nearest-POI distance for every POI category (gymnasium, kindergarten, ...)
def get_turth_distance(poi_data_all, read_community_data):
    """Add, per POI category, the distance in metres to the nearest POI.

    For every category in ``dict_temp`` a column named after the category is
    written into ``read_community_data`` (which is modified in place and also
    returned).  A POI belongs to a category when its ``sub_category`` equals
    one of the listed strings; for '政府机关' the POI name must additionally
    end with '人民政府'.

    Parameters
    ----------
    poi_data_all : pandas.DataFrame
        POI table with ``sub_category``, ``name``, ``longitude`` and
        ``latitude`` columns.
    read_community_data : pandas.DataFrame
        Community table with '高德经度' / '高德纬度' coordinate columns.

    Returns
    -------
    pandas.DataFrame
        ``read_community_data`` with one distance column per category.
    """
    dict_temp = {
            '体育馆':["['综合体育馆']"],
            '幼儿园':["['幼儿园']"],
            '小学':["['小学']","['中学', '小学']", "['小学', '中学']"],
            '购物中心':["['购物中心']"],
            '三级甲等医院':["['三级甲等医院']"],
            '政府机关':["['区县级政府及事业单位']","['省直辖市级政府及事业单位']"],
            '火车站':["['火车站']"],
            '景区':["['国家级景点']", "['省级景点']"],
            '公园':["['公园']"],
            '高等院校':["['高等院校']"],
            '地铁站': ["['地铁站']"],
            '飞机场':["['飞机场']"],
            '城市中心':["['城市中心']"],
            '城市广场':["['城市广场']"],
    }

    # Community coordinates do not change between categories: convert once.
    community_coords = [
        (float(lon), float(lat))
        for lon, lat in read_community_data[['高德经度', '高德纬度']].values
    ]

    for key, value in dict_temp.items():
        # The category lists in dict_temp are the single source of truth for
        # the sub_category filter (no second hard-coded copy to keep in sync).
        category_mask = poi_data_all['sub_category'].isin(value)
        if key == '政府机关':
            # Only actual government seats; POIs without a name never match.
            category_mask = category_mask & poi_data_all['name'].str.endswith('人民政府', na=False)
        poi_data = poi_data_all[category_mask]
        print(value, 'poi数据:', len(poi_data), '小区清单列表:', len(read_community_data))

        # POI coordinates are invariant across communities: extract them once
        # per category instead of once per community.
        poi_coords = list(zip(poi_data['longitude'].astype(float),
                              poi_data['latitude'].astype(float)))

        list_distance_min = []
        for comm_lon, comm_lat in tqdm(community_coords):
            # NOTE(review): a category with no POI yields 0, which downstream
            # code cannot tell apart from "right next to one" — confirm that
            # this fallback is intended.
            nearest = min(
                (getDistance(comm_lon, comm_lat, poi_lon, poi_lat)
                 for poi_lon, poi_lat in poi_coords),
                default=0,
            )
            list_distance_min.append(nearest)
        read_community_data[key] = list_distance_min
    return read_community_data


# Number of POIs within 1 km, per POI category
def get_turth_number(poi_data_all, read_community_data):
    """Add, per POI category, the number of POIs within 1000 m of each community.

    One column per key of ``dict_temp`` is written into
    ``read_community_data``, which is modified in place and returned.
    Candidates are first narrowed with ``select_facter_poi`` (2000 m box) and
    then checked with the exact ``getDistance``.
    """
    dict_temp = {

            '商务写字楼':["['商务写字楼']"],
            '丧葬设施':["['丧葬设施']"],
            '商场':["['商场']"],
            '路口名':["['路口名']"],
            '公交车站相关':["['公交车站相关']"],
            '工厂':["['工厂']"],
    }
    poi_columns = ['mid_category', 'sub_category', 'longitude', 'latitude', 'name']
    community_rows = read_community_data[['省份', '楼盘名称', '高德经度', '高德纬度']].values

    for key, value in dict_temp.items():
        print(value)
        category_poi = poi_data_all[poi_data_all['sub_category'] == value[0]]
        print(value, 'poi数据:', len(category_poi), '小区清单列表:', len(read_community_data))

        list_number = []
        # Walk the community list
        for community_row in tqdm(community_rows):
            comm_lon = float(community_row[2])
            comm_lat = float(community_row[3])
            # Coarse bounding-box filter before the exact distance test
            nearby = select_facter_poi(comm_lon, comm_lat, category_poi, 2000)
            within_1km = sum(
                1
                for poi_row in nearby[poi_columns].values
                if getDistance(comm_lon, comm_lat, float(poi_row[2]), float(poi_row[3])) <= 1000
            )
            list_number.append(within_1km)
        read_community_data['%s' % (key)] = list_number
    return read_community_data


class CaseValue():
    """Aggregate crawled listing/transaction cases into per-community statistics.

    Construction runs the whole pipeline: clean the module-level ``case_data``
    (``read_and_process_data``), aggregate it per community (``get_info``),
    left-join it onto the module-level ``community`` list (``match_c``) and
    fill the gaps (``fill_nan``).  The final table is available as ``result``.
    """

    # Raw export column name -> working column name.
    COLUMN_NAMES = {
        'bazhuayu_case.case_type': '案例类型',
        'bazhuayu_case.case_source': '案例来源',
        'bazhuayu_case.district': '区域',
        'bazhuayu_case.community_sources': '案例源小区名称',
        'bazhuayu_case.flr_total_ind': '总楼层',
        'bazhuayu_case.checked_time': '看房次数',
        'bazhuayu_case.attention_time': '关注量',
        'bazhuayu_case.browse_time': '浏览次数',
        'bazhuayu_case.list_time': '挂牌时间',
        'bazhuayu_case.transaction_time': '成交时间',
        'bazhuayu_case.list_totalprice': '挂牌总价(万元)',
        'bazhuayu_case.list_unitprice': '挂牌单价(元/㎡)',
        'bazhuayu_case.transaction_price': '成交总价(万元)',
        'bazhuayu_case.transaction_avg_price': '成交单价(元/㎡)',
        'bazhuayu_case.community_price': '案例源小区均价(元/㎡)',
        'bazhuayu_case.transaction_cycle': '成交周期',
        'bazhuayu_case.price_adjustment_times': '调价次数',
        'bazhuayu_case.longitude': '经度',
        'bazhuayu_case.latitude': '纬度',
    }
    # Only cases dated strictly after the first day of this month are kept.
    START_YEAR = 2019
    START_MONTH = 1
    # Community prices below this value (yuan per square metre) are treated as missing.
    MIN_VALID_PRICE = 1000

    def __init__(self, city_name):
        # city_name is stored for reference; the data itself comes from the
        # module-level frames loaded for the configured city.
        self.city_name = city_name
        self.df = self.read_and_process_data()
        self.data = self.get_info(self.df)
        self.result = self.match_c()
        self.result = self.fill_nan(self.result)

    def read_and_process_data(self):
        """Clean the raw cases and return one row per (community ID, price).

        Returns a DataFrame with the community ID ('匹配ID'), the numeric
        price columns, the total floor count, the (year, month) of the case
        ('时间') and the effective unit price ('价格').
        """
        df = case_data.copy()
        df.columns = df.columns.map(self.COLUMN_NAMES)

        # Strip commas from community names
        df['案例源小区名称'] = df['案例源小区名称'].astype(str).str.replace(',', '', regex=False)

        # Map (district + crawled community name) to the community ID.  The
        # lookup key is built locally so the shared mapping table is left
        # untouched.
        mapping_key = yingshe['district'] + yingshe['community_sources']
        id_by_key = dict(zip(mapping_key, yingshe['man_community_id']))
        df['映射字段'] = df['区域'] + df['案例源小区名称']
        df['匹配ID'] = df['映射字段'].map(id_by_key)
        n = df.shape[0]
        n_nan = int(df['匹配ID'].isna().sum())
        print('总数据量:%s, 可映射数据%s, 不可映射数据%s'%(n, n-n_nan, n_nan))
        # .copy() so the column assignments below act on an independent frame.
        df = df[df['匹配ID'].notna()].copy()

        # Case date: listing date when present, otherwise transaction date.
        case_date = df['挂牌时间'].fillna(df['成交时间'])
        # Values longer than 10 characters are discarded before parsing
        # (presumably anything that is not a bare YYYY-MM-DD date — confirm).
        case_date = case_date.map(lambda x: np.nan if len(str(x)) > 10 else x)
        df['时间'] = pd.to_datetime(case_date, errors='coerce')
        df = df[df['时间'].notna()]
        cutoff = pd.to_datetime('%s-%s-01' % (self.START_YEAR, self.START_MONTH))
        df = df[df['时间'] > cutoff].copy()
        df['时间'] = df['时间'].map(lambda x: (x.year, x.month))

        cols = ['案例源小区均价(元/㎡)', '挂牌总价(万元)', '挂牌单价(元/㎡)', '成交总价(万元)',
                '成交单价(元/㎡)', '总楼层']
        for col in cols:
            df[col] = pd.to_numeric(df[col], errors='coerce')
        # Effective price: transaction unit price when known, otherwise the
        # listing unit price discounted by 5%.
        df['价格'] = df['成交单价(元/㎡)'].fillna(df['挂牌单价(元/㎡)'] * 0.95)
        df = df[['匹配ID', '案例源小区均价(元/㎡)', '挂牌总价(万元)', '挂牌单价(元/㎡)', '成交总价(万元)',
                '成交单价(元/㎡)', '总楼层','时间','价格']]
        return df.drop_duplicates(subset=['匹配ID', '价格'])

    def get_info(self, df):
        """Aggregate the cleaned cases into one row per community.

        Returns a DataFrame with columns 楼盘ID, 成交数量 (cases with a
        transaction price), 挂牌数量 (cases with a listing price), 总楼层
        (median), 小区均价 and 抗跌率 (mean month-over-month relative change
        of the monthly mean price).
        """
        data = []
        # Group by the column name, not a one-element list: with a list,
        # pandas >= 2 yields 1-tuples as keys, which would never match the
        # scalar IDs in the later merge.
        for c_id, group in tqdm(df.groupby('匹配ID'), desc='挂牌数据计算中'):

            n_chengjiao = int(group['成交单价(元/㎡)'].notna().sum())
            n_guapai = int(group['挂牌单价(元/㎡)'].notna().sum())

            n_floor = group['总楼层'].median()
            avg_price = group['价格'].median()
            case_price = group['案例源小区均价(元/㎡)'].median()
            # Prefer the community average reported by the source; fall back
            # to the median case price.  Implausibly low values are dropped.
            price = avg_price if pd.isna(case_price) else case_price
            if price < self.MIN_VALID_PRICE:
                price = np.nan

            # Restrict the monthly mean to the price column so non-numeric
            # columns cannot break the aggregation.
            monthly_price = group.groupby('时间')['价格'].mean()
            rate = (monthly_price.diff() / monthly_price).mean()
            data.append([c_id, n_chengjiao, n_guapai, n_floor, price, rate])
        data = pd.DataFrame(data, columns=['楼盘ID','成交数量','挂牌数量','总楼层','小区均价','抗跌率'])
        return data

    def match_c(self):
        """Left-join the per-community statistics onto the community list."""
        result = pd.merge(community[['楼盘ID']], self.data,
                         on='楼盘ID', how='left')
        return result

    def fill_nan(self, result):
        """Fill missing statistics in ``result`` (modified in place and returned).

        Counts default to 0; total floors and 抗跌率 default to their own
        40th percentile.
        """
        # Assign the filled column back instead of calling fillna(inplace=True)
        # on a column selection, which is not guaranteed to update the frame.
        result['成交数量'] = result['成交数量'].fillna(0)
        result['挂牌数量'] = result['挂牌数量'].fillna(0)
        result['总楼层'] = result['总楼层'].fillna(result['总楼层'].quantile(0.4))
        result['抗跌率'] = result['抗跌率'].fillna(result['抗跌率'].quantile(0.4))
        return result


def _most_common(series):
    """Return the most frequent non-null value of ``series``, or None if it has none."""
    modes = series.mode()
    return modes.iloc[0] if len(modes) else None


# Fill missing values in the community table
def community_data(df):
    """Fill missing community attributes and add the parking ratio '车位比'.

    Steps:
    1. missing house types ('房屋类型(小类)') get the most common type;
    2. missing '绿化率' / '容积率' get the mean of their house-type group;
    3. remaining gaps get column-wide defaults (mean, most common value, or 0
       for parking spaces);
    4. '车位比' = (above-ground + underground parking spaces) / households.

    Steps 1-2 and the numeric coercion modify ``df`` in place; the returned
    frame is a new DataFrame that also contains step 3 and '车位比'.
    """
    gp_col = '房屋类型(小类)'

    # Use the mode *value* (not the string form of the whole mode Series) and
    # keep the filled column, so that the group-wise fill below sees no
    # missing house type.
    housing_type_fill = _most_common(df[gp_col])
    if housing_type_fill is not None:
        df[gp_col] = df[gp_col].fillna(housing_type_fill)

    # Fill greening rate and plot ratio with the mean of the house-type group.
    # Groups whose mean is undefined stay missing and are handled by the
    # column-wide defaults below.
    for col in ('绿化率', '容积率'):
        if col in df.columns:
            df[col] = df[col].fillna(df.groupby(gp_col)[col].transform('mean'))

    # Fill the remaining columns
    for col in ('占地面积', '地上车位', '地下车位'):
        df[col] = pd.to_numeric(df[col], errors='coerce')

    fill_values = {
        '占地面积': df['占地面积'].mean(),
        '总户数': df['总户数'].mean(),
        '绿化率': df['绿化率'].mean(),
        '容积率': df['容积率'].mean(),
        '地上车位': 0,
        '地下车位': 0,
    }
    build_date_fill = _most_common(df['建成年份'])
    if build_date_fill is not None:
        fill_values['建成年份'] = int(build_date_fill)
    manage_type_fill = _most_common(df['管理形式'])
    if manage_type_fill is not None:
        fill_values['管理形式'] = manage_type_fill

    column_list = df.fillna(fill_values)
    column_list['车位比'] = (column_list['地上车位'] + column_list['地下车位']) / column_list['总户数']
    return column_list


def value2score(data):
    """Convert raw indicator columns into 1-5 scores and sum them per dimension.

    For every column of ``data`` listed in ``usecols`` a '<column>分数' column
    is created from the column's own quantiles, and the score is added to the
    column named after the indicator's dimension (e.g. '楼盘品质').  ``data``
    is modified in place and returned.

    Score bands (``t`` is the direction flag from ``usecols``):

        value >= q75          -> 5 - 4t
        q50 <= value < q75    -> 4 - 2t
        q30 <= value < q50    -> 3
        q15 <= value < q30    -> 2 + 2t
        value < q15           -> 1 + 4t

    Missing values receive a missing score.
    """
    # 1: smaller value -> higher score   0: larger value -> higher score
    # NOTE(review): no block in this file produces '加油站' or '机场' (the
    # airport distance column is named '飞机场'), so unless the input already
    # carries them these two entries are never scored — confirm the intent.
    usecols = {
        '地铁站': (1, '生活配套'),
        '建成年份': (0, '楼盘品质'),
        '绿化率': (0, '楼盘品质'),
        '容积率': (0, '宜居程度'),
        '体育馆': (1, '宜居程度'),
        '幼儿园': (1, '宜居程度'),
        '总户数': (0, '楼盘品质'),
        '占地面积': (0, '楼盘品质'),
        '小学': (1, '生活配套'),
        '购物中心': (1, '宜居程度'),
        '三级甲等医院': (1, '生活配套'),
        '政府机关': (0, '生活配套'),
        '景区': (1, '宜居程度'),
        '公园': (1, '宜居程度'),
        '高等院校': (1, '区位状况'),
        '城市中心': (1, '区位状况'),
        '城市广场': (0, '区位状况'),
        '商务写字楼': (0, '区位状况'),
        '丧葬设施': (1, '不利因素'),
        '商场': (0, '生活配套'),
        '路口名': (0, '区位状况'),
        '公交车站相关': (0, '区位状况'),
        '工厂': (0, '不利因素'),
        '成交数量': (0, '活跃程度'),
        '挂牌数量': (0, '活跃程度'),
        '总楼层': (0, '楼盘品质'),
        '抗跌率': (0, '活跃程度'),
        '车位比': (0, '楼盘品质'),
        '加油站': (0, '不利因素'),
        '火车站': (0,'不利因素'),
        '机场': (0, '不利因素'),
        '行政区分数':(0, '区位状况'),

    }
    data['建成年份'] = pd.to_numeric(data['建成年份'], errors='coerce')
    # Assign back rather than fillna(inplace=True) on a column selection,
    # which is not guaranteed to update the frame.
    data['建成年份'] = data['建成年份'].fillna(data['建成年份'].quantile(0.5))

    # District score: mean community price of the district (0 when unknown)
    district_value = data[['小区均价', '行政区']].groupby('行政区').mean().fillna(0)
    dv_dict = dict(zip(district_value.index, district_value['小区均价']))
    data['行政区分数'] = data['行政区'].map(dv_dict)

    # Snapshot the column names: the loop adds columns to ``data``.
    # (DataFrame.iteritems() no longer exists in pandas >= 2.)
    for col_name in list(data.columns):
        if col_name not in usecols:
            continue
        t, cat = usecols[col_name]
        score_col = col_name + '分数'

        values = data[col_name]
        q15, q30, q50, q75 = [values.quantile(q) for q in (0.15, 0.30, 0.50, 0.75)]
        # np.select takes the first matching condition, so each test only
        # needs the lower bound of its band.
        data[score_col] = np.select(
            [values >= q75, values >= q50, values >= q30, values >= q15, values < q15],
            [t*-4 + 5, t*-2 + 4, 3, t*2 + 2, t*4 + 1],
            default=np.nan,
        )

        # Accumulate the indicator score into its dimension total.
        if cat in data.columns:
            data[cat] = pd.to_numeric(data[cat], errors='coerce') + data[score_col]
        else:
            data[cat] = data[score_col]
    return data


def get_label(data):
    """Attach descriptive label columns and a combined '展示标签' column.

    Every '<something>标签' column holds a label text for the rows that
    qualify and is missing elsewhere.  '展示标签' is the comma-joined list of
    a row's labels, in label-column order.  Uses the module-level
    ``developer``, ``property`` and ``fapai`` tables.  ``data`` is modified in
    place and returned.
    """
    developers = developer['公司名称'].values
    propertys = property['公司名称'].values

    def tag(mask, label_col, text):
        # Write ``text`` into ``label_col`` for the rows selected by ``mask``.
        data.loc[mask, label_col] = text

    def tag_extremes(value_col, label_col, high_text, low_text):
        # Label the top 15% and bottom 15% of a numeric column.
        data[value_col] = pd.to_numeric(data[value_col], errors='coerce')
        tag(data[value_col] >= data[value_col].quantile(0.85), label_col, high_text)
        tag(data[value_col] <= data[value_col].quantile(0.15), label_col, low_text)

    # Judicial auctions per community: distinct (ID, title) pairs, counted in
    # one pass instead of rescanning the auction table for every community.
    auction_counts = fapai.drop_duplicates(subset=['匹配ID', 'title']).groupby('匹配ID').size()
    data['法拍数量'] = data['楼盘ID'].map(auction_counts).fillna(0).astype(int)

    auctions = data['法拍数量']
    tag((auctions > 0) & (auctions < 5), '法拍标签', '含法拍')
    tag((auctions >= 5) & (auctions < 10), '法拍标签', '法拍数量多')
    # >= 10 so that a count of exactly 10 is labelled too (it previously fell
    # between the "< 10" and "> 10" bands).
    tag(auctions >= 10, '法拍标签', '法拍数量极多')

    tag_extremes('容积率', '容积率标签', '容积率高', '容积率低')
    tag_extremes('绿化率', '绿化率标签', '绿化率高', '绿化率低')
    tag_extremes('总户数', '小区规模标签', '大型社区', '小型社区')

    # Building age bands by completion year
    data['建成年份'] = pd.to_numeric(data['建成年份'], errors='coerce')
    built = data['建成年份']
    tag(built >= 2015, '楼龄标签', '次新房')
    tag((built >= 2010) & (built < 2015), '楼龄标签', '6-10年楼龄')
    tag((built >= 2005) & (built < 2010), '楼龄标签', '10-15年楼龄')
    tag((built >= 2000) & (built < 2005), '楼龄标签', '15-20年楼龄')
    tag(built < 2000, '楼龄标签', '老旧小区')

    tag(data['开发商'].isin(developers), '百强开发商标签', '百强开发商')
    tag(data['物业公司'].isin(propertys), '百强物业标签', '百强物业')

    # Listing activity: both extremes belong to the activity label column
    # (the low end was previously written to the parking label column).
    listings = data['挂牌数量']
    tag(listings >= listings.quantile(0.85), '活跃度标签', '挂盘活跃')
    tag(listings <= listings.quantile(0.15), '活跃度标签', '挂盘不活跃')

    # Proximity labels: nearest-POI distance below 1000 m
    tag(data['购物中心'] < 1000, '购物中心标签', '近购物中心')
    tag(data['三级甲等医院'] < 1000, '三级甲等医院标签', '近三甲医院')
    tag(data['政府机关'] < 1000, '政府机关标签', '近政府机关')
    tag(data['火车站'] < 1000, '火车站标签', '近火车站')
    tag(data['景区'] < 1000, '景区标签', '近景区')
    tag(data['飞机场'] < 1000, '机场标签', '距离机场过近')
    # Count-based labels: number of POIs within 1 km
    tag(data['丧葬设施'] > 5, '丧葬设施标签', '距离丧葬设施过近')
    tag(data['工厂'] > 3, '工厂标签', '距离工厂过近')
    tag(data['商务写字楼'] >= data['商务写字楼'].quantile(0.85), '商务区标签', '商务区')

    # Combined display label: every non-missing label of the row, comma-joined
    show_cols = data.columns[data.columns.str.contains('标签')]
    data['展示标签'] = [
        ','.join(label for label in row if not pd.isna(label))
        for row in data[show_cols].values
    ]

    return data


def train(data):
    """Fit a price model and store its prediction for every row as '综合评分'.

    An ``XGBRegressor`` is tuned with a 5-fold grid search (negative mean
    absolute error) on the rows whose '小区均价' is known; the best model then
    predicts all rows.  ``data`` is modified in place and returned.
    """
    feature_cols = [
        '容积率', '绿化率', '占地面积', '建成年份', '总楼层', '城市中心',
        '抗跌率', '购物中心', '三级甲等医院', '高等院校', '商务写字楼',
        '公交车站相关', '挂牌数量', '路口名', '商场', '工厂', '政府机关',
        '丧葬设施', '体育馆', '公园', '地铁站', '小学', '幼儿园', '火车站',
    ]
    # The target is kept as the last column so features are "all but the last".
    train_data = data[feature_cols + ['小区均价']]

    labelled = train_data[train_data['小区均价'].notna()]
    X = labelled.iloc[:, :-1]
    y = labelled.iloc[:, -1]

    param_grid = {'max_depth': np.arange(2, 8, 1),
                  'gamma': np.arange(0.5, 0.8, 0.1),
                  'colsample_bytree': np.arange(0.5, 0.8, 0.1)}
    gs = GridSearchCV(XGBRegressor(n_jobs=-1), param_grid=param_grid, n_jobs=-1,
                      verbose=1, cv=5, scoring='neg_mean_absolute_error')
    gs.fit(X, y)
    print(gs.best_params_)
    print(gs.best_score_)

    # Score every community, including those without a known price.
    data['综合评分'] = gs.predict(train_data.iloc[:, :-1])
    return data


def change_poi(poi_data):
    """Split the 'location' column ("lon,lat") into float coordinate columns.

    Adds 'longitude' and 'latitude' to ``poi_data`` (modified in place and
    returned).  Rows whose location is missing or has no second component get
    NaN instead of aborting the whole run.
    """
    parts = poi_data['location'].str.split(',')
    poi_data['longitude'] = parts.str[0].astype(float)
    poi_data['latitude'] = parts.str[1].astype(float)
    return poi_data


def get_level(result):
    """Grade every community and rescale the score columns to 5-10.

    * '综合评级' and '小区价格评级' are quintile grades A (top 20%) to E
      (bottom 20%) of '综合评分' and '小区均价' respectively.
    * A missing price grade is replaced by the overall grade.
    * Each dimension column and '综合评分' is replaced by its rank position,
      spread evenly over [5, 10].

    ``result`` is modified in place (it ends up sorted by '综合评分') and
    returned.
    """
    def grade_by_quintile(value_col, grade_col):
        values = result[value_col]
        q20, q40, q60, q80 = [values.quantile(q) for q in (0.2, 0.4, 0.6, 0.8)]
        result.loc[values >= q80, grade_col] = 'A'
        result.loc[(values < q80) & (values >= q60), grade_col] = 'B'
        result.loc[(values < q60) & (values >= q40), grade_col] = 'C'
        result.loc[(values < q40) & (values >= q20), grade_col] = 'D'
        result.loc[values < q20, grade_col] = 'E'

    grade_by_quintile('综合评分', '综合评级')
    # Community average price -> price grade
    grade_by_quintile('小区均价', '小区价格评级')

    # Communities without a price grade inherit the overall grade
    ungraded = result['小区价格评级'].isna()
    result.loc[ungraded, '小区价格评级'] = result.loc[ungraded, '综合评级']

    # Replace each score by its rank, mapped linearly onto [5, 10]
    row_count = result.shape[0]
    rescaled_cols = ['楼盘品质', '宜居程度', '区位状况',
                     '生活配套', '活跃程度', '不利因素', '综合评分']
    for col in rescaled_cols:
        result.sort_values(col, inplace=True)
        result[col] = np.linspace(5, 10, row_count)

    return result


def get_risk(all_community_data):
    """Add the '风险策略' column from the (price grade, overall grade) matrix.

    ``all_community_data`` must have '小区价格评级' and '综合评级' columns
    holding grades 'A'-'E'.  It is modified in place and returned.

    Raises
    ------
    IndexError
        If a row has a grade pair outside the 5x5 matrix (e.g. a missing
        grade).
    """
    grades = ['A', 'B', 'C', 'D', 'E']
    # Rows: price grade A..E; columns: overall grade A..E.
    strategy_rows = [
        ['正常', '关注', '谨慎', '谨慎', '谨慎'],
        ['正常', '正常', '关注', '谨慎', '谨慎'],
        ['正常', '正常', '正常', '关注', '谨慎'],
        ['正常', '正常', '关注', '谨慎', '谨慎'],
        ['正常', '关注', '谨慎', '谨慎', '谨慎'],
    ]
    # One dictionary lookup per row instead of filtering a DataFrame per row.
    strategy = {
        (price_grade, score_grade): strategy_rows[i][j]
        for i, price_grade in enumerate(grades)
        for j, score_grade in enumerate(grades)
    }

    list_risk = []
    for price, score in all_community_data[['小区价格评级', '综合评级']].values:
        try:
            list_risk.append(strategy[(price, score)])
        except KeyError:
            # Same exception type as before, but naming the offending grades.
            raise IndexError(
                'no risk strategy for price grade %r / overall grade %r' % (price, score)
            ) from None
    all_community_data['风险策略'] = list_risk
    return all_community_data


def high_quality_community(data):
    """Mark high-quality communities with '是' in the '优质小区' column.

    A community qualifies when it passes every filter below, applied in
    order.  ``data`` is modified in place and returned; rows that do not
    qualify are left missing in '优质小区'.
    """
    candidates = data.copy()

    # Grade / strategy whitelist
    allowed_values = (
        ('综合评级', ['C', 'B', 'A']),
        ('小区价格评级', ['A', 'B', 'C', 'D']),
        ('风险策略', ['正常', '关注']),
    )
    for column, allowed in allowed_values:
        candidates = candidates[candidates[column].isin(allowed)]

    # Listing-count threshold: 1st percentile of the positive listing counts
    positive_listings = candidates[candidates['挂牌数量'] > 0]
    stop_n = positive_listings['挂牌数量'].quantile(0.01)

    up = 0.05
    down = 0
    remaining_filters = (
        lambda d: d['挂牌数量'] > stop_n,
        lambda d: d['房屋类型(小类)'] == '普通住宅',
        lambda d: d['建成年份'] > 2000,
        # Absolute price-change rate within [down, up]
        lambda d: (np.abs(d['抗跌率']) >= down) & (np.abs(d['抗跌率']) <= up),
        lambda d: d['法拍数量'] == 0,
    )
    for keep in remaining_filters:
        candidates = candidates[keep(candidates)]

    data.loc[candidates.index, '优质小区'] = '是'
    return data


# Script entry point: runs the full community-profiling pipeline for `city`
# and writes two Excel files (full profile and the reduced rating table).
if __name__=='__main__':

    print('城市: ', city)
    start_time1 = time.time()

    # Parse POI coordinates into numeric longitude/latitude columns
    poi_data = change_poi(poi)

    # Community list; rows without a longitude cannot be located and are dropped
    read_community_data = community
    read_community_data.dropna(subset=['高德经度'],inplace=True)

    # Step 1: POI features (nearest distances and counts within 1 km)
    start_time = time.time()
    read_community_data = get_turth_distance(poi_data, read_community_data)
    read_community_data = get_turth_number(poi_data, read_community_data)
    print('poi数据运行时间:',time.time() - start_time)

    # Step 2: listing/transaction case statistics, joined on the community ID
    start_time = time.time()
    case_value = CaseValue(city)
    result = case_value.result
    read_community_data = pd.merge(read_community_data, result, on='楼盘ID', how='left')
    print('案例数据运行时间:',time.time() - start_time)

    # Step 3: descriptive labels
    start_time = time.time()
    read_community_data = get_label(read_community_data)
    print('出标签时间:',time.time() - start_time)

    # Step 4: fill missing values in the community attributes
    start_time = time.time()
    read_community_data=community_data(read_community_data)
    # NOTE(review): this is measured from start_time1 and printed before the
    # scoring, training and rating steps below, so it is not the total run time.
    print('全部运行时间:',time.time() - start_time1)

    # Step 5: convert raw values into scores
    read_community_data = value2score(read_community_data)
    # read_community_data.to_excel('训练数据_%s.xlsx'%city)

    # Step 6: train the model and compute the overall score
    read_community_data = train(read_community_data)


    # Step 7: community grades
    result = get_level(read_community_data)



    # Step 8: risk strategy
    result = get_risk(result)


    # Step 9: flag high-quality communities
    result = high_quality_community(result)

    # Full profile with every intermediate column
    result.to_excel('小区画像_%s.xlsx'%city, index=False)
    # Reduced table with the columns of the rating deliverable
    result_cols = [
        '省份','城市','城市代号','行政区','行政区代号','楼盘名称','楼盘ID','地址',
        '小区均价','小区价格评级','综合评级','综合评分','风险策略','展示标签','楼盘品质',
        '宜居程度','区位状况','生活配套','活跃程度','不利因素','优质小区']

    result = result[result_cols]
    result.to_excel('小区评级结果表_%s.xlsx'%city,index=False)





# 你可能感兴趣的:(2020-01-19-linzilu248)