# coding=utf-8
# time: 2019/8/11
# function: linear regression
# author: Karen
import pandas as pd
import numpy as np
import statsmodels.api as sm
import matplotlib.pyplot as plt
from sklearn import preprocessing
import statsmodels.formula.api as smf
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
import math
# Work around the minus-sign display problem
from matplotlib import font_manager as fm
# Data preprocessing and data exploration
def get_data():
f=open('北京高端酒店价格.csv',encoding='utf/8')
data=pd.read_csv(f,encoding='utf/8')
st=''
for i in data['地区'].unique():
st=st+','+i
print(st)
print(data['位置评分'].describe())
data['装修时间']=data['装修时间'].apply(lambda x: '新装修' if x>2015 else '旧装修')
# tim=data['装修时间'].unique()
# typ=data['房间类型'].unique()
# location=data['地区'].unique()
#哑变量
df1=pd.get_dummies(data['房间类型'])
df2=pd.get_dummies(data['地区'])
df3=pd.get_dummies(data['装修时间'])
data=pd.concat([data,df1,df2,df3],axis=1)
data.drop(['房间类型','地区'],inplace=True,axis=1)
#改变索引
price=data['房价']
data.drop('房价',inplace=True,axis=1)
data.insert(0,'房价',price)
# data['新装修']=data.loc[22]
print(data.head())
data.drop(['酒店名称','经度','纬度','地址','其他城区','标准间','装修时间','旧装修'],inplace=True,axis=1)
data['对数房价']=data['房价'].apply(lambda x:math.log(x))
print(data.head())
data=data.round(1)
#数据标准化