knn算法

import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler

# Load the listings data set and prepare it for the KNN experiments below.
dc_listings = pd.read_csv('E:/test/KNN/listings.csv')

# Columns kept for modelling: the numeric features plus the 'price' target.
features = [
    'accommodates', 'bedrooms', 'bathrooms', 'beds', 'price',
    'minimum_nights', 'maximum_nights', 'number_of_reviews',
]

# .copy() gives an independent frame, so the column assignments below do not
# operate on a slice of the raw data (avoids chained-assignment warnings).
dc_listings = dc_listings[features].copy()

# Strip the '$' sign and thousands separators from the price strings, then
# convert to float. regex=True is passed explicitly because pandas >= 2.0
# treats the pattern as a literal string by default.
dc_listings['price'] = (
    dc_listings['price'].str.replace(r'[$,]', '', regex=True).astype(float)
)

# Drop every row that has a missing value in any of the selected columns.
dc_listings = dc_listings.dropna()

# Standardise each column to zero mean and unit variance (this is NOT a 0-1
# range). 'price' is in `features`, so the target is standardised as well and
# the errors printed later are in standardised units.
dc_listings[features] = StandardScaler().fit_transform(dc_listings[features])
normalized_listings = dc_listings

# Positional split: first 2792 rows for training, the rest for testing.
# Copies keep later column assignments on either frame independent.
train_data = normalized_listings[0:2792].copy()
test_data = normalized_listings[2792:].copy()

# 基于单变量预测价格
def predict_price(new_listings_value, feature_column, k=5):
    """Predict a price with k-nearest-neighbours on a single feature.

    The distance between the query and each row of the module-level
    ``train_data`` frame is the absolute difference in ``feature_column``.
    The prediction is the mean ``price`` of the ``k`` closest rows.

    Args:
        new_listings_value: Value of ``feature_column`` for the listing whose
            price is being predicted.
        feature_column: Name of the ``train_data`` column to measure
            distance on.
        k: Number of nearest neighbours to average (default 5, matching the
            original hard-coded behaviour).

    Returns:
        Mean price of the ``k`` nearest training rows.
    """
    distances = (train_data[feature_column] - new_listings_value).abs()
    # Build a small scratch frame instead of writing a 'distance' column into
    # the shared train_data frame, which every call used to mutate.
    neighbours = train_data[['price']].assign(distance=distances)
    nearest_prices = neighbours.sort_values('distance')['price'].iloc[:k]
    return nearest_prices.mean()

# Evaluate the single-feature model: predict every test listing's price from
# its 'accommodates' value, then report the root-mean-squared error.
test_data['predicted_price'] = test_data['accommodates'].apply(
    predict_price, feature_column='accommodates'
)
# NOTE: the column name 'squred_error' (sic) is kept exactly as it was.
test_data['squred_error'] = (
    test_data['predicted_price'] - test_data['price']
).pow(2)
mse = test_data['squred_error'].mean()
rmse = mse ** 0.5
print(rmse)




from scipy.spatial import distance    #利用scipy工具包中的distance()函数来计算均值方差

# Example: Euclidean distance between two listings, measured on the
# 'accommodates' and 'bathrooms' columns only.
# NOTE: despite its name, `fifth_listing` is the row at position 20.
first_listing = normalized_listings[['accommodates', 'bathrooms']].iloc[0]
fifth_listing = normalized_listings[['accommodates', 'bathrooms']].iloc[20]
first_fifth_distance = distance.euclidean(first_listing, fifth_listing)
print(first_fifth_distance)



#多变量KNN模型
def predict_price_multivariate(new_listing_value, feature_columns, k=5):
    """Predict a price with k-nearest-neighbours on several features.

    The Euclidean distance between the query listing and each row of the
    module-level ``train_data`` frame is computed over ``feature_columns``.
    The prediction is the mean ``price`` of the ``k`` closest rows.

    Args:
        new_listing_value: Row-like object (e.g. a pandas Series) that can be
            indexed with ``feature_columns`` to obtain the query's values.
        feature_columns: List of ``train_data`` column names used for the
            distance computation.
        k: Number of nearest neighbours to average (default 5, matching the
            original hard-coded behaviour).

    Returns:
        Mean price of the ``k`` nearest training rows.
    """
    # cdist returns an (n_rows, 1) matrix for a single query point; flatten
    # it so it is a plain 1-D column of distances.
    distances = distance.cdist(
        train_data[feature_columns], [new_listing_value[feature_columns]]
    ).ravel()
    # Build a small scratch frame instead of writing a 'distance' column into
    # the shared train_data frame, which every call used to mutate.
    neighbours = train_data[['price']].assign(distance=distances)
    nearest_prices = neighbours.sort_values('distance')['price'].iloc[:k]
    return nearest_prices.mean()

# Evaluate the two-feature model: predict each test listing's price row by
# row, then report the root-mean-squared error.
cols = ['accommodates', 'bathrooms']
test_data['predicted_price'] = test_data[cols].apply(
    predict_price_multivariate, axis=1, feature_columns=cols
)
test_data['squared_error'] = (
    test_data['predicted_price'] - test_data['price']
).pow(2)
mse = test_data['squared_error'].mean()
rmse = mse ** 0.5
print(rmse)


#利用一个工具包就可以又快又方便的计算KNN多变量模型了

from sklearn.metrics import mean_squared_error
from sklearn.neighbors import KNeighborsRegressor   #利用sklearn工具包来完成KNN模型
# Fit scikit-learn's KNN regressor (default settings) on the seven feature
# columns below and report its RMSE on the test set.
# NOTE: the `four_features_*` names are kept as-is although seven columns are used.
cols = [
    'accommodates', 'bedrooms', 'bathrooms', 'beds',
    'minimum_nights', 'maximum_nights', 'number_of_reviews',
]
knn = KNeighborsRegressor()
knn.fit(train_data[cols], train_data['price'])
four_features_predictions = knn.predict(test_data[cols])
four_features_mse = mean_squared_error(
    test_data['price'], four_features_predictions
)
four_features_rmse = four_features_mse ** 0.5
print(four_features_rmse)

你可能感兴趣的:(knn算法)