06-basic k-nearest neighbors + pandas取值技巧

获取data

data参见这里
RMSE & MSE的对比

import pandas as pd
# Load the Airbnb listings CSV into a DataFrame.
dc_listings = pd.read_csv('dc_airbnb.csv')
# Read the first row by position with .iloc[0].
print(dc_listings.iloc[0])
# Select the value at a given column and row.
# NOTE(review): "col" looks like a placeholder column name - confirm.
print (dc_listings["col"].iloc[0])

例子：我们有一个三个卧室的房子要租出去，判断租金应该定多少？
方法：在网上找和我们的房子类似的房子，看这些房子的平均租金，然后我们定这个租金。
热身：计算dc_listings里accommodates（可容纳人数）那一列第一行的值，与我们的目标值3之间的欧几里得距离：

import math
a = dc_listings["accommodates"].iloc[0] # value of "accommodates" in the first row
diff = (3-a)**2
first_distance = math.sqrt(diff) # Euclidean distance
# With a single value, np.abs() could be used directly to get the absolute difference.
print (first_distance)

对一整列的每个值都进行计算，可以用df["column"].apply()

# Distance between every listing's "accommodates" value and our target of 3.
# The vectorised subtraction followed by .abs() replaces a per-row lambda
# that called np.abs, although numpy is only imported further down the file.
dc_listings["distance"] = (dc_listings["accommodates"] - 3).abs()
# Count how many listings sit at each distance.
print(dc_listings["distance"].value_counts())

将dataframe的行随机打乱，再按distance排序，然后取前10行的price

import numpy as np
# Fix the seed so the shuffle below is reproducible.
np.random.seed(1)
# Reorder the rows with a random permutation of 0..len-1 (used as index
# labels by .loc), then order the shuffled rows by distance.
dc_listings = (
    dc_listings
    .loc[np.random.permutation(len(dc_listings))]
    .sort_values('distance')
)
# Prices of the ten rows with the smallest distance.
print(dc_listings['price'].head(10))

对price这一列的值进行处理，去掉", $": df['col'].str.replace(",","")
将结果变成float形式: series.astype('float')
选前5行的平均值：df["col"].iloc[0:5].mean()

# Strip the thousands separator and the currency symbol from "price".
# regex=False makes both patterns literal: "$" is a regex metacharacter and
# the default value of `regex` differs between pandas versions.
stripped_commas = dc_listings["price"].str.replace(",", "", regex=False)
stripped_commas = stripped_commas.str.replace("$", "", regex=False)
# Convert the cleaned strings to floats so they can be averaged.
dc_listings["price"] = stripped_commas.astype('float')
# Mean price of the first five rows.
mean_price = dc_listings["price"].iloc[0:5].mean()
print(mean_price)

把以上这些步骤写成一个小函数，来对不同accommodates取值的平均价格做预测

# Brought along the changes we made to the `dc_listings` Dataframe.
import numpy as np
dc_listings = pd.read_csv('dc_airbnb.csv')
# regex=False treats "," and "$" as literal text; "$" is a regex
# metacharacter and the default of `regex` differs between pandas versions.
stripped_commas = dc_listings['price'].str.replace(',', '', regex=False)
stripped_dollars = stripped_commas.str.replace('$', '', regex=False)
dc_listings['price'] = stripped_dollars.astype('float')
# Shuffle the rows with a random permutation of the index labels.
dc_listings = dc_listings.loc[np.random.permutation(len(dc_listings))]

def predict_price(new_listing, listings=None, k=5):
    """Predict a price as the mean price of the k nearest listings.

    Distance is the absolute difference between each row's "accommodates"
    value and ``int(new_listing)``.

    Args:
        new_listing: Guest capacity to price for; converted with ``int``.
        listings: DataFrame with "accommodates" and "price" columns.
            Defaults to the module-level ``dc_listings``.
        k: Number of nearest rows to average. Defaults to 5.

    Returns:
        Mean "price" of the k rows with the smallest distance.
    """
    frame = dc_listings if listings is None else listings
    gaps = (frame["accommodates"] - int(new_listing)).abs()
    # assign() returns a new frame, so the source frame never gains a
    # "distance" column as a side effect of predicting. The stable sort keeps
    # rows with equal distance in the frame's existing order.
    ranked = frame.assign(distance=gaps).sort_values("distance", kind="mergesort")
    return ranked["price"].iloc[:k].mean()

# Predicted prices for listings that accommodate one, two and four guests.
acc_one, acc_two, acc_four = (predict_price(guests) for guests in (1, 2, 4))

Cross validation

把数据集分为train和test两组。假设test那一组里，accommodates列的第一个值是5，就用5去和train那一组中accommodates列每一行的值比较，计算距离，然后按distance排序，取前五个price的平均值作为predicted price，放到test表里的predicted_price列。

import pandas as pd
import numpy as np

dc_listings = pd.read_csv("dc_airbnb.csv")
# regex=False treats "," and "$" as literal text; "$" is a regex
# metacharacter and the default of `regex` differs between pandas versions.
stripped_commas = dc_listings['price'].str.replace(',', '', regex=False)
stripped_dollars = stripped_commas.str.replace('$', '', regex=False)
dc_listings['price'] = stripped_dollars.astype('float')

# First 2792 rows train, the rest test. The explicit copies make both frames
# independent of dc_listings, so assigning new columns to them later does
# not act on a slice of the original frame.
train_df = dc_listings.iloc[0:2792].copy()
test_df = dc_listings.iloc[2792:].copy()

def predict_price(new_listing, listings=None, k=5):
    """Predict a price as the mean price of the k nearest training rows.

    Distance is the absolute difference between each row's "accommodates"
    value and ``new_listing``.

    Args:
        new_listing: Guest capacity of the listing to price.
        listings: DataFrame with "accommodates" and "price" columns.
            Defaults to the module-level ``train_df``.
        k: Number of nearest rows to average. Defaults to 5.

    Returns:
        Mean "price" of the k rows with the smallest distance.
    """
    frame = train_df if listings is None else listings
    gaps = (frame['accommodates'] - new_listing).abs()
    # assign() returns a new frame, so the training frame is not modified on
    # every prediction. The stable sort keeps rows with equal distance in
    # the frame's existing order.
    ranked = frame.assign(distance=gaps).sort_values('distance', kind='mergesort')
    return ranked['price'].iloc[:k].mean()

# Predict a price for every test row from its "accommodates" value.
test_df["predicted_price"] = test_df["accommodates"].apply(predict_price)

检验预测值是否准确

方法一：计算mean absolute error
mae = (|预测值1 - 实际值1| + |预测值2-实际值2| + ... + |预测值n - 实际值n|) / n

# Mean absolute error between predicted and actual prices.
mae = (test_df["predicted_price"] - test_df["price"]).abs().mean()

方法二：计算mean squared error
mse = ((预测值1 - 实际值1)^2 + (预测值2 - 实际值2)^2 + ... + (预测值n - 实际值n)^2) / n

# Store the per-row squared error, then average it.
test_df["sm"] = (test_df["predicted_price"] - test_df["price"]).pow(2)
mse = test_df["sm"].mean()
# The same computation as a single expression:
mse = (test_df["predicted_price"] - test_df["price"]).pow(2).mean()

以上模型，是只依靠 "accommodates" 这一个变量建立的模型，需要和其他模型进行对比，才能判断mse是高还是低

用test_df["bathrooms"]来建立一个新的模型，计算mse

# First 2792 rows train, the rest test. The explicit copies make both frames
# independent of dc_listings, so the columns assigned to them afterwards are
# not written to a slice of the original frame.
train_df = dc_listings.iloc[0:2792].copy()
test_df = dc_listings.iloc[2792:].copy()

def predict_price(new_listing, listings=None, k=5):
    """Predict a price as the mean price of the k nearest training rows.

    Distance is the absolute difference between each row's "bathrooms"
    value and ``new_listing``.

    Args:
        new_listing: Bathroom count of the listing to price.
        listings: DataFrame with "bathrooms" and "price" columns.
            Defaults to the module-level ``train_df``.
        k: Number of nearest rows to average. Defaults to 5.

    Returns:
        Mean "price" of the k rows with the smallest distance.
    """
    frame = train_df if listings is None else listings
    gaps = (frame['bathrooms'] - new_listing).abs()
    # assign() returns a new frame, so the training frame is not modified on
    # every prediction. The stable sort keeps rows with equal distance in
    # the frame's existing order.
    ranked = frame.assign(distance=gaps).sort_values('distance', kind='mergesort')
    return ranked['price'].iloc[:k].mean()

# Predict from "bathrooms", then score the predictions.
test_df["predicted_price"] = test_df["bathrooms"].apply(predict_price)
test_df["squared_error"] = (test_df["predicted_price"] - test_df["price"]).pow(2)
mse = test_df["squared_error"].mean()
print(mse)

# Root of the mean squared error.
rmse = np.sqrt(mse)
print(rmse)

选更多的参数，提高模型准确性

先看dc_listings表里面有哪些字段，有哪些为空
# Print a summary of dc_listings (columns, non-null counts, dtypes).
dc_listings.info()

import pandas as pd
import numpy as np
# Fix the seed so the shuffle is reproducible.
np.random.seed(1)

dc_listings = pd.read_csv('dc_airbnb.csv')
# Shuffle the rows with a random permutation of the index labels.
dc_listings = dc_listings.loc[np.random.permutation(len(dc_listings))]
# regex=False treats "," and "$" as literal text; "$" is a regex
# metacharacter and the default of `regex` differs between pandas versions.
stripped_commas = dc_listings['price'].str.replace(',', '', regex=False)
stripped_dollars = stripped_commas.str.replace('$', '', regex=False)
dc_listings['price'] = stripped_dollars.astype('float')

print(dc_listings.head())

dc_listings.info()

把一些与living space无关，或者难以直接比较距离的字段去掉

# Columns to remove before computing distances.
drop_columns = [
    'room_type', 'city', 'state', 'latitude', 'longitude', 'zipcode',
    'host_response_rate', 'host_acceptance_rate', 'host_listings_count',
]
dc_listings = dc_listings.drop(columns=drop_columns)
# Number of missing values left in each remaining column.
print(dc_listings.isna().sum())

把数据标准化
可以直接对整个df表做标准化处理，再把price一列换成原来的price值

# Standardise every column: subtract the column mean, divide by the column
# standard deviation.
# NOTE(review): assumes every remaining column is numeric - confirm.
normalized_listings = dc_listings.sub(dc_listings.mean()).div(dc_listings.std())
# Restore the target column to its original, unscaled values.
normalized_listings['price'] = dc_listings['price']
print(normalized_listings.head(3))

Euclidean Distance

scipy里有现成的函数可以直接计算两行之间的euclidean distance：
distance.euclidean(first_listing, fifth_listing)

from scipy.spatial import distance
# The two features of the first and the fifth row (positions 0 and 4).
first_listing, fifth_listing = (
    normalized_listings.iloc[position][['accommodates', 'bathrooms']]
    for position in (0, 4)
)
# Euclidean distance between the two feature vectors.
first_fifth_distance = distance.euclidean(first_listing, fifth_listing)
print(first_fifth_distance)

Nearest Neighbors

from sklearn.neighbors import KNeighborsRegressor

# First 2792 normalised rows train, the rest test.
train_df = normalized_listings.iloc[:2792]
test_df = normalized_listings.iloc[2792:]
train_columns = ['accommodates', 'bathrooms']

# Build the 5-neighbour brute-force regressor and fit it on the training
# features in one step (fit returns the fitted estimator).
knn = KNeighborsRegressor(n_neighbors=5, algorithm='brute').fit(
    train_df[train_columns], train_df['price']
)

# Predict prices for the test rows.
predictions = knn.predict(test_df[train_columns])

计算mean squared error以及root of mean squared error

from sklearn.metrics import mean_squared_error

train_columns = ['accommodates', 'bathrooms']
# Same model as before, with the distance metric spelled out.
knn = KNeighborsRegressor(
    n_neighbors=5, algorithm='brute', metric='euclidean'
).fit(train_df[train_columns], train_df['price'])
predictions = knn.predict(test_df[train_columns])

# Mean squared error and its square root on the test rows.
two_features_mse = mean_squared_error(test_df["price"], predictions)
two_features_rmse = np.sqrt(two_features_mse)

print(two_features_mse, two_features_rmse)

用4个变量来训练模型

features = ['accommodates', 'bedrooms', 'bathrooms', 'number_of_reviews']
from sklearn.neighbors import KNeighborsRegressor
knn = KNeighborsRegressor(n_neighbors=5, algorithm='brute')

# Fit model to data.
knn.fit(train_df[features], train_df['price'])

# Use model to make predictions.
four_predictions = knn.predict(test_df[features])

from sklearn.metrics import mean_squared_error

# Pass the actual prices first and the predictions second, matching the
# (y_true, y_pred) order used by every other mean_squared_error call here.
four_mse = mean_squared_error(test_df['price'], four_predictions)

four_rmse = np.sqrt(four_mse)

print(four_mse, four_rmse)

Hyper_params

根据不同的k，计算k-neighbors

from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import mean_squared_error

hyper_params = [1, 2, 3, 4, 5]
# The feature list does not change between iterations, so build it once.
features = ["accommodates", "bedrooms", "bathrooms", "number_of_reviews"]

# One MSE per value of k, in the same order as hyper_params.
mse_values = []

for item in hyper_params:
    knn = KNeighborsRegressor(n_neighbors=item, algorithm='brute')
    knn.fit(train_df[features], train_df['price'])
    predictions = knn.predict(test_df[features])
    mse = mean_squared_error(test_df['price'], predictions)
    # Append to the list; rebinding mse_values to the score would discard
    # the results of all earlier iterations.
    mse_values.append(mse)
    print(mse)

用range(1,21)将hyper params进一步扩大，进行计算

features = ['accommodates', 'bedrooms', 'bathrooms', 'number_of_reviews']

# Try every k from 1 to 20.
hyper_params = list(range(1, 21))

mse_values = []

for hp in hyper_params:
    # fit() returns the fitted estimator, so it can be chained.
    knn = KNeighborsRegressor(n_neighbors=hp, algorithm='brute').fit(
        train_df[features], train_df['price']
    )
    predictions = knn.predict(test_df[features])
    mse = mean_squared_error(test_df['price'], predictions)
    mse_values.append(mse)
print(mse_values)
# Scatter plot of MSE against k.
# NOTE(review): `plt` is not imported in the visible file; presumably
# `import matplotlib.pyplot as plt` is expected - confirm.
plt.scatter(hyper_params, mse_values)
plt.show()

再尝试一次，把除了price的列作为features，进行训练

# Try every k from 1 to 20.
hyper_params = list(range(1, 21))
mse_values = []
# All column names except the target column "price".
features = list(train_df.columns)
features.remove('price')

for hp in hyper_params:
    # fit() returns the fitted estimator, so it can be chained.
    knn = KNeighborsRegressor(n_neighbors=hp, algorithm='brute').fit(
        train_df[features], train_df['price']
    )
    predictions = knn.predict(test_df[features])
    mse = mean_squared_error(test_df['price'], predictions)
    mse_values.append(mse)

# NOTE(review): `plt` is not imported in the visible file; presumably
# `import matplotlib.pyplot as plt` is expected - confirm.
plt.scatter(hyper_params, mse_values)
plt.show()

寻找最小的mse

two_features = ['accommodates', 'bathrooms']
three_features = ['accommodates', 'bathrooms', 'bedrooms']
hyper_params = [x for x in range(1, 21)]


def _mse_per_k(columns):
    """Return the test-set MSE for each k in hyper_params, using `columns` as features."""
    scores = []
    for hp in hyper_params:
        knn = KNeighborsRegressor(n_neighbors=hp, algorithm='brute')
        knn.fit(train_df[columns], train_df['price'])
        predictions = knn.predict(test_df[columns])
        scores.append(mean_squared_error(test_df['price'], predictions))
    return scores


# MSE values of the two-feature and the three-feature model, one per k.
two_mse_values = _mse_per_k(two_features)
three_mse_values = _mse_per_k(three_features)

# Find the lowest MSE and the k that produced it. Pairing each score with its
# k and taking the minimum picks the smallest k when scores are tied, which
# is what a first-strictly-smaller scan over ascending k also yields.
two_lowest_mse, two_lowest_k = min(zip(two_mse_values, hyper_params))
three_lowest_mse, three_lowest_k = min(zip(three_mse_values, hyper_params))

# Single-entry dicts mapping the best k to its MSE.
two_hyp_mse = {two_lowest_k: two_lowest_mse}
three_hyp_mse = {three_lowest_k: three_lowest_mse}

print(two_hyp_mse)
print(three_hyp_mse)

重新做一遍 - a new project

import numpy as np
import pandas as pd

dc_listings = pd.read_csv("dc_airbnb.csv")
# regex=False treats "," and "$" as literal text; "$" is a regex
# metacharacter and the default of `regex` differs between pandas versions.
stripped_commas = dc_listings['price'].str.replace(',', '', regex=False)
stripped_dollars = stripped_commas.str.replace('$', '', regex=False)
dc_listings['price'] = stripped_dollars.astype('float')

shuffled_index = np.random.permutation(len(dc_listings))

# Split the shuffled labels first and select each half once, instead of
# building the whole shuffled frame twice. The copies make both halves
# independent frames, so columns can be assigned to them later.
split_one = dc_listings.loc[shuffled_index[:1862]].copy()
split_two = dc_listings.loc[shuffled_index[1862:]].copy()

Holdout Validation

把数据集分为50% vs 50%两半，轮流作为train和test

from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import mean_squared_error

train_one = split_one
train_two = split_two
# The test frames are copies: each half is also the other iteration's
# training frame, and the "predicted_price" column added below should not
# be written into the training data.
test_one = split_two.copy()
test_two = split_one.copy()

# First half: train on split_one, test on split_two.
model = KNeighborsRegressor()
model.fit(train_one[["accommodates"]], train_one["price"])
test_one["predicted_price"] = model.predict(test_one[["accommodates"]])
# The exponent is written as 0.5 rather than 1/2, which is integer division
# (and therefore 0) under Python 2.
iteration_one_rmse = mean_squared_error(test_one["price"], test_one["predicted_price"]) ** 0.5

# Second half: refit the same estimator on split_two, test on split_one.
model.fit(train_two[["accommodates"]], train_two["price"])
test_two["predicted_price"] = model.predict(test_two[["accommodates"]])
iteration_two_rmse = mean_squared_error(test_two["price"], test_two["predicted_price"]) ** 0.5

# Average of the two RMSE values.
avg_rmse = np.mean([iteration_two_rmse, iteration_one_rmse])

print(iteration_one_rmse, iteration_two_rmse, avg_rmse)

K-fold Validation

重新建一列，把dataframe分为5个部分，做K-fold Validation

# Assign a fold id to each row by position, writing through the index labels.
# DataFrame.set_value is no longer available in pandas 1.0+, so the labelled
# assignment is done with .loc instead.
fold_bounds = [(0, 744), (744, 1488), (1488, 2232), (2232, 2976), (2976, 3723)]
for fold_id, (start, stop) in enumerate(fold_bounds, start=1):
    dc_listings.loc[dc_listings.index[start:stop], "fold"] = fold_id

# Number of rows in each fold.
print(dc_listings["fold"].value_counts())

重新进行K-fold Validation
按照fold=1和fold=2~5进行划分test和train

# Training: every row that is not in fold 1.

model = KNeighborsRegressor()
train_iteration_one = dc_listings[dc_listings["fold"] != 1]
print (train_iteration_one)

# Testing: the rows of fold 1. The copy makes this an independent frame, so
# the "predicted_price" column below is not assigned on a filtered selection
# of dc_listings.
test_iteration_one = dc_listings[dc_listings["fold"] == 1].copy()
model.fit(train_iteration_one[["accommodates"]], train_iteration_one["price"])

# Predicting
labels = model.predict(test_iteration_one[["accommodates"]])
test_iteration_one["predicted_price"] = labels
iteration_one_mse = mean_squared_error(test_iteration_one["price"], test_iteration_one["predicted_price"])
# The exponent is written as 0.5 rather than 1/2, which is integer division
# (and therefore 0) under Python 2.
iteration_one_rmse = iteration_one_mse ** 0.5

写一个函数，轮流计算每个fold的rmse

# Use np.mean to calculate the mean.
import numpy as np
fold_ids = list(range(1, 6))


def train_and_validate(df, folds):
    """Return one RMSE per fold, each obtained with that fold held out.

    For every id in ``folds`` a default KNeighborsRegressor is fitted on the
    rows whose "fold" differs from the id, using "accommodates" to predict
    "price", and scored on the rows whose "fold" equals the id.

    Args:
        df: DataFrame with "fold", "accommodates" and "price" columns.
        folds: Iterable of fold ids to hold out in turn.

    Returns:
        List of RMSE values in the order of ``folds``.
    """
    fold_rmses = []
    for held_out in folds:
        held_out_mask = df["fold"] == held_out
        fit_rows = df[~held_out_mask]
        eval_rows = df[held_out_mask]
        model = KNeighborsRegressor()
        model.fit(fit_rows[["accommodates"]], fit_rows["price"])
        predicted = model.predict(eval_rows[["accommodates"]])
        fold_rmses.append(mean_squared_error(eval_rows["price"], predicted) ** 0.5)
    return fold_rmses

# RMSE of each fold, and their average.
rmses = train_and_validate(dc_listings, fold_ids)
avg_rmse = np.asarray(rmses).mean()
print(rmses, avg_rmse)