获取data
- data参见这里
- RMSE & MSE的对比
import pandas as pd

# Load the Airbnb listings CSV that the rest of these notes work on.
dc_listings = pd.read_csv('dc_airbnb.csv')

# .iloc[0] selects the first row by position.
print(dc_listings.iloc[0])

# Select a single value: pick a column by name, then a row by position.
# "accommodates" stands in for the placeholder name "col", which would raise
# KeyError unless the CSV really had a column called "col".
print(dc_listings["accommodates"].iloc[0])
- 例子:我们有一个三个卧室的房子要租出去,判断租金应该定多少?
- 方法:在网上找和我们的房子类似的房子,看这些房子的平均租金,然后我们定这个租金。
- 热身:计算dc_listings里accommodates(可容纳人数)那一列第一行的值,与3之间的欧几里得距离:
import math

# Warm-up: how far is the first listing's "accommodates" value from 3?
first_value = dc_listings["accommodates"].iloc[0]
squared_gap = (3 - first_value) ** 2
# With a single feature the Euclidean distance is just the absolute
# difference, so np.abs() would produce the same number.
first_distance = math.sqrt(squared_gap)
print(first_distance)
对一整列的每个值都进行计算,可以用df["column"].apply()
# Distance between every listing's "accommodates" value and 3.
# Series arithmetic replaces the element-wise apply(); it also removes the
# call to np.abs, which ran before numpy was imported in these notes.
dc_listings["distance"] = (dc_listings["accommodates"] - 3).abs()
# How many listings sit at each distance.
print(dc_listings["distance"].value_counts())
将dataframe的行随机打乱,再按distance排序,取前10行的price
import numpy as np

# Fix the seed so the shuffle is repeatable.
np.random.seed(1)
# Shuffle the rows by re-selecting them with a random permutation of
# 0..len-1; .loc treats these as labels, so this assumes the default
# integer index -- TODO confirm.
shuffled_index = np.random.permutation(len(dc_listings))
dc_listings = dc_listings.loc[shuffled_index]
# Closest listings first.
dc_listings = dc_listings.sort_values('distance')
print(dc_listings['price'].iloc[0:10])
- 对price这一列的值进行处理,去掉", $": df['col'].str.replace(",","")
- 将结果变成float形式: series.astype('float')
- 选前5行的平均值:df["col"].iloc[0:5].mean()
# Strip the thousands separator and the currency symbol, then convert the
# column to float. regex=False makes both replacements literal: "$" is a
# regex anchor, and the default for `regex` differs between pandas versions.
stripped_commas = dc_listings["price"].str.replace(",", "", regex=False)
stripped_commas = stripped_commas.str.replace("$", "", regex=False)
dc_listings["price"] = stripped_commas.astype('float')
# Average price of the first five rows.
mean_price = dc_listings["price"].iloc[0:5].mean()
print(mean_price)
- 把以上这些步骤写成一个函数,用来预测不同accommodates值对应的平均价格
# Rebuild `dc_listings` from the CSV with the earlier changes applied:
# numeric prices and shuffled rows.
import numpy as np
dc_listings = pd.read_csv('dc_airbnb.csv')
# regex=False keeps "," and "$" literal regardless of the pandas version
# ("$" is otherwise a regex end-of-string anchor).
stripped_commas = dc_listings['price'].str.replace(',', '', regex=False)
stripped_dollars = stripped_commas.str.replace('$', '', regex=False)
dc_listings['price'] = stripped_dollars.astype('float')
dc_listings = dc_listings.loc[np.random.permutation(len(dc_listings))]
def predict_price(new_listing, listings=None, k=5):
    """Predict a price as the mean price of the k closest listings.

    Closeness is the absolute difference between ``int(new_listing)`` and
    each row's "accommodates" value.

    Args:
        new_listing: The "accommodates" value to predict for; it is passed
            through ``int()`` before the comparison.
        listings: DataFrame with "accommodates" and "price" columns.
            Defaults to the module-level ``dc_listings``.
        k: Number of closest rows whose prices are averaged.

    Returns:
        The mean "price" of the k closest rows.
    """
    if listings is None:
        listings = dc_listings
    distances = (listings["accommodates"] - int(new_listing)).abs()
    # assign() returns a new frame, so the caller's DataFrame does not gain
    # a "distance" column as a side effect of predicting.
    nearest = listings.assign(distance=distances).sort_values(
        "distance", ascending=True
    ).iloc[:k]
    return nearest["price"].mean()
# Predicted prices for "accommodates" values of 1, 2 and 4.
acc_one = predict_price(1)
acc_two = predict_price(2)
acc_four = predict_price(4)
Cross validation
把数据集分为train和test两组。假设test那一组里,accommodates列的第一个值是5,就用5去和train那一组中accommodates列每一行的值比较,计算距离,然后按distance排序,取前五个price的平均值作为predicted price,放到test表里的predicted_price列。
import pandas as pd
import numpy as np
dc_listings = pd.read_csv("dc_airbnb.csv")
# regex=False keeps "," and "$" literal regardless of the pandas version
# ("$" is otherwise a regex end-of-string anchor).
stripped_commas = dc_listings['price'].str.replace(',', '', regex=False)
stripped_dollars = stripped_commas.str.replace('$', '', regex=False)
dc_listings['price'] = stripped_dollars.astype('float')
# First 2792 rows train, the remainder test. Columns are added to both
# splits further down, so take independent copies instead of slices of
# dc_listings (avoids SettingWithCopyWarning).
train_df = dc_listings.iloc[0:2792].copy()
test_df = dc_listings.iloc[2792:].copy()
def predict_price(new_listing, listings=None, k=5):
    """Predict a price as the mean price of the k closest training rows.

    Closeness is the absolute difference between ``new_listing`` and each
    row's "accommodates" value.

    Args:
        new_listing: The "accommodates" value to predict for.
        listings: DataFrame with "accommodates" and "price" columns.
            Defaults to the module-level ``train_df``.
        k: Number of closest rows whose prices are averaged.

    Returns:
        The mean "price" of the k closest rows.
    """
    if listings is None:
        listings = train_df
    distances = (listings['accommodates'] - new_listing).abs()
    # assign() returns a new frame, so the training frame is not modified
    # (no "distance" column is written back onto it).
    nearest = listings.assign(distance=distances).sort_values('distance').iloc[:k]
    return nearest['price'].mean()
# One prediction per test row, driven by that row's "accommodates" value.
test_df["predicted_price"] = test_df["accommodates"].apply(predict_price)
检验预测值是否准确
- 方法一:计算mean absolute error
mae = (|预测值1 - 实际值1| + |预测值2-实际值2| + ... + |预测值n - 实际值n|) / n
# Mean absolute error between predicted and actual prices.
absolute_errors = (test_df["predicted_price"] - test_df["price"]).abs()
mae = absolute_errors.mean()
- 方法二:计算mean squared error
mse = ((预测值1-实际值1)^2 + (预测值2-实际值2)^2 + ...) / n
# Keep the per-row squared error in a helper column, then average it.
test_df["sm"] = (test_df["predicted_price"] - test_df["price"]).pow(2)
mse = test_df["sm"].mean()
# The same computation as a single expression, without the helper column:
mse = (test_df["predicted_price"] - test_df["price"]).pow(2).mean()
以上模型只依靠 "accommodates" 这一个变量建立,需要和其他模型进行对比,才能判断mse是高还是低
- 用"bathrooms"这一列来建立一个新的模型,计算mse
# Re-create the splits. Columns are assigned to them below, so take
# independent copies instead of slices of dc_listings (avoids
# SettingWithCopyWarning).
train_df = dc_listings.iloc[0:2792].copy()
test_df = dc_listings.iloc[2792:].copy()
def predict_price(new_listing, listings=None, k=5):
    """Predict a price as the mean price of the k closest training rows.

    Closeness is the absolute difference between ``new_listing`` and each
    row's "bathrooms" value.

    Args:
        new_listing: The "bathrooms" value to predict for.
        listings: DataFrame with "bathrooms" and "price" columns.
            Defaults to the module-level ``train_df``.
        k: Number of closest rows whose prices are averaged.

    Returns:
        The mean "price" of the k closest rows.
    """
    if listings is None:
        listings = train_df
    distances = (listings['bathrooms'] - new_listing).abs()
    # assign() returns a new frame, so the training frame is not modified
    # (no "distance" column is written back onto it).
    nearest = listings.assign(distance=distances).sort_values('distance').iloc[:k]
    return nearest['price'].mean()
# Predict from "bathrooms", then score the predictions.
test_df["predicted_price"] = test_df["bathrooms"].apply(predict_price)
prediction_errors = test_df["predicted_price"] - test_df["price"]
test_df["squared_error"] = prediction_errors ** 2
mse = test_df["squared_error"].mean()
print(mse)
# Root mean squared error: square root of the MSE.
rmse = np.sqrt(mse)
print(rmse)
选更多的参数,提高模型准确性
- 先看dc_listings表里面有哪些字段,有哪些为空
# Summary of the frame currently in memory.
dc_listings.info()
import pandas as pd
import numpy as np
# Fix the seed so the shuffle below is repeatable.
np.random.seed(1)
dc_listings = pd.read_csv('dc_airbnb.csv')
dc_listings = dc_listings.loc[np.random.permutation(len(dc_listings))]
# regex=False keeps "," and "$" literal regardless of the pandas version
# ("$" is otherwise a regex end-of-string anchor).
stripped_commas = dc_listings['price'].str.replace(',', '', regex=False)
stripped_dollars = stripped_commas.str.replace('$', '', regex=False)
dc_listings['price'] = stripped_dollars.astype('float')
print(dc_listings.head())
dc_listings.info()
- 把一些与living space无关,或者难以直接比较距离的字段去掉
# Columns removed before modelling.
drop_columns = [
    'room_type',
    'city',
    'state',
    'latitude',
    'longitude',
    'zipcode',
    'host_response_rate',
    'host_acceptance_rate',
    'host_listings_count',
]
dc_listings = dc_listings.drop(columns=drop_columns)
# Missing values remaining in each column.
print(dc_listings.isna().sum())
- 把数据标准化
- 可以直接对整个df表做标准化处理,再把price一列换成原来的price值
# Standardise every column: subtract its mean, divide by its standard deviation.
column_means = dc_listings.mean()
column_stds = dc_listings.std()
normalized_listings = (dc_listings - column_means) / column_stds
# The target keeps its original scale.
normalized_listings['price'] = dc_listings['price']
print(normalized_listings.head(3))
Euclidean Distance
- scipy里有现成的函数,可以直接计算两行之间的euclidean distance
distance.euclidean(first_listing, fifth_listing)
from scipy.spatial import distance

# Compare the first and fifth rows on these two columns only.
compare_columns = ['accommodates', 'bathrooms']
first_listing = normalized_listings.iloc[0][compare_columns]
fifth_listing = normalized_listings.iloc[4][compare_columns]
first_fifth_distance = distance.euclidean(first_listing, fifth_listing)
print(first_fifth_distance)
- Nearest Neighbors
from sklearn.neighbors import KNeighborsRegressor

# Split the normalised listings: first 2792 rows train, the rest test.
train_df = normalized_listings.iloc[:2792]
test_df = normalized_listings.iloc[2792:]
train_columns = ['accommodates', 'bathrooms']

# Build the 5-neighbour regressor, fit it on the training rows and
# predict a price for every test row.
knn = KNeighborsRegressor(n_neighbors=5, algorithm='brute')
train_features = train_df[train_columns]
train_target = train_df['price']
knn.fit(train_features, train_target)
test_features = test_df[train_columns]
predictions = knn.predict(test_features)
- 计算mean squared error以及root of mean squared error
from sklearn.metrics import mean_squared_error

train_columns = ['accommodates', 'bathrooms']
# Same two-feature model, with the distance metric spelled out explicitly.
knn = KNeighborsRegressor(n_neighbors=5, algorithm='brute', metric='euclidean')
train_features = train_df[train_columns]
knn.fit(train_features, train_df['price'])
test_features = test_df[train_columns]
predictions = knn.predict(test_features)
# MSE against the actual test prices, and its square root (RMSE).
two_features_mse = mean_squared_error(test_df["price"], predictions)
two_features_rmse = np.sqrt(two_features_mse)
print(two_features_mse, two_features_rmse)
- 用4个变量来训练模型
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import mean_squared_error

features = ['accommodates', 'bedrooms', 'bathrooms', 'number_of_reviews']
# Fit the 5-neighbour model on four features and predict the test prices.
knn = KNeighborsRegressor(n_neighbors=5, algorithm='brute')
train_features = train_df[features]
knn.fit(train_features, train_df['price'])
test_features = test_df[features]
four_predictions = knn.predict(test_features)
# Actual prices are passed first here; MSE is symmetric in its two
# arguments, so the value is unchanged.
four_mse = mean_squared_error(test_df['price'], four_predictions)
four_rmse = np.sqrt(four_mse)
print(four_mse, four_rmse)
Hyper_params
- 根据不同的k,计算k-neighbors
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import mean_squared_error

hyper_params = [1, 2, 3, 4, 5]
# The feature list does not change between iterations, so define it once.
features = ["accommodates", "bedrooms", "bathrooms", "number_of_reviews"]
mse_values = []
for k in hyper_params:
    knn = KNeighborsRegressor(n_neighbors=k, algorithm='brute')
    knn.fit(train_df[features], train_df['price'])
    predictions = knn.predict(test_df[features])
    # Append: plain assignment replaced the list with the latest score, so
    # only one MSE survived the loop.
    mse_values.append(mean_squared_error(test_df['price'], predictions))
# One MSE per k, in the order of hyper_params.
print(mse_values)
用range(1,21)将hyper params进一步扩大,进行计算
# `plt` is used below but never imported in these notes.
import matplotlib.pyplot as plt

features = ['accommodates', 'bedrooms', 'bathrooms', 'number_of_reviews']
hyper_params = list(range(1, 21))
mse_values = []
for hp in hyper_params:
    knn = KNeighborsRegressor(n_neighbors=hp, algorithm='brute')
    knn.fit(train_df[features], train_df['price'])
    predictions = knn.predict(test_df[features])
    mse = mean_squared_error(test_df['price'], predictions)
    mse_values.append(mse)
print(mse_values)
# Scatter plot of MSE against k.
plt.scatter(hyper_params, mse_values)
plt.show()
再尝试一次,把除了price的列作为features,进行训练
# `plt` is used below but never imported in these notes.
import matplotlib.pyplot as plt

hyper_params = list(range(1, 21))
mse_values = []
# Every column except the target is used as a feature.
features = train_df.columns.tolist()
features.remove('price')
for hp in hyper_params:
    knn = KNeighborsRegressor(n_neighbors=hp, algorithm='brute')
    knn.fit(train_df[features], train_df['price'])
    predictions = knn.predict(test_df[features])
    mse = mean_squared_error(test_df['price'], predictions)
    mse_values.append(mse)
# Scatter plot of MSE against k.
plt.scatter(hyper_params, mse_values)
plt.show()
寻找最小的mse
two_features = ['accommodates', 'bathrooms']
three_features = ['accommodates', 'bathrooms', 'bedrooms']
hyper_params = list(range(1, 21))


def _mse_for_each_k(feature_names, k_values):
    """Return the test MSE of a brute-force KNN regressor for each k."""
    mse_per_k = []
    for k in k_values:
        knn = KNeighborsRegressor(n_neighbors=k, algorithm='brute')
        knn.fit(train_df[feature_names], train_df['price'])
        predictions = knn.predict(test_df[feature_names])
        mse_per_k.append(mean_squared_error(test_df['price'], predictions))
    return mse_per_k


def _lowest_mse(k_values, mse_per_k):
    """Return (k, mse) for the smallest MSE; the first k wins on ties."""
    return min(zip(k_values, mse_per_k), key=lambda pair: pair[1])


# MSE per k for the two- and three-feature models.
two_mse_values = _mse_for_each_k(two_features, hyper_params)
three_mse_values = _mse_for_each_k(three_features, hyper_params)

# Best k and its MSE for each model.
two_lowest_k, two_lowest_mse = _lowest_mse(hyper_params, two_mse_values)
three_lowest_k, three_lowest_mse = _lowest_mse(hyper_params, three_mse_values)

# Single-entry dicts mapping the best k to its MSE.
two_hyp_mse = {two_lowest_k: two_lowest_mse}
three_hyp_mse = {three_lowest_k: three_lowest_mse}
print(two_hyp_mse)
print(three_hyp_mse)
重新做一遍 - a new project
import numpy as np
import pandas as pd
dc_listings = pd.read_csv("dc_airbnb.csv")
# regex=False keeps "," and "$" literal regardless of the pandas version
# ("$" is otherwise a regex end-of-string anchor).
stripped_commas = dc_listings['price'].str.replace(',', '', regex=False)
stripped_dollars = stripped_commas.str.replace('$', '', regex=False)
dc_listings['price'] = stripped_dollars.astype('float')
shuffled_index = np.random.permutation(len(dc_listings))
# Build the shuffled frame once and cut it at row 1862. The halves get a
# prediction column assigned later, so take copies to avoid
# SettingWithCopyWarning.
shuffled_listings = dc_listings.loc[shuffled_index]
split_one = shuffled_listings.iloc[0:1862].copy()
split_two = shuffled_listings.iloc[1862:len(dc_listings)].copy()
Holdout Validation
- 把数据集按50% vs 50%分成两半,轮流作为train和test
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import mean_squared_error

# Each half trains once and tests once.
train_one, test_one = split_one, split_two
train_two, test_two = split_two, split_one

model = KNeighborsRegressor()

# First half: train on split_one, evaluate on split_two.
model.fit(train_one[["accommodates"]], train_one["price"])
test_one["predicted_price"] = model.predict(test_one[["accommodates"]])
mse_one = mean_squared_error(test_one["price"], test_one["predicted_price"])
iteration_one_rmse = mse_one ** (1/2)

# Second half: the roles are swapped.
model.fit(train_two[["accommodates"]], train_two["price"])
test_two["predicted_price"] = model.predict(test_two[["accommodates"]])
mse_two = mean_squared_error(test_two["price"], test_two["predicted_price"])
iteration_two_rmse = mse_two ** (1/2)

# Average the two RMSE values.
avg_rmse = np.mean([iteration_two_rmse, iteration_one_rmse])
print(iteration_one_rmse, iteration_two_rmse, avg_rmse)
K-fold Validation
- 重新建一列,把dataframe分为5个部分,做K-fold Validation
# Assign a fold number (1-5) to consecutive blocks of rows.
# DataFrame.set_value was a scalar-only setter and was removed in
# pandas 1.0; .loc assigns a whole block of index labels at once.
fold_bounds = [(0, 744), (744, 1488), (1488, 2232), (2232, 2976), (2976, 3723)]
for fold_id, (start, stop) in enumerate(fold_bounds, start=1):
    dc_listings.loc[dc_listings.index[start:stop], "fold"] = fold_id
# Number of rows in each fold.
print(dc_listings["fold"].value_counts())
- 重新进行K-fold Validation
- 按照fold=1和fold=2~5进行划分test和train
# Training: every fold except fold 1.
model = KNeighborsRegressor()
train_iteration_one = dc_listings[dc_listings["fold"] != 1]
print(train_iteration_one)
# A column is assigned to the test rows below, so take an explicit copy;
# assigning to the filtered frame directly raises SettingWithCopyWarning.
test_iteration_one = dc_listings[dc_listings["fold"] == 1].copy()
model.fit(train_iteration_one[["accommodates"]], train_iteration_one["price"])
# Predicting: fold 1 is the held-out set.
labels = model.predict(test_iteration_one[["accommodates"]])
test_iteration_one["predicted_price"] = labels
iteration_one_mse = mean_squared_error(test_iteration_one["price"], test_iteration_one["predicted_price"])
iteration_one_rmse = iteration_one_mse ** (1/2)
写一个函数,轮流把每个fold作为test,计算各自的rmse
import numpy as np

fold_ids = [1, 2, 3, 4, 5]


def train_and_validate(df, folds):
    """Return one RMSE per fold, using that fold as the held-out set.

    For each value in ``folds``, rows whose "fold" column differs from it
    train a default KNeighborsRegressor on "accommodates" -> "price"; rows
    whose "fold" equals it are predicted and scored.
    """
    def _fold_rmse(fold):
        held_out = df["fold"] == fold
        train, test = df[~held_out], df[held_out]
        knn = KNeighborsRegressor()
        knn.fit(train[["accommodates"]], train["price"])
        predictions = knn.predict(test[["accommodates"]])
        return mean_squared_error(test["price"], predictions) ** 0.5

    return [_fold_rmse(fold) for fold in folds]


rmses = train_and_validate(dc_listings, fold_ids)
# np.mean averages the per-fold RMSE values.
avg_rmse = np.mean(rmses)
print(rmses, avg_rmse)