利用kd_tree(或ball_tree)算法快速实现经纬度点位的近邻搜索:
1、按范围搜索:可采用RadiusNeighborsClassifier;
2、按数量搜索:可采用KNeighborsClassifier;
给定点位a和点位b两组站点,为点位a中的每个点找出点位b中距离最近的站点。
实现代码如下:
from sklearn.neighbors import KNeighborsClassifier as Knn
from sklearn.neighbors import RadiusNeighborsClassifier as Rnn
from math import *
import pandas as pd
def calculate_distance(lon1, lat1, lon2, lat2):
    """Return the great-circle distance between two longitude/latitude points.

    The distance is computed with the haversine formula on a sphere of
    radius 6371229 (the result is in the same unit as that radius, i.e.
    metres for an Earth radius expressed in metres).

    Args:
        lon1, lat1: Longitude and latitude of the first point, in decimal
            degrees. Any value accepted by ``float()`` works (numbers or
            numeric strings).
        lon2, lat2: Longitude and latitude of the second point, in decimal
            degrees.

    Returns:
        The distance as a float rounded to two decimal places.

    Raises:
        ValueError, TypeError: If a coordinate cannot be converted by
            ``float()``.
    """
    lon1, lat1, lon2, lat2 = (
        radians(float(value)) for value in (lon1, lat1, lon2, lat2)
    )
    d_lon, d_lat = lon2 - lon1, lat2 - lat1
    am = sin(d_lat / 2) ** 2 + cos(lat1) * cos(lat2) * sin(d_lon / 2) ** 2
    # Floating-point rounding can push the haversine term marginally outside
    # [0, 1] for (near-)antipodal points; sqrt/asin would then raise
    # ValueError, so clamp before taking the inverse.
    am = min(1.0, max(0.0, am))
    distance = 2 * asin(sqrt(am)) * 6371229
    return round(distance, 2)
# Find the N nearest stations.
def run_kd_tree(n_neighbors=3):
    """Pair every station in sheet '基站a' with its nearest stations in '基站b'.

    Both sheets are read from the workbook named by the module-level
    ``path_in``. For each row of '基站a' the ``n_neighbors`` closest rows of
    '基站b' are looked up with a kd-tree, the rows are joined side by side
    (overlapping column names get the ``_x`` / ``_y`` suffixes), a
    ``distance_metre`` column is computed with ``calculate_distance`` and the
    table is written to the hard-coded result workbook.

    Args:
        n_neighbors: How many nearest stations to report per row of '基站a'.
            Capped at the number of rows in '基站b'. Defaults to 3.
    """
    aa = pd.read_excel(path_in, sheet_name='基站a')
    aa_x = aa[['经度', '纬度']]
    bb = pd.read_excel(path_in, sheet_name='基站b')
    bb_x = bb[['经度', '纬度']]
    # y labels each sample's class. The classification itself is not used
    # here, only the neighbour lookup, so every sample gets the same label.
    y = [1] * len(bb_x)
    # Original author's note: kd_tree is adequate for low-dimensional data
    # (fewer than ~7 dimensions); ball_tree is suggested for higher dimensions.
    # NOTE(review): neighbours are ranked by Euclidean distance on the raw
    # longitude/latitude values (minkowski, p=2), which only approximates a
    # ranking by ground distance.
    knn = Knn(algorithm='kd_tree', metric='minkowski', p=2).fit(bb_x, y)
    # Asking for more neighbours than there are candidates raises in
    # scikit-learn, so cap the request.
    neighbour_count = min(n_neighbors, len(bb_x))
    points_ = knn.kneighbors(
        aa_x, n_neighbors=neighbour_count, return_distance=False)

    # Collect the per-station tables and concatenate once: DataFrame.append
    # was removed in pandas 2.0 and copied the whole frame on every call.
    frames = []
    for pos, neighbour_idx in enumerate(points_):
        # points_ is positional, so select rows by position rather than by
        # index label; the temporary key column pairs the single 'a' row with
        # each of its neighbours in the merge below.
        a = aa.iloc[[pos]].assign(nearest_point=pos)
        b = bb.iloc[neighbour_idx].assign(nearest_point=pos)
        frames.append(pd.merge(a, b, on="nearest_point", how="inner"))
    result = pd.concat(frames, ignore_index=True)

    result['distance_metre'] = [
        calculate_distance(lon_a, lat_a, lon_b, lat_b)
        for lon_a, lat_a, lon_b, lat_b in zip(
            result['经度_x'], result['纬度_x'],
            result['经度_y'], result['纬度_y'])
    ]
    result = result.drop(labels=["nearest_point"], axis=1)
    result.to_excel(r'D:\xxx经纬度信息表-结果-最近n个点.xlsx', index=False)
# Find every station within a given distance.
def run_rd_tree(radius=300):
    """List, for every station in sheet '基站a', all '基站b' stations in range.

    Both sheets are read from the workbook named by the module-level
    ``path_in``. Neighbours are searched with a ball tree whose metric is
    ``calculate_distance``, so ``radius`` and the reported ``distance_metre``
    values are in that function's unit. Each '基站a' row is joined with its
    in-range '基站b' rows (overlapping column names get the ``_x`` / ``_y``
    suffixes), sorted by distance, given a ``similarity`` column and written
    to the hard-coded result workbook.

    Args:
        radius: Search radius passed to ``radius_neighbors``. Defaults to 300.
    """
    aa = pd.read_excel(path_in, sheet_name='基站a')
    aa_x = aa[['经度', '纬度']]
    bb = pd.read_excel(path_in, sheet_name='基站b')
    bb_x = bb[['经度', '纬度']]
    # Class labels are irrelevant for a pure neighbour lookup.
    y = [1] * len(bb_x)
    # A custom distance function is used here: the benefit is that the radius
    # can be given directly (e.g. 100 or 300 metres); the cost is that calling
    # back into Python is slower. A built-in metric (e.g. minkowski) is faster
    # but its radius would not be expressed in metres.
    rnn = Rnn(
        algorithm='ball_tree',
        metric=lambda s1, s2: calculate_distance(*s1, *s2),
    ).fit(bb_x, y)
    distance_, points_ = rnn.radius_neighbors(
        aa_x, radius=radius, return_distance=True)

    # Collect the per-station tables and concatenate once: DataFrame.append
    # was removed in pandas 2.0 and copied the whole frame on every call.
    frames = []
    for pos, (neighbour_idx, neighbour_dist) in enumerate(
            zip(points_, distance_)):
        # The search results are positional, so select rows by position; the
        # temporary key column pairs the single 'a' row with each neighbour.
        a = aa.iloc[[pos]].assign(nearest_point=pos)
        b = bb.iloc[neighbour_idx].assign(
            distance_metre=neighbour_dist, nearest_point=pos)
        merged = pd.merge(a, b, on="nearest_point", how="inner")
        frames.append(merged.sort_values("distance_metre", ascending=True))
    result = pd.concat(frames, ignore_index=True)

    # A list comprehension (rather than DataFrame.apply) also works when no
    # station had any neighbour in range and the table is empty.
    # NOTE(review): calculate_dif_rate is not defined in the visible source;
    # confirm it is provided elsewhere before running.
    result['similarity'] = [
        calculate_dif_rate(name_a, name_b)
        for name_a, name_b in zip(result['基站名_x'], result['基站名_y'])
    ]
    result = result.drop(labels=["nearest_point"], axis=1)
    result.to_excel(r'D:\xxx经纬度信息表-结果-范围内的所有点位.xlsx', index=False)
if __name__ == '__main__':
    # Input workbook; both searches look this name up as a module global.
    path_in = r'D:\excel_template_站点搜索.xlsx'
    # Run the nearest-N search first, then the within-radius search.
    for search in (run_kd_tree, run_rd_tree):
        search()