学习目标:
1.了解和学习shapely和geopandas的基本功能,掌握用python中的这两个库实现几何对象之间的空间操作方法。
2.掌握folium和kepler.gl的数据可视化工具的使用。
3.学习与掌握geohash编码方法。
导入库:
from shapely import geometry as geo
from shapely import wkt
from shapely import ops
import numpy as np
LineStrings
# --- LineString demo: distances, basic properties, derived geometries ---
vertices = np.array([(0, 0), (1, 1), (1, 0)])
line = geo.LineString(vertices)  # same as geo.LineString([(0, 0), (1, 1), (1, 0)])
probe = geo.Point(2, 2)
# distance() works point-to-line as well as line-to-line
print(f'两个几何对象之间的距离:{probe.distance(line)}')
# hausdorff_distance() is the largest distance from the point to the line
print(f'两个几何对象之间的hausdorff_distance距离:{probe.hausdorff_distance(line)}')
print(f'该几何对象的面积:{line.area}')        # a LineString has zero area
print(f'该几何对象的坐标范围:{line.bounds}')   # (minx, miny, maxx, maxy)
print(f'该几何对象的长度:{line.length}')
print(f'该几何对象的几何类型:{line.geom_type}')
print(f'该几何对象的坐标系:{list(line.coords)}')  # vertex coordinates
center = line.centroid                            # geometric centroid
geo.GeometryCollection([line, center])
bbox = line.envelope                              # axis-aligned minimum bounding rectangle
geo.GeometryCollection([line, bbox])
rect = line.minimum_rotated_rectangle             # minimum rotated bounding rectangle
geo.GeometryCollection([line, rect])
pt_half = line.interpolate(0.5, normalized=True)  # point at half the normalized arc length
geo.GeometryCollection([line, pt_half])
line1 = geo.LineString([(0, 0), (1, -0.2), (2, 0.3), (3, -0.5), (5, 0.2), (7, 0)])
line1_simplify = line1.simplify(0.4, preserve_topology=False)  # Douglas-Peucker simplification
buffer_with_circle = line1.buffer(0.2)            # endpoints get round (half-circle) caps
geo.GeometryCollection([line1, buffer_with_circle])
LinearRings
# --- LinearRing demo: the closed counterpart of the LineString above ---
ring_vertices = [(0, 0), (1, 1), (1, 0)]
ring = geo.polygon.LinearRing(ring_vertices)
# The ring is implicitly closed, so its length (~3.41) exceeds the open
# LineString's length from the previous example.
print(ring.length)
print(ring.area)  # a LinearRing is a curve, so its area is 0.0
geo.GeometryCollection([ring])
from shapely.geometry import Polygon

# --- Polygon demo: a simple polygon vs. a polygon with a hole ---
polygon1 = Polygon([(0, 0), (1, 1), (1, 0)])
# Exterior shell plus one interior ring (hole).
# FIX: the hole list was named "int", shadowing the builtin -- renamed.
exterior = [(0, 0), (0, 2), (2, 2), (2, 0), (0, 0)]
interior = [(1, 0), (0.5, 0.5), (1, 1), (1.5, 0.5), (1, 0)]
polygon2 = Polygon(exterior, [interior])
print(polygon1.area)
print(polygon1.length)
print(polygon2.area)    # shell area minus hole area
print(polygon2.length)  # shell perimeter plus hole perimeter
# FIX: np.array() on the ring object itself was removed in Shapely 2.0;
# go through the .coords accessor instead.
print(np.array(polygon2.exterior.coords))  # exterior vertex coordinates
geo.GeometryCollection([polygon2])
几何对象关系
# --- Geometry predicates: contains() ---
coords = [(0, 0), (1, 1)]
# BUG FIX: LineString and Point were used as bare names, but only the
# "geo" module alias is imported above -- qualify them to avoid NameError.
print(geo.LineString(coords).contains(geo.Point(0.5, 0.5)))  # line vs. point
# A line's boundary is not part of its interior, hence False here:
print(geo.LineString(coords).contains(geo.Point(1.0, 1.0)))
polygon1 = Polygon([(0, 0), (0, 2), (2, 2), (2, 0), (0, 0)])
print(polygon1.contains(geo.Point(1.0, 1.0)))  # polygon vs. point
# contains() extends likewise to polygon-line and polygon-polygon tests
geo.GeometryCollection([polygon1, geo.Point(1.0, 1.0)])
# Build the convex hull of six points and collect it for display.
points1 = geo.MultiPoint([(0, 0), (1, 1), (0, 2), (2, 2), (3, 1), (1, 0)])
hull1 = points1.convex_hull
geo.GeometryCollection([hull1, points1])
# object.intersection returns the region shared by the two geometries
polygon1 = Polygon([(0, 0), (0, 2), (2, 2), (2, 0), (0, 0)])
hull1.intersection(polygon1)
import numpy as np

# FIX: the shapely.geometry asPoint/asLineString/asMultiPoint/asPolygon
# adapter functions were deprecated in Shapely 1.8 and removed in 2.0.
# The regular constructors accept numpy arrays directly.
pa = geo.Point(np.array([0.0, 0.0]))                                  # numpy -> Point
la = geo.LineString(np.array([[1.0, 2.0], [3.0, 4.0]]))               # numpy -> LineString
ma = geo.MultiPoint(np.array([[1.1, 2.2], [3.3, 4.4], [5.5, 6.6]]))   # numpy -> MultiPoint
pg = geo.Polygon(np.array([[1.1, 2.2], [3.3, 4.4], [5.5, 6.6]]))      # numpy -> Polygon
# FIX: np.array(Point) was also removed in Shapely 2.0; use the coords accessor.
print(np.array(pa.coords[0]))  # Point -> numpy
geopandas
导入库:
import pandas as pd
import geopandas
import matplotlib.pyplot as plt
# read_file() loads a shapefile into GeoSeries/GeoDataFrame structures.
# NOTE(review): geopandas.datasets was removed in geopandas 1.0 -- this line
# requires geopandas < 1.0 (or the naturalearth_lowres data on disk); confirm.
world = geopandas.read_file(geopandas.datasets.get_path('naturalearth_lowres'))
world.plot()  # render the GeoDataFrame as a world map
plt.show()
# Choropleth: color each country polygon by its 'pop_est' population estimate
fig, ax = plt.subplots(figsize=(9,6),dpi = 100)
world.plot('pop_est',ax = ax,legend = True)
plt.show()
Folium
import folium
import os
# Create a map centered on Beijing; zoom_start is the initial zoom level
# (larger values zoom in further).
m=folium.Map(location=[39.9,116.4],zoom_start=10)
import folium
import numpy as np
from folium.plugins import HeatMap
# Hand-built heatmap sample: each row is [latitude, longitude, weight],
# centered near (48, 5) via the additive offset.
data=(np.random.normal(size=(100,3))*np.array([[1,1,1]])+np.array([[48,5,1]])).tolist()
Kepler.gl
import pandas as pd
import geopandas as gpd
from pyproj import Proj
from keplergl import KeplerGl
from tqdm import tqdm
import os
import matplotlib.pyplot as plt
import shapely
import numpy as np
from datetime import datetime
import warnings
warnings.filterwarnings('ignore')
plt.rcParams['font.sans-serif'] = ['SimSun'] # default font SimSun so Chinese labels render
plt.rcParams['axes.unicode_minus'] = False # render '-' correctly when saving figures
#获取文件夹中的数据
def get_data(file_path, model):
    """Load every raw text file under *file_path* into one DataFrame.

    Parameters
    ----------
    file_path : str
        Directory containing the raw per-trajectory CSV-style files
        (first line of each file is a header and is skipped).
    model : str
        'train' -> files already carry a trailing label column ('type');
        'test'  -> no label column; a placeholder 'unknown' type is added.

    Returns
    -------
    pandas.DataFrame
        Columns: ID, lat, lon, speed, direction, time, type, with
        lat/lon/speed cast to float and direction cast to int.
    """
    assert model in ['train', 'test'], '{} Not Support this type of file'.format(model)
    paths = os.listdir(file_path)
    tmp = []
    for p in tqdm(paths):  # iterate the filenames directly (no index loop)
        with open('{}/{}'.format(file_path, p), encoding='utf-8') as f:
            next(f)  # skip the header line
            for line in f:  # iterate the file object; no need for readlines()
                tmp.append(line.strip().split(','))
    tmp_df = pd.DataFrame(tmp)
    if model == 'train':
        tmp_df.columns = ['ID', 'lat', 'lon', 'speed', 'direction', 'time', 'type']
    else:
        tmp_df['type'] = 'unknown'
        tmp_df.columns = ['ID', 'lat', 'lon', 'speed', 'direction', 'time', 'type']
    tmp_df['lat'] = tmp_df['lat'].astype(float)
    tmp_df['lon'] = tmp_df['lon'].astype(float)
    tmp_df['speed'] = tmp_df['speed'].astype(float)
    # Casting a string column straight to int needs a reasonably recent pandas
    tmp_df['direction'] = tmp_df['direction'].astype(int)
    return tmp_df
# 平面坐标转经纬度,供初赛数据使用
# 选择标准为NAD83 / California zone 6 (ftUS) (EPSG:2230),查询链接:https://mygeodata.cloud/cs2cs/
def transform_xy2lonlat(df):
    """Convert projected plane coordinates to WGS84 lon/lat, in place.

    The raw files store the plane x in the 'lat' column and the plane y in
    the 'lon' column (per the original variable naming) -- TODO confirm.
    Projection: NAD83 / California zone 6 (ftUS), EPSG:2230.
    """
    easting = df['lat'].values
    northing = df['lon'].values
    projection = Proj('+proj=lcc +lat_1=33.88333333333333 +lat_2=32.78333333333333 +lat_0=32.16666666666666 +lon_0=-116.25 +x_0=2000000.0001016 +y_0=500000.0001016001 +datum=NAD83 +units=us-ft +no_defs ')
    # inverse=True maps projected coordinates back to geographic lon/lat
    df['lon'], df['lat'] = projection(northing, easting, inverse=True)
    return df
#修改数据的时间格式
def reformat_strtime(time_str=None, START_YEAR="2019"):
    """Reformat a raw timestamp like '0814 12:30:45' to '2019-08-14 12:30:45'.

    Parameters
    ----------
    time_str : str
        Raw timestamp of the form 'MMDD HH:MM:SS' (date packed as MMDD).
    START_YEAR : str
        Year prefix to prepend, since the raw data omits the year.

    Returns
    -------
    str in 'YYYY-MM-DD HH:MM:SS' form.
    """
    parts = time_str.split(" ")
    # parts[0] is the packed MMDD date, parts[1] the HH:MM:SS clock time
    return f"{START_YEAR}-{parts[0][:2]}-{parts[0][2:4]} {parts[1]}"
#计算两个点的距离
def haversine_np(lon1, lat1, lon2, lat2, radius_km=6367.0):
    """Vectorized haversine (great-circle) distance in meters.

    Parameters
    ----------
    lon1, lat1, lon2, lat2 : float or numpy array
        Coordinates in decimal degrees; arrays broadcast elementwise.
    radius_km : float
        Earth radius in kilometers.  Generalizes the previously hard-coded
        6367; the default preserves the original behavior exactly.

    Returns
    -------
    float or numpy array -- distance(s) in meters.
    """
    lon1, lat1, lon2, lat2 = map(np.radians, [lon1, lat1, lon2, lat2])
    dlon = lon2 - lon1
    dlat = lat2 - lat1
    # Haversine formula
    a = np.sin(dlat / 2.0) ** 2 + np.cos(lat1) * np.cos(lat2) * np.sin(dlon / 2.0) ** 2
    c = 2 * np.arcsin(np.sqrt(a))
    return radius_km * c * 1000.0  # kilometers -> meters
def compute_traj_diff_time_distance(traj=None):
    """Attach per-point sampling intervals (minutes) and step distances (meters)."""
    # Time gap between each point and its predecessor, in minutes
    later = traj["time"].iloc[1:].reset_index(drop=True)
    earlier = traj["time"].iloc[:-1].reset_index(drop=True)
    time_diff_array = (later - earlier).dt.total_seconds() / 60
    # Great-circle distance between consecutive coordinates
    dist_diff_array = haversine_np(
        traj["lon"].values[1:],   # lon_0
        traj["lat"].values[1:],   # lat_0
        traj["lon"].values[:-1],  # lon_1
        traj["lat"].values[:-1],  # lat_1
    )
    # Pad the first element with the mean so the arrays match the trajectory length
    time_diff_array = [time_diff_array.mean()] + time_diff_array.tolist()
    dist_diff_array = [dist_diff_array.mean()] + dist_diff_array.tolist()
    traj.loc[list(traj.index), 'time_array'] = time_diff_array
    traj.loc[list(traj.index), 'dist_array'] = dist_diff_array
    return traj
#对轨迹进行异常点的剔除
def assign_traj_anomaly_points_nan(traj=None, speed_maximum=23,
                                   time_interval_maximum=200,
                                   coord_speed_maximum=700):
    """Mask and remove anomalous points of one trajectory.

    Steps:
      1. speeds outside [0, speed_maximum] are set to NaN;
      2. points whose sampling interval or coordinate speed (dist/time)
         fails a 3-sigma test are dropped;
      3. points whose lon/lat fall outside mean +/- 3*std are dropped.

    NOTE(review): time_interval_maximum and coord_speed_maximum are unused
    in the visible logic; kept for interface compatibility.

    Returns
    -------
    (traj, [n_removed]) -- the cleaned trajectory and the number of rows
    removed relative to the input, wrapped in a one-element list.
    """

    def _three_sigma_flags(values, n):
        # Flag entries lying outside mean +/- n*std (unused x-axis list of
        # the original helper removed).
        mean = np.mean(values)
        std = np.std(values)
        low, high = mean - n * std, mean + n * std
        return [(v < low) | (v > high) for v in values]

    # Step 1: speed anomaly masking.
    # BUG FIX: the original used chained indexing (traj["speed"][mask] = nan),
    # which can silently write to a copy under modern pandas; use .loc.
    is_speed_anomaly = (traj["speed"] > speed_maximum) | (traj["speed"] < 0)
    traj.loc[is_speed_anomaly, "speed"] = np.nan
    # Step 2: coordinate speed derived from distance / time
    is_anomaly = np.array([False] * len(traj))
    traj["coord_speed"] = traj["dist_array"] / traj["time_array"]
    # Condition 1: 3-sigma filter on sampling interval and coordinate speed
    is_anomaly_tmp = (pd.Series(_three_sigma_flags(traj["time_array"], 3))
                      | pd.Series(_three_sigma_flags(traj["coord_speed"], 3)))
    is_anomaly = is_anomaly | is_anomaly_tmp
    is_anomaly.index = traj.index
    traj = traj[~is_anomaly].reset_index(drop=True)
    # Condition 2: 3-sigma filter on the coordinates themselves
    is_anomaly = np.array([False] * len(traj))
    if len(traj) != 0:
        lon_std, lon_mean = traj["lon"].std(), traj["lon"].mean()
        lat_std, lat_mean = traj["lat"].std(), traj["lat"].mean()
        lon_low, lon_high = lon_mean - 3 * lon_std, lon_mean + 3 * lon_std
        lat_low, lat_high = lat_mean - 3 * lat_std, lat_mean + 3 * lat_std
        is_anomaly = is_anomaly | (traj["lon"] > lon_high) | (traj["lon"] < lon_low)
        is_anomaly = is_anomaly | (traj["lat"] > lat_high) | (traj["lat"] < lat_low)
        traj = traj[~is_anomaly].reset_index(drop=True)
    return traj, [len(is_speed_anomaly) - len(traj)]