学习目标:
1.了解和学习shapely和geopandas的基本功能,掌握用python中的这两个库实现几何对象之间的空间操作方法。
2.掌握folium和kepler.gl的数据可视化工具的使用。
3.学习与掌握geohash编码方法。
导入库:
from shapely import geometry as geo
from shapely import wkt
from shapely import ops
import numpy as np
LineStrings
# --- LineString demo: distances, basic properties, derived geometries ---
vertices = np.array([(0, 0), (1, 1), (1, 0)])
line = geo.LineString(vertices)  # same as geo.LineString([(0, 0), (1, 1), (1, 0)])
probe = geo.Point(2, 2)
# distance() works point-to-line as well as line-to-line
print(f'两个几何对象之间的距离:{probe.distance(line)}')
# hausdorff_distance() is the largest distance from the point to the line
print(f'两个几何对象之间的hausdorff_distance距离:{probe.hausdorff_distance(line)}')
print(f'该几何对象的面积:{line.area}')        # a LineString has zero area
print(f'该几何对象的坐标范围:{line.bounds}')   # (minx, miny, maxx, maxy)
print(f'该几何对象的长度:{line.length}')
print(f'该几何对象的几何类型:{line.geom_type}')
print(f'该几何对象的坐标系:{list(line.coords)}')  # vertex coordinates
center = line.centroid                            # geometric centroid
geo.GeometryCollection([line, center])
bbox = line.envelope                              # axis-aligned minimum bounding rectangle
geo.GeometryCollection([line, bbox])
rect = line.minimum_rotated_rectangle             # minimum rotated bounding rectangle
geo.GeometryCollection([line, rect])
pt_half = line.interpolate(0.5, normalized=True)  # point at half the normalized arc length
geo.GeometryCollection([line, pt_half])
line1 = geo.LineString([(0, 0), (1, -0.2), (2, 0.3), (3, -0.5), (5, 0.2), (7, 0)])
line1_simplify = line1.simplify(0.4, preserve_topology=False)  # Douglas-Peucker simplification
buffer_with_circle = line1.buffer(0.2)            # endpoints get round (half-circle) caps
geo.GeometryCollection([line1, buffer_with_circle])
LinearRings
# --- LinearRing demo: the closed counterpart of the LineString above ---
ring_vertices = [(0, 0), (1, 1), (1, 0)]
ring = geo.polygon.LinearRing(ring_vertices)
# The ring is implicitly closed, so its length (~3.41) exceeds the open
# LineString's length from the previous example.
print(ring.length)
print(ring.area)  # a LinearRing is a curve, so its area is 0.0
geo.GeometryCollection([ring])
from shapely.geometry import Polygon

# --- Polygon demo: a simple polygon vs. a polygon with a hole ---
polygon1 = Polygon([(0, 0), (1, 1), (1, 0)])
# Exterior shell plus one interior ring (hole).
# FIX: the hole list was named "int", shadowing the builtin -- renamed.
exterior = [(0, 0), (0, 2), (2, 2), (2, 0), (0, 0)]
interior = [(1, 0), (0.5, 0.5), (1, 1), (1.5, 0.5), (1, 0)]
polygon2 = Polygon(exterior, [interior])
print(polygon1.area)
print(polygon1.length)
print(polygon2.area)    # shell area minus hole area
print(polygon2.length)  # shell perimeter plus hole perimeter
# FIX: np.array() on the ring object itself was removed in Shapely 2.0;
# go through the .coords accessor instead.
print(np.array(polygon2.exterior.coords))  # exterior vertex coordinates
geo.GeometryCollection([polygon2])
几何对象关系
# --- Geometry predicates: contains() ---
coords = [(0, 0), (1, 1)]
# BUG FIX: LineString and Point were used as bare names, but only the
# "geo" module alias is imported above -- qualify them to avoid NameError.
print(geo.LineString(coords).contains(geo.Point(0.5, 0.5)))  # line vs. point
# A line's boundary is not part of its interior, hence False here:
print(geo.LineString(coords).contains(geo.Point(1.0, 1.0)))
polygon1 = Polygon([(0, 0), (0, 2), (2, 2), (2, 0), (0, 0)])
print(polygon1.contains(geo.Point(1.0, 1.0)))  # polygon vs. point
# contains() extends likewise to polygon-line and polygon-polygon tests
geo.GeometryCollection([polygon1, geo.Point(1.0, 1.0)])
# Build the convex hull of six points and collect it for display.
points1 = geo.MultiPoint([(0, 0), (1, 1), (0, 2), (2, 2), (3, 1), (1, 0)])
hull1 = points1.convex_hull
geo.GeometryCollection([hull1, points1])
# object.intersection returns the region shared by the two geometries
polygon1 = Polygon([(0, 0), (0, 2), (2, 2), (2, 0), (0, 0)])
hull1.intersection(polygon1)
import numpy as np

# FIX: the shapely.geometry asPoint/asLineString/asMultiPoint/asPolygon
# adapter functions were deprecated in Shapely 1.8 and removed in 2.0.
# The regular constructors accept numpy arrays directly.
pa = geo.Point(np.array([0.0, 0.0]))                                  # numpy -> Point
la = geo.LineString(np.array([[1.0, 2.0], [3.0, 4.0]]))               # numpy -> LineString
ma = geo.MultiPoint(np.array([[1.1, 2.2], [3.3, 4.4], [5.5, 6.6]]))   # numpy -> MultiPoint
pg = geo.Polygon(np.array([[1.1, 2.2], [3.3, 4.4], [5.5, 6.6]]))      # numpy -> Polygon
# FIX: np.array(Point) was also removed in Shapely 2.0; use the coords accessor.
print(np.array(pa.coords[0]))  # Point -> numpy
geopandas
导入库:
import pandas as pd
import geopandas
import matplotlib.pyplot as plt
# read_file() loads a shapefile into GeoSeries/GeoDataFrame structures.
# NOTE(review): geopandas.datasets was removed in geopandas 1.0 -- this line
# requires geopandas < 1.0 (or the naturalearth_lowres data on disk); confirm.
world = geopandas.read_file(geopandas.datasets.get_path('naturalearth_lowres'))
world.plot()  # render the GeoDataFrame as a world map
plt.show()
# Choropleth: color each country polygon by its 'pop_est' population estimate
fig, ax = plt.subplots(figsize=(9,6),dpi = 100)
world.plot('pop_est',ax = ax,legend = True)
plt.show()
Folium
import folium
import os
# Create a map centered on Beijing; zoom_start is the initial zoom level
# (larger values zoom in further).
m=folium.Map(location=[39.9,116.4],zoom_start=10)
import folium
import numpy as np
from folium.plugins import HeatMap
# Hand-built heatmap sample: each row is [latitude, longitude, weight],
# centered near (48, 5) via the additive offset.
data=(np.random.normal(size=(100,3))*np.array([[1,1,1]])+np.array([[48,5,1]])).tolist()
Kepler.gl
import pandas as pd
import geopandas as gpd
from pyproj import Proj
from keplergl import KeplerGl
from tqdm import tqdm
import os
import matplotlib.pyplot as plt
import shapely
import numpy as np
from datetime import datetime
import warnings
warnings.filterwarnings('ignore')
plt.rcParams['font.sans-serif'] = ['SimSun'] # default font SimSun so Chinese labels render
plt.rcParams['axes.unicode_minus'] = False # render '-' correctly when saving figures
#获取文件夹中的数据
def get_data(file_path, model):
    """Load every raw text file under *file_path* into one DataFrame.

    Parameters
    ----------
    file_path : str
        Directory containing the raw per-trajectory CSV-style files
        (first line of each file is a header and is skipped).
    model : str
        'train' -> files already carry a trailing label column ('type');
        'test'  -> no label column; a placeholder 'unknown' type is added.

    Returns
    -------
    pandas.DataFrame
        Columns: ID, lat, lon, speed, direction, time, type, with
        lat/lon/speed cast to float and direction cast to int.
    """
    assert model in ['train', 'test'], '{} Not Support this type of file'.format(model)
    paths = os.listdir(file_path)
    tmp = []
    for p in tqdm(paths):  # iterate the filenames directly (no index loop)
        with open('{}/{}'.format(file_path, p), encoding='utf-8') as f:
            next(f)  # skip the header line
            for line in f:  # iterate the file object; no need for readlines()
                tmp.append(line.strip().split(','))
    tmp_df = pd.DataFrame(tmp)
    if model == 'train':
        tmp_df.columns = ['ID', 'lat', 'lon', 'speed', 'direction', 'time', 'type']
    else:
        tmp_df['type'] = 'unknown'
        tmp_df.columns = ['ID', 'lat', 'lon', 'speed', 'direction', 'time', 'type']
    tmp_df['lat'] = tmp_df['lat'].astype(float)
    tmp_df['lon'] = tmp_df['lon'].astype(float)
    tmp_df['speed'] = tmp_df['speed'].astype(float)
    # Casting a string column straight to int needs a reasonably recent pandas
    tmp_df['direction'] = tmp_df['direction'].astype(int)
    return tmp_df
# 平面坐标转经纬度,供初赛数据使用
# 选择标准为NAD83 / California zone 6 (ftUS) (EPSG:2230),查询链接:https://mygeodata.cloud/cs2cs/
def transform_xy2lonlat(df):
    """Convert projected plane coordinates to WGS84 lon/lat, in place.

    The raw files store the plane x in the 'lat' column and the plane y in
    the 'lon' column (per the original variable naming) -- TODO confirm.
    Projection: NAD83 / California zone 6 (ftUS), EPSG:2230.
    """
    easting = df['lat'].values
    northing = df['lon'].values
    projection = Proj('+proj=lcc +lat_1=33.88333333333333 +lat_2=32.78333333333333 +lat_0=32.16666666666666 +lon_0=-116.25 +x_0=2000000.0001016 +y_0=500000.0001016001 +datum=NAD83 +units=us-ft +no_defs ')
    # inverse=True maps projected coordinates back to geographic lon/lat
    df['lon'], df['lat'] = projection(northing, easting, inverse=True)
    return df
#修改数据的时间格式
def reformat_strtime(time_str=None, START_YEAR="2019"):
    """Reformat a raw timestamp like '0814 12:30:45' to '2019-08-14 12:30:45'.

    Parameters
    ----------
    time_str : str
        Raw timestamp of the form 'MMDD HH:MM:SS' (date packed as MMDD).
    START_YEAR : str
        Year prefix to prepend, since the raw data omits the year.

    Returns
    -------
    str in 'YYYY-MM-DD HH:MM:SS' form.
    """
    parts = time_str.split(" ")
    # parts[0] is the packed MMDD date, parts[1] the HH:MM:SS clock time
    return f"{START_YEAR}-{parts[0][:2]}-{parts[0][2:4]} {parts[1]}"
#计算两个点的距离
def haversine_np(lon1, lat1, lon2, lat2, radius_km=6367.0):
    """Vectorized haversine (great-circle) distance in meters.

    Parameters
    ----------
    lon1, lat1, lon2, lat2 : float or numpy array
        Coordinates in decimal degrees; arrays broadcast elementwise.
    radius_km : float
        Earth radius in kilometers.  Generalizes the previously hard-coded
        6367; the default preserves the original behavior exactly.

    Returns
    -------
    float or numpy array -- distance(s) in meters.
    """
    lon1, lat1, lon2, lat2 = map(np.radians, [lon1, lat1, lon2, lat2])
    dlon = lon2 - lon1
    dlat = lat2 - lat1
    # Haversine formula
    a = np.sin(dlat / 2.0) ** 2 + np.cos(lat1) * np.cos(lat2) * np.sin(dlon / 2.0) ** 2
    c = 2 * np.arcsin(np.sqrt(a))
    return radius_km * c * 1000.0  # kilometers -> meters
def compute_traj_diff_time_distance(traj=None):
    """Attach per-point sampling intervals (minutes) and step distances (meters)."""
    # Time gap between each point and its predecessor, in minutes
    later = traj["time"].iloc[1:].reset_index(drop=True)
    earlier = traj["time"].iloc[:-1].reset_index(drop=True)
    time_diff_array = (later - earlier).dt.total_seconds() / 60
    # Great-circle distance between consecutive coordinates
    dist_diff_array = haversine_np(
        traj["lon"].values[1:],   # lon_0
        traj["lat"].values[1:],   # lat_0
        traj["lon"].values[:-1],  # lon_1
        traj["lat"].values[:-1],  # lat_1
    )
    # Pad the first element with the mean so the arrays match the trajectory length
    time_diff_array = [time_diff_array.mean()] + time_diff_array.tolist()
    dist_diff_array = [dist_diff_array.mean()] + dist_diff_array.tolist()
    traj.loc[list(traj.index), 'time_array'] = time_diff_array
    traj.loc[list(traj.index), 'dist_array'] = dist_diff_array
    return traj
#对轨迹进行异常点的剔除
def assign_traj_anomaly_points_nan(traj=None, speed_maximum=23,
                                   time_interval_maximum=200,
                                   coord_speed_maximum=700):
    """Mask and remove anomalous points of one trajectory.

    Steps:
      1. speeds outside [0, speed_maximum] are set to NaN;
      2. points whose sampling interval or coordinate speed (dist/time)
         fails a 3-sigma test are dropped;
      3. points whose lon/lat fall outside mean +/- 3*std are dropped.

    NOTE(review): time_interval_maximum and coord_speed_maximum are unused
    in the visible logic; kept for interface compatibility.

    Returns
    -------
    (traj, [n_removed]) -- the cleaned trajectory and the number of rows
    removed relative to the input, wrapped in a one-element list.
    """

    def _three_sigma_flags(values, n):
        # Flag entries lying outside mean +/- n*std (unused x-axis list of
        # the original helper removed).
        mean = np.mean(values)
        std = np.std(values)
        low, high = mean - n * std, mean + n * std
        return [(v < low) | (v > high) for v in values]

    # Step 1: speed anomaly masking.
    # BUG FIX: the original used chained indexing (traj["speed"][mask] = nan),
    # which can silently write to a copy under modern pandas; use .loc.
    is_speed_anomaly = (traj["speed"] > speed_maximum) | (traj["speed"] < 0)
    traj.loc[is_speed_anomaly, "speed"] = np.nan
    # Step 2: coordinate speed derived from distance / time
    is_anomaly = np.array([False] * len(traj))
    traj["coord_speed"] = traj["dist_array"] / traj["time_array"]
    # Condition 1: 3-sigma filter on sampling interval and coordinate speed
    is_anomaly_tmp = (pd.Series(_three_sigma_flags(traj["time_array"], 3))
                      | pd.Series(_three_sigma_flags(traj["coord_speed"], 3)))
    is_anomaly = is_anomaly | is_anomaly_tmp
    is_anomaly.index = traj.index
    traj = traj[~is_anomaly].reset_index(drop=True)
    # Condition 2: 3-sigma filter on the coordinates themselves
    is_anomaly = np.array([False] * len(traj))
    if len(traj) != 0:
        lon_std, lon_mean = traj["lon"].std(), traj["lon"].mean()
        lat_std, lat_mean = traj["lat"].std(), traj["lat"].mean()
        lon_low, lon_high = lon_mean - 3 * lon_std, lon_mean + 3 * lon_std
        lat_low, lat_high = lat_mean - 3 * lat_std, lat_mean + 3 * lat_std
        is_anomaly = is_anomaly | (traj["lon"] > lon_high) | (traj["lon"] < lon_low)
        is_anomaly = is_anomaly | (traj["lat"] > lat_high) | (traj["lat"] < lat_low)
        traj = traj[~is_anomaly].reset_index(drop=True)
    return traj, [len(is_speed_anomaly) - len(traj)]