本文介绍如何使用Python的多线程技术,配合happybase模块和gdal模块,高效地从tif格式的影像文件中读取数据,并将其存储到HBase数据库中。准备工作包括安装happybase和gdal,并启动HBase的thrift服务:
pip install happybase
conda install gdal
hbase-daemon.sh start thrift
完成以上准备后,就可以使用happybase模块连接到thrift服务,并操作HBase数据库了。
import time
import happybase
from osgeo import gdal
import numpy as np
import pandas as pd
import os
from tqdm import tqdm
import threading
def readTif(fileName, xoff=0, yoff=0, data_width=0, data_height=0):
    """Read a window (by default the whole image) from a tif dataset.

    Args:
        fileName: Path of the raster file passed to ``gdal.Open``.
        xoff: Column offset of the window to read.
        yoff: Row offset of the window to read.
        data_width: Width of the window; when both ``data_width`` and
            ``data_height`` are 0 the full raster extent is read.
        data_height: Height of the window (see ``data_width``).

    Returns:
        A tuple ``(width, height, bands, data, geotrans, proj)`` holding the
        raster column count, row count, band count, the array returned by
        ``ReadAsArray``, the affine geotransform and the projection.

    Raises:
        RuntimeError: If GDAL could not open ``fileName``.
    """
    dataset = gdal.Open(fileName)
    # Validate before touching any attribute: a failed open yields None
    # (when GDAL exceptions are not enabled), and reading attributes from it
    # would otherwise fail with an unrelated AttributeError.
    if dataset is None:
        raise RuntimeError(fileName + "文件无法打开")

    # Number of columns of the raster matrix.
    width = dataset.RasterXSize
    # Number of rows of the raster matrix.
    height = dataset.RasterYSize
    # Number of bands.
    bands = dataset.RasterCount

    # No explicit window requested: read the full extent.
    if data_width == 0 and data_height == 0:
        data_width = width
        data_height = height
    data = dataset.ReadAsArray(xoff, yoff, data_width, data_height)

    # Affine transform and projection of the dataset.
    geotrans = dataset.GetGeoTransform()
    proj = dataset.GetProjection()
    return width, height, bands, data, geotrans, proj
# Folder that holds the tiled images; the path must not contain Chinese
# characters.
tifDir = r"E:\pyimg\tif2csv\S2SR10mallband3tile"
tifs = [name for name in os.listdir(tifDir) if name.endswith(".tif")]
print("有 %s 个tif文件" % len(tifs))

# Band names; they are appended to the date prefix to form column names.
bandlist = ['B2', 'B3', 'B4', 'B5', 'B6', 'B7', 'B8', 'B8A', 'B11', 'B12']

# Split every file name into its date prefix (everything except the last 26
# characters) and its tile id (the 21 characters in front of ".tif").
datelist1 = [name[:-26] for name in tifs]
fenkuailist1 = [name[-25:-4] for name in tifs]

# Unique values, ordered by their first occurrence in the directory listing.
datelist = sorted(set(datelist1), key=datelist1.index)
fenkuailist = sorted(set(fenkuailist1), key=fenkuailist1.index)

print("有 %s 个日期" % len(datelist))
print("datelist", datelist)
print("每个日期 %s 个块" % len(fenkuailist))
print("fenkuailist", fenkuailist)

# Module-level connection to the HBase thrift service and the target table.
connection = happybase.Connection('192.168.1.100')
# Open explicitly before first use.
connection.open()
table = connection.table('rawdata')
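定义一个函数load,用于读取一个分块的数据并将其写入HBase。该函数接受分块编号、分块列表和日期列表作为参数。该函数的主要步骤如下:(1)连接HBase的thrift服务并获取表rawdata;(2)依次读取该分块在每个日期的tif影像,组成"波段 × 像元 × 日期"的三维数组;(3)逐像元把各波段、各日期的值组装成以"f1:日期+波段"为列名的字典,并按行键写入HBase;(4)关闭连接。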
def load(kuai, fenkuailist, datelist):
    """Read every date of one tile and write it pixel by pixel to HBase.

    For the tile ``fenkuailist[kuai]`` the tif of each date in ``datelist``
    is read with ``readTif`` and stacked into a
    ``(bands, width * height, dates)`` array. Each pixel then becomes one
    row of the table ``rawdata`` with columns ``"f1:" + date + band`` and
    the values converted to strings.

    The function relies on the module-level names ``tifDir``, ``bandlist``,
    ``readTif`` and ``sem``. The launcher acquires ``sem`` before starting
    each worker thread, so the slot is released here when the work ends,
    whether it succeeded or failed; otherwise the launcher would block
    forever once all slots are taken.

    Args:
        kuai: Index of the tile inside ``fenkuailist``.
        fenkuailist: List of tile ids taken from the file names.
        datelist: List of date prefixes taken from the file names.

    Raises:
        ValueError: If the tifs of one tile do not all have the same
            band/pixel layout.
    """
    try:
        tile_id = fenkuailist[kuai]
        # Each worker uses its own connection to the thrift service.
        connection = happybase.Connection('192.168.1.100')
        try:
            connection.open()
            table = connection.table('rawdata')
            print("(%d/%d)块编号:" % (kuai + 1, len(fenkuailist)), tile_id)

            # Build the cube; it is allocated from the first image so that
            # no file has to be read twice.
            cube = None
            for shijian, date in enumerate(datelist):
                img_file = tifDir + "\\" + date + "-" + tile_id + ".tif"
                im_width, im_height, im_bands, im_data, _, _ = readTif(img_file)
                if cube is None:
                    cube = np.empty((im_bands, im_width * im_height, len(datelist)))
                # One row per band in C order; reshape also covers
                # single-band rasters, which come back as a 2-D array.
                band_rows = im_data.reshape(im_bands, -1)
                if band_rows.shape != cube.shape[:2]:
                    raise ValueError("unexpected raster layout in " + img_file)
                cube[:, :, shijian] = band_rows
            if cube is None:
                # Empty date list: nothing to write.
                return

            n_bands, n_pixels, _ = cube.shape
            # Column names do not depend on the pixel, so build them once.
            columns = [
                ["f1:" + date + bandlist[ban] for date in datelist]
                for ban in range(n_bands)
            ]
            key_prefix = str(kuai % 3) + tile_id[6:10] + tile_id[-4:]

            print("写入中...")
            for index in tqdm(range(n_pixels)):
                dt = {}
                for ban in range(n_bands):
                    dt.update(zip(columns[ban], cube[ban, index, :].astype(str)))
                # Row key first, then the data as a column -> value dict.
                table.put(key_prefix + str(index), dt)
        finally:
            # Close the transport even when reading or writing failed.
            connection.close()
    finally:
        sem.release()
# Start of the timed section.
start = time.perf_counter()
# Limit the number of concurrently running worker threads to 5.
sem = threading.Semaphore(5)
for kuai in range(30, 72):
    # Take one slot before launching the worker for this tile.
    sem.acquire()
    worker = threading.Thread(target=load, args=(kuai, fenkuailist, datelist))
    worker.start()
def load(kuai, fenkuailist, datelist):
    """Read every date of one tile and write it pixel by pixel to HBase.

    For the tile ``fenkuailist[kuai]`` the tif of each date in ``datelist``
    is read with ``readTif`` and stacked into a
    ``(bands, width * height, dates)`` array. Each pixel then becomes one
    row of the table ``rawdata`` with columns ``"f1:" + date + band`` and
    the values converted to strings.

    The function relies on the module-level names ``tifDir``, ``bandlist``,
    ``readTif`` and ``sem``. The semaphore slot and the connection are
    released in ``finally`` blocks so that a failing worker cannot leak
    either of them.

    Args:
        kuai: Index of the tile inside ``fenkuailist``.
        fenkuailist: List of tile ids taken from the file names.
        datelist: List of date prefixes taken from the file names.

    Raises:
        ValueError: If the tifs of one tile do not all have the same
            band/pixel layout.
    """
    try:
        tile_id = fenkuailist[kuai]
        # Each worker uses its own connection to the thrift service.
        connection = happybase.Connection('192.168.1.100')
        try:
            connection.open()
            table = connection.table('rawdata')
            print("(%d/%d)块编号:" % (kuai + 1, len(fenkuailist)), tile_id)

            # Build the cube; it is allocated from the first image so that
            # no file has to be read twice.
            cube = None
            for shijian, date in enumerate(datelist):
                img_file = tifDir + "\\" + date + "-" + tile_id + ".tif"
                im_width, im_height, im_bands, im_data, _, _ = readTif(img_file)
                if cube is None:
                    cube = np.empty((im_bands, im_width * im_height, len(datelist)))
                # One row per band in C order; reshape also covers
                # single-band rasters, which come back as a 2-D array.
                band_rows = im_data.reshape(im_bands, -1)
                if band_rows.shape != cube.shape[:2]:
                    raise ValueError("unexpected raster layout in " + img_file)
                cube[:, :, shijian] = band_rows
            if cube is None:
                # Empty date list: nothing to write.
                return

            n_bands, n_pixels, _ = cube.shape
            # Column names do not depend on the pixel, so build them once.
            columns = [
                ["f1:" + date + bandlist[ban] for date in datelist]
                for ban in range(n_bands)
            ]
            key_prefix = str(kuai % 3) + tile_id[6:10] + tile_id[-4:]

            print("写入中...")
            for index in tqdm(range(n_pixels)):
                dt = {}
                for ban in range(n_bands):
                    dt.update(zip(columns[ban], cube[ban, index, :].astype(str)))
                # Row key first, then the data as a column -> value dict.
                table.put(key_prefix + str(index), dt)
        finally:
            # Close the transport even when reading or writing failed.
            connection.close()
    finally:
        # Always give the slot back so the launcher can start the next tile.
        sem.release()
# End of the timed section. Wait for the worker threads that are still
# running first; otherwise the clock stops as soon as the last thread has
# been started. Daemon threads (e.g. background monitor threads) are skipped
# because they may never terminate.
for running in threading.enumerate():
    if running is not threading.current_thread() and not running.daemon:
        running.join()
delta = time.perf_counter() - start
print("程序运行的时间是:{}秒".format(delta))