Python offers several interfaces for synchronizing files with HDFS, such as pyhdfs, hdfs, libhdfs, pyarrow, and plain shell commands. Considering ease of use and our local environment (the company cluster has the HTTP service disabled), this post shows how to synchronize local files with HDFS in a clean way using pyarrow.
Online installation
pip install pyarrow
Offline installation
pyarrow - official downloads
After downloading the .whl file, install it with pip install pyarrow-9.0.0-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl
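A quick check confirms the package imports cleanly and reports the installed version (a minimal sketch):

import pyarrow

# Should print the installed version, e.g. 9.0.0
print(pyarrow.__version__)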
# Adjust to your own environment; the following is for Linux
export PATH=$JAVA_HOME/bin:$HBASE_HOME/bin:$HADOOP_HOME/bin:$HIVE_HOME/bin:$SPARK_HOME/bin:$SQOOP_HOME/bin:$PATH:${HOME}/.local/bin:${DISTCP_HOME}:$JAVA_HOME/jre/bin:$PATH
export CLASSPATH=`$HADOOP_HOME/bin/hadoop classpath --glob`
export HADOOP_LIBEXEC_DIR=${HADOOP_HOME}/libexec
For reference, from the official docs:
export CLASSPATH=`$HADOOP_HOME/bin/hadoop classpath --glob`
# or on Windows
%HADOOP_HOME%/bin/hadoop classpath --glob > %CLASSPATH%
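With the environment variables in place, a short connectivity check confirms that the Hadoop class path and libhdfs bindings load correctly; host, port and user below are placeholders for your cluster:

from pyarrow import fs

# Assumes CLASSPATH / HADOOP_HOME above are exported in the current shell
hdfs = fs.HadoopFileSystem(host="namenode-host", port=8020, user="hdfs-user")
# Should print a FileInfo entry for the HDFS root directory
print(hdfs.get_file_info("/"))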
HdfsPath and LocalPath are custom classes whose main job is to hold fs.HadoopFileSystem(host="***", port=***, user="***", kerb_ticket=None) and fs.LocalFileSystem objects. The two functions below are fairly complete examples; a simpler class for single-file copies follows at the end.
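The HdfsPath and LocalPath wrappers themselves are not listed in this post. Below is a minimal sketch of what they might look like; the field names and the methods get_hdfs, get_local, isFile and isDir are assumptions inferred from how the functions below use them, and host/port/user are placeholders:

from dataclasses import dataclass

from pyarrow import fs


@dataclass
class HdfsPath:
    # Hypothetical wrapper; the real class is project-specific.
    path: str
    host: str = "***"
    port: int = 8020
    user: str = "***"

    def get_hdfs(self) -> fs.HadoopFileSystem:
        return fs.HadoopFileSystem(host=self.host, port=self.port,
                                   user=self.user, kerb_ticket=None)

    def isFile(self) -> bool:
        return self.get_hdfs().get_file_info(self.path).type == fs.FileType.File

    def isDir(self) -> bool:
        return self.get_hdfs().get_file_info(self.path).type == fs.FileType.Directory


@dataclass
class LocalPath:
    # Hypothetical wrapper around the local filesystem.
    path: str

    def get_local(self) -> fs.LocalFileSystem:
        return fs.LocalFileSystem()

    def isFile(self) -> bool:
        return self.get_local().get_file_info(self.path).type == fs.FileType.File

    def isDir(self) -> bool:
        return self.get_local().get_file_info(self.path).type == fs.FileType.Directory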
import logging
from pathlib import Path

from pyarrow import fs

_logger = logging.getLogger(__name__)


def hdfs_to_local(hdfs_path: HdfsPath, local_path: LocalPath):
    """
    Synchronize from HDFS to local. Two cases are handled:
    1) a single file is streamed straight into a local file;
    2) for a directory, a FileSelector collects every entry under it and each
       file is then transferred as in 1).
    :param hdfs_path: source path on HDFS
    :param local_path: destination path on the local filesystem
    :return:
    """
    hdfs = hdfs_path.get_hdfs()
    local = local_path.get_local()
    _logger.info(f"Copying HDFS path {hdfs_path.path} to local path {local_path.path}.")
    # Normalize both paths by stripping any trailing slash
    if hdfs_path.path.endswith('/'):
        hdfs_path.path = hdfs_path.path[:-1]
    if local_path.path.endswith('/'):
        local_path.path = local_path.path[:-1]
    len_hdfs_path = len(str(Path(hdfs_path.path).absolute().parent))
    file_list = []
    # A single file is transferred relative to its parent directory
    if hdfs_path.isFile():
        file_list.append(hdfs.get_file_info(hdfs_path.path))
    elif hdfs_path.isDir():
        file_select = fs.FileSelector(hdfs_path.path, allow_not_found=True, recursive=True)
        file_list = hdfs.get_file_info(file_select)
    _logger.info(f"{len(file_list)} entries to copy...")
    for file in file_list:
        local_filename = local_path.path + file.path[len_hdfs_path:]
        # Recreate directories, stream regular files
        if file.type == fs.FileType.Directory:
            local.create_dir(local_filename)
        elif file.type == fs.FileType.File:
            # open_output_stream overwrites any existing destination file
            with hdfs.open_input_stream(file.path) as read_file, local.open_output_stream(
                    local_filename) as write_file:
                write_file.write(read_file.readall())
        else:
            print("hdfs_file -> local_file download error!")
            raise Exception(f"\n{file} not found or of unknown type.\n")
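A short usage sketch for the download direction, assuming the hypothetical HdfsPath/LocalPath wrappers sketched earlier; all paths, host, port and user are placeholders:

# Copy a single file or a whole directory tree from HDFS to local disk
src = HdfsPath(path="/user/***/input_dir", host="***", port=8020, user="***")
dst = LocalPath(path="/tmp/input_dir")
hdfs_to_local(src, dst)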
def local_to_hdfs(local_path: LocalPath, hdfs_path: HdfsPath):
    """
    Same logic as hdfs_to_local, in the opposite direction.
    :param local_path: source path on the local filesystem
    :param hdfs_path: destination path on HDFS
    :return:
    """
    _logger.info(f"Copying local path {local_path.path} to HDFS path {hdfs_path.path}.")
    local = local_path.get_local()
    hdfs = hdfs_path.get_hdfs()
    # Normalize both paths by stripping any trailing slash
    if hdfs_path.path.endswith('/'):
        hdfs_path.path = hdfs_path.path[:-1]
    if local_path.path.endswith('/'):
        local_path.path = local_path.path[:-1]
    len_local_path = len(str(Path(local_path.path).absolute().parent))
    file_list = []
    # A single file is transferred relative to its parent directory
    if local_path.isFile():
        file_list.append(local.get_file_info(local_path.path))
    elif local_path.isDir():
        file_select = fs.FileSelector(local_path.path, allow_not_found=True, recursive=True)
        file_list = local.get_file_info(file_select)
    _logger.info(f"{len(file_list)} entries to copy...")
    for file in file_list:
        hdfs_filename = hdfs_path.path + file.path[len_local_path:]
        # Recreate directories, stream regular files
        if file.type == fs.FileType.Directory:
            hdfs.create_dir(hdfs_filename)
        elif file.type == fs.FileType.File:
            with local.open_input_stream(file.path) as read_file, hdfs.open_output_stream(
                    hdfs_filename) as write_file:
                write_file.write(read_file.readall())
        else:
            print("local_file -> hdfs_file upload error!")
            raise Exception(f"\n{file} not found or of unknown type.\n")
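The upload direction works the same way (again with placeholder paths):

# Mirror a local directory (or single file) into HDFS
src = LocalPath(path="/tmp/output_dir")
dst = HdfsPath(path="/user/***/output_dir", host="***", port=8020, user="***")
local_to_hdfs(src, dst)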
The simpler class below can be used to copy a single file.
from abc import ABCMeta

from pyarrow import fs
from pyarrow.filesystem import FileSystem  # legacy pyarrow module, kept for compatibility with the original code


class FileOperation(FileSystem, metaclass=ABCMeta):
    def __init__(self, host='localhost', port=9000, user='hdp-loan-dataai'):
        self.host = host
        self.port = port
        self.user = user
        self.hdfs = fs.HadoopFileSystem(host=host, port=port, user=user)
        self.local = fs.LocalFileSystem()

    def hdfs2local(self, src_path, des_path):
        """
        :param src_path: HDFS file path
        :param des_path: local directory path
        :return:
        """
        filename = src_path.split('/')[-1]
        try:
            with self.hdfs.open_input_stream(src_path) as read_file, self.local.open_output_stream(
                    des_path + '/' + filename) as write_file:
                write_file.write(read_file.readall())
        except Exception as e:
            print("hdfs_file -> local_file download error!", e)

    def local2hdfs(self, src_path, des_path):
        """
        :param src_path: local file path
        :param des_path: HDFS directory path
        :return:
        """
        filename = src_path.split('/')[-1]
        # self.hdfs.create_dir(des_path + '/' + filename)
        with self.local.open_input_stream(src_path) as read_file, self.hdfs.open_output_stream(
                des_path + '/' + filename) as write_file:
            write_file.write(read_file.readall())
pyarrow - official downloads
pyarrow API reference