本文介绍如何使用pyhdfs包向hdfs中上传文件,主要用来测试hdfs当前服务的状态。同时,脚本支持namenode ha的方式。
pip3 install pyhdfs
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# 本脚本的作用:每五秒钟创建一个1MB大小的文件,以年月日时分秒.txt作为文件名称,并上传至hdfs。
import os
import time
from datetime import datetime
from pyhdfs import HdfsClient
# HDFS connection settings.
# User name the client presents to HDFS.
hdfs_user = 'hdfs'
# Two NameNode addresses (host:port); both are handed to the client so the
# script keeps working with a NameNode HA pair.
nn1 = "node01:50070"
nn2 = "node03:50070"
# Shared client instance used by the upload routine below.
client = HdfsClient(
    hosts=[nn1, nn2],
    user_name=hdfs_user,
)
def create_and_upload_file():
    """Create a 1 MiB local file named after the current time and upload it to HDFS.

    The file is written to ``/opt/apps/hdfs_tempfile/<YYYYmmddHHMMSS>.txt`` and
    copied to ``/hdfs_tempfile/<YYYYmmddHHMMSS>.txt`` through the module-level
    ``client``, overwriting any existing HDFS file of the same name.

    Any exception raised while creating or uploading the file is caught and
    printed rather than propagated, so a caller looping over this function
    keeps running. The local file is removed whether or not the upload
    succeeded.
    """
    # Second-resolution timestamp doubles as the file name.
    current_time = datetime.now().strftime('%Y%m%d%H%M%S')
    file_name = f"{current_time}.txt"
    local_dir = "/opt/apps/hdfs_tempfile"
    file_path = f"{local_dir}/{file_name}"
    hdfs_path = f"/hdfs_tempfile/{file_name}"
    try:
        # Make sure the local scratch directory exists before writing to it.
        os.makedirs(local_dir, exist_ok=True)
        # Seek to the last byte and write a single zero byte: this yields a
        # 1 MiB file without building a 1 MiB buffer in memory.
        with open(file_path, 'wb') as file:
            file.seek(1 * 1024 * 1024 - 1)
            file.write(b'\0')
        # Upload the file to HDFS.
        client.copy_from_local(file_path, hdfs_path, overwrite=True)
        print(f"File {file_name} uploaded to HDFS successfully.")
    except Exception as e:
        # Deliberate best-effort: report the failure and let the caller retry.
        print(f"Error: {e}")
    finally:
        # Always clean up the local file, even when the upload failed, so
        # repeated failures do not pile up 1 MiB files on the local disk.
        try:
            os.remove(file_path)
        except FileNotFoundError:
            pass  # The file was never created; nothing to clean up.
        except OSError as e:
            print(f"Error: {e}")
# Create and upload one file, pause five seconds, and repeat until the user
# interrupts the script (Ctrl+C).
while True:
    try:
        create_and_upload_file()
        time.sleep(5)
    except KeyboardInterrupt:
        print("Script execution interrupted by user.")
        break
python3 pyhdfs_into_hdfs.py