The idea: multiprocessing.cpu_count() gets the number of CPUs on the current system, and the file is split into the same number of chunks. A Process instance is created for each chunk, and each one calls the chunk-download function concurrently; by giving every process its own start byte and end byte, the remote file is downloaded in slices. Inside each process, sftp_file.seek() and file.seek() move the remote and local file to the same offset, which ensures the correct chunk is downloaded to the correct position. Finally, process.join() waits for all processes to finish, and a completion message is printed at the end.

import paramiko
import multiprocessing
import time

def download_chunk_file(start_pos, end_pos, remote_path, local_path, ssh_info):
    print("download_chunk_file start")
    client = paramiko.SSHClient()
    client.set_missing_host_key_policy(paramiko.AutoAddPolicy())
    client.connect(**ssh_info)
    sftp = client.open_sftp()
    # open both local and remote file
    local_file = open(local_path, "r+b")
    remote_file = sftp.open(remote_path, "rb")
    # seek both files to the same start position
    local_file.seek(start_pos)
    remote_file.seek(start_pos)
    #print("start_pos ~ end_pos: {} ~ {}".format(start_pos, end_pos))
    # read only the bytes assigned to this chunk, so the process stops
    # at end_pos instead of reading on to the end of the file
    bytes_remaining = end_pos - start_pos
    while bytes_remaining > 0:
        buffer = remote_file.read(bytes_remaining)
        if not buffer:
            break
        # write the chunk to the local file at the matching offset
        local_file.write(buffer)
        bytes_remaining -= len(buffer)
    print("chunk file with start_pos ~ end_pos: {} ~ {}, Download successfully!".format(start_pos, end_pos))
    remote_file.close()
    local_file.close()
    client.close()
    print("download_chunk_file end")

def main():
    print("main start")
    host = "host"
    port = 22
    username = "username"
    password = "password"
    remote_path = '/remote_dir/remote_file'
    local_path = '/local_dir/local_file'
    ssh_info = {
        "hostname": host,
        "port": port,
        "username": username,
        "password": password,
    }
    client = paramiko.SSHClient()
    client.set_missing_host_key_policy(paramiko.AutoAddPolicy())
    client.connect(**ssh_info)
    sftp = client.open_sftp()
    file_size = sftp.stat(remote_path).st_size
    sftp.close()
    client.close()
    # use one process per CPU
    processes = multiprocessing.cpu_count()
    #processes = 1
    print("number of CPU is {}".format(processes))
    # calculate the chunk size; the first n-1 processes each handle one
    # chunk_size slice, and the last process also handles the remaining bytes
    chunk_size = file_size // processes
    # create an empty local file with the same size as the remote file
    with open(local_path, "wb") as f:
        f.truncate(file_size)
    multiprocess_download_start = time.time()
    process_list = []
    for i in range(processes):
        #print("process: {}".format(i))
        start_pos = i * chunk_size
        # the last process takes everything up to the end of the file
        end_pos = (i + 1) * chunk_size if i < processes - 1 else file_size
        # one process per chunk
        p = multiprocessing.Process(target=download_chunk_file, args=(start_pos, end_pos, remote_path, local_path, ssh_info))
        p.start()
        print(p)
        process_list.append(p)
    # wait for all the processes to finish
    for p in process_list:
        p.join()
        print(p)
    multiprocess_download_end = time.time()
    multiprocess_download_cost = multiprocess_download_end - multiprocess_download_start
    print("Full file Download successfully! Cost: {:.2f}s".format(multiprocess_download_cost))
    print("main end")

if __name__ == "__main__":
    main()
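The same logic can be refactored into reusable functions, with timing added around each read and write to show where the time is actually spent: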
import paramiko
import multiprocessing
import time

def get_remote_file_size(ssh_info, remote_path):
    ssh = paramiko.SSHClient()
    ssh.set_missing_host_key_policy(paramiko.AutoAddPolicy())
    ssh.connect(**ssh_info)
    sftp = ssh.open_sftp()
    # get remote file size
    remote_file_size = sftp.stat(remote_path).st_size
    print("remote_file_size:{}".format(remote_file_size))
    sftp.close()
    ssh.close()
    return remote_file_size

def download_chunk_file(ssh_info, remote_path, local_path, start_pos, end_pos):
    print("download_chunk_file start")
    client = paramiko.SSHClient()
    client.set_missing_host_key_policy(paramiko.AutoAddPolicy())
    client.connect(**ssh_info)
    sftp = client.open_sftp()
    # open both local and remote file
    local_file = open(local_path, "r+b")
    remote_file = sftp.open(remote_path, "rb")
    # seek both files to the same start position
    local_file.seek(start_pos)
    remote_file.seek(start_pos)
    # print("start_pos ~ end_pos: {} ~ {}".format(start_pos, end_pos))
    # read only the bytes assigned to this chunk
    bytes_remaining = end_pos - start_pos
    while bytes_remaining > 0:
        read_start = time.time()
        buffer = remote_file.read(bytes_remaining)
        if not buffer:
            break
        print("read cost time {:.2f}s".format(time.time() - read_start))
        # write the chunk to the local file
        write_start = time.time()
        local_file.write(buffer)
        print("write cost time {:.2f}s".format(time.time() - write_start))
        bytes_remaining -= len(buffer)
    print("chunk file with start_pos ~ end_pos: {} ~ {}, Download successfully!".format(start_pos, end_pos))
    remote_file.close()
    local_file.close()
    client.close()
    print("download_chunk_file end")

def download_multiprocessing(ssh_info, remote_path, local_path):
    # use one process per CPU
    num_processes = multiprocessing.cpu_count()
    #num_processes = 1
    print("number of CPU is {}, number of process is {}".format(multiprocessing.cpu_count(), num_processes))
    # get remote file size
    file_size = get_remote_file_size(ssh_info, remote_path)
    # create a new empty local file with the same size as the remote file
    with open(local_path, "wb") as f:
        f.truncate(file_size)
    # calculate the chunk size; the first n-1 processes each handle one
    # chunk_size slice, and the last process handles the remaining bytes
    chunk_size = file_size // num_processes
    print("chunk_size is {}".format(chunk_size))
    processes = []
    # create a process for each chunk
    for index in range(num_processes):
        #print("process: {}".format(index))
        start_pos = index * chunk_size
        end_pos = start_pos + chunk_size
        # the last process downloads the remaining bytes up to the end of the file
        if index == num_processes - 1:
            end_pos = file_size
        args = (ssh_info, remote_path, local_path, start_pos, end_pos)
        process = multiprocessing.Process(target=download_chunk_file, args=args)
        process.start()
        print(process)
        processes.append(process)
    # wait for all the processes to finish
    for process in processes:
        process.join()
        print(process)

def main():
    host = "host"
    port = 22
    username = "username"
    password = "password"
    remote_path = '/remote_dir/remote_file'
    local_path = '/local_dir/local_file'
    ssh_info = {
        "hostname": host,
        "port": port,
        "username": username,
        "password": password,
    }
    multiprocess_download_start = time.time()
    download_multiprocessing(ssh_info, remote_path, local_path)
    multiprocess_download_end = time.time()
    multiprocess_download_cost = multiprocess_download_end - multiprocess_download_start
    print("Full file Download successfully! Cost: {:.2f}s".format(multiprocess_download_cost))

if __name__ == "__main__":
    main()
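A sample run against a roughly 60 MB remote file on a 4-CPU machine: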
$ python multi_process_download_single_bigfile_def.py
number of CPU is 4
remote_file_size:63376366
chunk_size is 15844091
download_chunk_file start
download_chunk_file start
download_chunk_file start
download_chunk_file start
read cost time 6.19s
write cost time 0.01s
read cost time 6.22s
write cost time 0.01s
read cost time 6.20s
write cost time 0.01s
read cost time 0.00s
write cost time 0.00s
read cost time 0.00s
chunk file with start_pos ~ end_pos: 47532273 ~ 63376365, Download successfully!
download_chunk_file end
read cost time 6.24s
write cost time 0.01s
read cost time 4.25s
write cost time 0.01s
read cost time 4.36s
write cost time 0.01s
read cost time 4.34s
write cost time 0.01s
read cost time 0.03s
write cost time 0.00s
read cost time 0.00s
chunk file with start_pos ~ end_pos: 31688182 ~ 47532273, Download successfully!
download_chunk_file end
read cost time 4.26s
write cost time 0.01s
read cost time 0.00s
write cost time 0.00s
read cost time 0.00s
chunk file with start_pos ~ end_pos: 15844091 ~ 31688182, Download successfully!
download_chunk_file end
read cost time 4.39s
write cost time 0.01s
read cost time 4.29s
write cost time 0.01s
read cost time 0.00s
write cost time 0.00s
read cost time 0.00s
chunk file with start_pos ~ end_pos: 0 ~ 15844091, Download successfully!
download_chunk_file end
Full file Download successfully! Cost: 19.62s
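Note that in this run each full-chunk read costs 4~6s while each local write costs around 0.01s, so the transfer is clearly network-bound; overlapping the four chunk reads across processes is exactly where the multiprocess version gains its speedup.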
For single-process downloads, it is better to use getfo() directly; in actual testing it is much faster than the read()/write() approach.
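For comparison, here is a minimal single-process sketch using SFTPClient.getfo(), reusing the same placeholder host, credentials, and paths as above:

import paramiko

ssh_info = {
    "hostname": "host",
    "port": 22,
    "username": "username",
    "password": "password",
}
client = paramiko.SSHClient()
client.set_missing_host_key_policy(paramiko.AutoAddPolicy())
client.connect(**ssh_info)
sftp = client.open_sftp()
# getfo() streams the remote file into a local file object and
# returns the number of bytes transferred
with open('/local_dir/local_file', "wb") as local_file:
    size = sftp.getfo('/remote_dir/remote_file', local_file)
print("downloaded {} bytes".format(size))
sftp.close()
client.close()

getfo() prefetches blocks of the remote file in a background thread, which is why it usually beats a hand-written read()/write() loop over a single connection.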