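# Running the job concurrently (多进程运行程序): a thread-pool version first, followed by two multi-process versions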
import threadpool
import itertools
import paramiko
import time
def splitDict(pingo_table_dict, chunk_count=5, chunk_size=10):
    """Split a dictionary into a fixed number of smaller dictionaries.

    Entries are consumed in the dictionary's iteration order, ``chunk_size``
    at a time.

    Args:
        pingo_table_dict: The mapping to split.
        chunk_count: How many chunks to produce (default 5).
        chunk_size: Maximum number of entries per chunk (default 10).

    Returns:
        A list of exactly ``chunk_count`` dicts. Once the input is exhausted
        the remaining chunks are empty dicts. Entries beyond
        ``chunk_count * chunk_size`` are NOT included in any chunk.
    """
    remaining_items = iter(pingo_table_dict.items())
    # Every islice call continues where the previous one stopped because
    # all chunks share the same iterator.
    return [
        dict(itertools.islice(remaining_items, chunk_size))
        for _ in range(chunk_count)
    ]
def exec_ssh_command(ssh_command, ssh):
    """Run a shell command through an SSH client and return its stdout text.

    Args:
        ssh_command: The command line handed to ``ssh.exec_command``.
        ssh: An object exposing ``exec_command`` (the callers in this file
            pass a connected ``paramiko.SSHClient``).

    Returns:
        Everything the command wrote to standard output, decoded as UTF-8.
    """
    # exec_command yields a (stdin, stdout, stderr) triple; only stdout is read.
    _stdin, out_stream, _stderr = ssh.exec_command(ssh_command)
    raw_output = out_stream.read()
    # Decoding as UTF-8 keeps non-ASCII (e.g. Chinese) output readable.
    return raw_output.decode('utf-8')
def del_value(result):
    """Reduce raw ``hdfs dfs -du -s`` output to the text in front of the path.

    Args:
        result: The decoded stdout of the ``du`` command.

    Returns:
        ``'0 G'`` when the output is empty or starts with ``'0'``; otherwise
        the part of the output that precedes ``" hdfs:"`` (for output that
        mentions ``hdfs``) or ``" /pfs"`` (for output that mentions ``pfs``).
        Any other output yields a fixed Chinese placeholder string meaning
        "some other kind of result".
    """
    # No output at all and a leading zero are both reported as zero usage.
    if result == "" or result.startswith('0'):
        return '0 G'

    # (marker that selects the branch, separator that introduces the path)
    for marker, separator in (('hdfs', " hdfs:"), ('pfs', " /pfs")):
        if marker in result:
            return result.split(separator)[0]

    return "其他结果情形"
def search_table_space(new_pingo_dict):
    """Look up the HDFS space usage of every path in a name -> path mapping.

    One SSH session is opened, ``hdfs dfs -du -s`` is run once per path, and
    the session is closed again, even if connecting or a command fails.

    Args:
        new_pingo_dict: Mapping whose values are the paths appended to the
            ``hdfs dfs -du -s`` command line.

    Returns:
        A dict mapping each input key to ``[path, parsed du output]``, where
        the second element is whatever ``del_value`` returns.
    """
    # NOTE(review): host and credentials are hard-coded here; consider moving
    # them to configuration.
    ssh = paramiko.SSHClient()
    # Accept unknown host keys automatically instead of stopping at the
    # interactive "continue connecting (yes/no)?" prompt.
    ssh.set_missing_host_key_policy(paramiko.AutoAddPolicy())
    new_dict = {}
    try:
        ssh.connect(
            hostname='10.133.0.1', port=22, username='bigdata', password='bigdata.aac', timeout=50
        )
        for table_name, path in new_pingo_dict.items():
            ssh_command = 'hdfs dfs -du -s ' + " " + path
            result = exec_ssh_command(ssh_command, ssh)
            new_dict[table_name] = [path, del_value(result)]
    finally:
        # Release the connection on every exit path so a failing command
        # does not leak the SSH session.
        ssh.close()
    return new_dict
# Thread-pool callback: stores the result of each finished request.
def get_result(request, result):
    """Append a finished request's return value to the module-level ``results``.

    Args:
        request: The request object handed to the callback; not used here.
        result: The value to record.
    """
    # The list is mutated in place, so no ``global`` declaration is required.
    results.append(result)
# Script entry point (thread-pool version): split the table -> path mapping
# into chunks, look each chunk up on its own pool thread, then flatten the
# collected results and print timing information.
if __name__ == "__main__":
# The dictionary holds 50 entries.
pingo_table_dict = {
'ods.acoustics.sfc_3080_tracedbamethyst.3080_amethyst_bk_t_testinfo_ae31': 'hdfs://bdos/minos/sqlserver/sfc_3080_tracedbamethyst/T_TestInfo_AE31/1591428980949',
# NOTE(review): the "......" line below presumably stands for the entries omitted from this listing.
......
'ods.acoustics.sfc_3080_tracedbamethyst.3080_amethyst_bk_table_id': 'hdfs://bdos/minos/sqlserver/sfc_3080_tracedbamethyst/Table_ID/1591428673333'}
# Split the dictionary into chunks.
pingo_table_list = splitDict(pingo_table_dict)
# Filled by the get_result callback, one entry per finished request.
results = []
start_time = time.time()
# Run the lookups on a thread pool.
pool = threadpool.ThreadPool(8)
# Each chunk is wrapped in a one-element list: the positional arguments of one request.
list_var0 = [pingo_table_list[0]]
list_var1 = [pingo_table_list[1]]
list_var2 = [pingo_table_list[2]]
list_var3 = [pingo_table_list[3]]
list_var4 = [pingo_table_list[4]]
# Pairs of (positional args, None); presumably threadpool's (args, kwargs) request form - confirm.
par_list = [(list_var0,None),(list_var1,None),(list_var2,None),(list_var3,None),(list_var4,None)]
re = threadpool.makeRequests(search_table_space,par_list,get_result)
res = [pool.putRequest(req) for req in re]
pool.wait()
# Post-process the results gathered from the worker threads.
table_all_list = []
for i in results:
for key,value in i.items():
# Prepend the key so each row reads [key, path, size].
value.insert(0,key)
table_all_list.append(value)
stop_time = time.time()
print(stop_time)
# Recorded sample output (presumably the elapsed seconds printed below): 27.932997226715088
print(stop_time-start_time)
# Process method 1 (进程方法一): one multiprocessing.Process per chunk
import multiprocessing
import itertools
import paramiko
import time
# A Manager object runs a server process; for multi-process programs it is recommended to share data through a single manager.
from multiprocessing import Manager
def worker(procnum, return_dict):
    """Announce ``procnum`` on stdout and record it in ``return_dict``.

    Args:
        procnum: Identifier to print; also used as both key and value.
        return_dict: Mapping that receives ``procnum -> procnum``.
    """
    announcement = str(procnum) + ' represent!'
    print(announcement)
    return_dict[procnum] = procnum
def splitDict(pingo_table_dict, chunk_count=5, chunk_size=10):
    """Split a dictionary into a fixed number of smaller dictionaries.

    Entries are consumed in the dictionary's iteration order, ``chunk_size``
    at a time. The number of chunks and the size of the last chunk are
    printed as debug output.

    Args:
        pingo_table_dict: The mapping to split.
        chunk_count: How many chunks to produce (default 5).
        chunk_size: Maximum number of entries per chunk (default 10).

    Returns:
        A list of exactly ``chunk_count`` dicts. Once the input is exhausted
        the remaining chunks are empty dicts. Entries beyond
        ``chunk_count * chunk_size`` are NOT included in any chunk.
    """
    remaining_items = iter(pingo_table_dict.items())
    # Every islice call continues where the previous one stopped because
    # all chunks share the same iterator.
    pingo_table_list = [
        dict(itertools.islice(remaining_items, chunk_size))
        for _ in range(chunk_count)
    ]
    print(len(pingo_table_list))
    # Size of the last chunk (index 4 with the default chunk_count).
    if pingo_table_list:
        print(len(pingo_table_list[-1]))
    return pingo_table_list
def exec_ssh_command(ssh_command, ssh):
    """Execute ``ssh_command`` via ``ssh`` and hand back its decoded stdout.

    Args:
        ssh_command: The command line handed to ``ssh.exec_command``.
        ssh: An object exposing ``exec_command`` (the caller in this file
            passes a connected ``paramiko.SSHClient``).

    Returns:
        The full standard output of the command as a UTF-8 decoded string.
    """
    # Only the stdout member of the (stdin, stdout, stderr) triple is used.
    _stdin, out_stream, _stderr = ssh.exec_command(ssh_command)
    output_bytes = out_stream.read()
    # UTF-8 decoding keeps non-ASCII (e.g. Chinese) output readable.
    return output_bytes.decode('utf-8')
def del_value(result):
    """Extract the size in bytes from ``hdfs dfs -du -s`` output.

    ``du`` prints whitespace-separated columns with the size first and the
    path last; some Hadoop releases print an additional numeric column
    (space consumed including replicas) in between. Only the first column is
    parsed, so both layouts work, as do paths written as ``pfs...`` or
    ``/pfs...``.

    Args:
        result: The decoded stdout of the ``du`` command.

    Returns:
        ``0`` when the output is empty or starts with ``'0'``; the integer in
        the first column when the output mentions ``hdfs`` or ``pfs``;
        otherwise a fixed Chinese placeholder string meaning "some other
        kind of result".

    Raises:
        ValueError: If the first column is not an integer.
    """
    # No output at all and a leading zero are both reported as zero usage.
    if result == "" or result.startswith('0'):
        return 0

    if 'hdfs' in result or 'pfs' in result:
        # Splitting on whitespace instead of on the path prefix avoids
        # int() failing on "size  size_with_replicas " or on a "/pfs" path.
        size_column = result.split()[0]
        return int(size_column)

    return "其他结果情形"
def search_table_space(new_pingo_dict, return_dict):
    """Look up the HDFS space usage of every path and store it in ``return_dict``.

    One SSH session is opened, ``hdfs dfs -du -s`` is run once per path, and
    the session is closed again, even if connecting or a command fails.

    Args:
        new_pingo_dict: Mapping whose values are the paths appended to the
            ``hdfs dfs -du -s`` command line.
        return_dict: Mapping that receives ``key -> [path, parsed du output]``
            for every entry (the caller in this file passes a
            ``Manager().dict()`` shared between processes).
    """
    # NOTE(review): host and credentials are hard-coded here; consider moving
    # them to configuration.
    ssh = paramiko.SSHClient()
    # Accept unknown host keys automatically instead of stopping at the
    # interactive "continue connecting (yes/no)?" prompt.
    ssh.set_missing_host_key_policy(paramiko.AutoAddPolicy())
    try:
        ssh.connect(
            hostname='10.133.0.1', port=22, username='bigdata', password='bigdata.aac', timeout=50
        )
        for table_name, path in new_pingo_dict.items():
            ssh_command = 'hdfs dfs -du -s ' + " " + path
            print(ssh_command)
            result = exec_ssh_command(ssh_command, ssh)
            return_dict[table_name] = [path, del_value(result)]
    finally:
        # Release the connection on every exit path so a failing command
        # does not leak the SSH session.
        ssh.close()
# Script entry point (one process per chunk): split the table -> path mapping
# into chunks, start one multiprocessing.Process per chunk that writes into a
# shared manager dict, wait for all of them and print the elapsed time.
if __name__ == '__main__':
# Run the program with multiple processes.
start_time = time.time()
# The dictionary contains 50 entries.
pingo_table_dict = {
'ods.acoustics.sfc_3080_tracedbamethyst.3080_amethyst_bk_t_testinfo_ae31': 'hdfs://bdos/minos/sqlserver/sfc_3080_tracedbamethyst/T_TestInfo_AE31/1591428980949',
# NOTE(review): the "......" line below presumably stands for the entries omitted from this listing.
......
'ods.acoustics.sfc_3080_tracedbamethyst.3080_amethyst_bk_table_id': 'hdfs://bdos/minos/sqlserver/sfc_3080_tracedbamethyst/Table_ID/1591428673333'}
# Split the dictionary into chunks.
pingo_table_list = splitDict(pingo_table_dict)
manager = Manager()
# Each call to manager.list() creates a new shared container rather than returning the same instance.
# A shared list would work as well: return_list = manager.list()
return_dict = manager.dict()
# Start one process per chunk.
jobs = []
for i in range(5):
new_pingo_dict=pingo_table_list[i]
p = multiprocessing.Process(target=search_table_space, args=(new_pingo_dict,return_dict))
jobs.append(p)
p.start()
# Wait for every worker process to finish before reading the shared dict.
for proc in jobs:
proc.join()
print(len(return_dict))
stop_tiem = time.time()
t = stop_tiem - start_time
# Recorded sample output of the elapsed seconds: 29.943978786468506
print(t)
# Process method 2 (进程方法二): process pool (进程池)
import multiprocessing
import itertools
import paramiko
import time
# A Manager object runs a server process; for multi-process programs it is recommended to share data through a single manager.
from multiprocessing import Manager
def worker(procnum, return_dict):
    """Print a short announcement for ``procnum`` and store it in ``return_dict``.

    Args:
        procnum: Identifier to print; also used as both key and value.
        return_dict: Mapping that receives ``procnum -> procnum``.
    """
    message = str(procnum) + ' represent!'
    print(message)
    return_dict[procnum] = procnum
def splitDict(pingo_table_dict, chunk_count=5, chunk_size=10):
    """Split a dictionary into a fixed number of smaller dictionaries.

    Entries are consumed in the dictionary's iteration order, ``chunk_size``
    at a time. The number of chunks and the size of the last chunk are
    printed as debug output.

    Args:
        pingo_table_dict: The mapping to split.
        chunk_count: How many chunks to produce (default 5).
        chunk_size: Maximum number of entries per chunk (default 10).

    Returns:
        A list of exactly ``chunk_count`` dicts. Once the input is exhausted
        the remaining chunks are empty dicts. Entries beyond
        ``chunk_count * chunk_size`` are NOT included in any chunk.
    """
    item_stream = iter(pingo_table_dict.items())
    # The shared iterator makes each islice call resume after the previous
    # chunk.
    pingo_table_list = [
        dict(itertools.islice(item_stream, chunk_size))
        for _ in range(chunk_count)
    ]
    print(len(pingo_table_list))
    # Size of the last chunk (index 4 with the default chunk_count).
    if pingo_table_list:
        print(len(pingo_table_list[-1]))
    return pingo_table_list
def exec_ssh_command(ssh_command, ssh):
    """Send ``ssh_command`` through ``ssh`` and return what it printed.

    Args:
        ssh_command: The command line handed to ``ssh.exec_command``.
        ssh: An object exposing ``exec_command`` (the caller in this file
            passes a connected ``paramiko.SSHClient``).

    Returns:
        The command's standard output, read completely and decoded as UTF-8.
    """
    # exec_command returns (stdin, stdout, stderr); stdin and stderr are ignored.
    _stdin, out_stream, _stderr = ssh.exec_command(ssh_command)
    captured = out_stream.read()
    # UTF-8 decoding keeps non-ASCII (e.g. Chinese) output readable.
    return captured.decode('utf-8')
def del_value(result):
    """Extract the size in bytes from ``hdfs dfs -du -s`` output.

    ``du`` prints whitespace-separated columns with the size first and the
    path last; some Hadoop releases print an additional numeric column
    (space consumed including replicas) in between. Only the first column is
    parsed, so both layouts work, as do paths written as ``pfs...`` or
    ``/pfs...``.

    Args:
        result: The decoded stdout of the ``du`` command.

    Returns:
        ``0`` when the output is empty or starts with ``'0'``; the integer in
        the first column when the output mentions ``hdfs`` or ``pfs``;
        otherwise a fixed Chinese placeholder string meaning "some other
        kind of result".

    Raises:
        ValueError: If the first column is not an integer.
    """
    # No output at all and a leading zero are both reported as zero usage.
    if result == "" or result.startswith('0'):
        return 0

    if 'hdfs' in result or 'pfs' in result:
        # Splitting on whitespace instead of on the path prefix avoids
        # int() failing on "size  size_with_replicas " or on a "/pfs" path.
        size_column = result.split()[0]
        return int(size_column)

    return "其他结果情形"
def search_table_space(new_pingo_dict, return_dict):
    """Look up the HDFS space usage of every path and store it in ``return_dict``.

    One SSH session is opened, ``hdfs dfs -du -s`` is run once per path, and
    the session is closed again, even if connecting or a command fails.

    Args:
        new_pingo_dict: Mapping whose values are the paths appended to the
            ``hdfs dfs -du -s`` command line.
        return_dict: Mapping that receives ``key -> [path, parsed du output]``
            for every entry (the caller in this file passes a
            ``Manager().dict()`` shared between processes).
    """
    # NOTE(review): host and credentials are hard-coded here; consider moving
    # them to configuration.
    ssh = paramiko.SSHClient()
    # Accept unknown host keys automatically instead of stopping at the
    # interactive "continue connecting (yes/no)?" prompt.
    ssh.set_missing_host_key_policy(paramiko.AutoAddPolicy())
    try:
        ssh.connect(
            hostname='10.133.0.1', port=22, username='bigdata', password='bigdata.aac', timeout=50
        )
        for table_name, path in new_pingo_dict.items():
            ssh_command = 'hdfs dfs -du -s ' + " " + path
            print(ssh_command)
            result = exec_ssh_command(ssh_command, ssh)
            return_dict[table_name] = [path, del_value(result)]
    finally:
        # Release the connection on every exit path so a failing command
        # does not leak the SSH session.
        ssh.close()
# Script entry point (process-pool version): split the table -> path mapping
# into chunks, submit one task per chunk to a multiprocessing.Pool that
# writes into a shared manager dict, wait for the pool and print the elapsed
# time.
if __name__ == '__main__':
# Run the program with multiple processes.
start_time = time.time()
pingo_table_dict = {
'ods.acoustics.sfc_3080_tracedbamethyst.3080_amethyst_bk_t_testinfo_ae31': 'hdfs://bdos/minos/sqlserver/sfc_3080_tracedbamethyst/T_TestInfo_AE31/1591428980949',
# NOTE(review): the "......" line below presumably stands for the entries omitted from this listing.
......
'ods.acoustics.sfc_3080_tracedbamethyst.3080_amethyst_bk_table_id': 'hdfs://bdos/minos/sqlserver/sfc_3080_tracedbamethyst/Table_ID/1591428673333'}
pingo_table_list = splitDict(pingo_table_dict)
manager = Manager()
# A shared list would work as well: return_list = manager.list()
return_dict = manager.dict()
# Process pool.
p= multiprocessing.Pool(processes=5)
# NOTE(review): the objects returned by apply_async are discarded, so an
# exception raised inside a worker would presumably go unnoticed - confirm.
for i in range(5):
new_pingo_dict=pingo_table_list[i]
p.apply_async(search_table_space, args=(new_pingo_dict,return_dict))
# No more tasks are submitted; wait for the submitted ones to finish.
p.close()
p.join()
stop_tiem = time.time()
t = stop_tiem - start_time
print(t)