Spark 的任务分析

1. 获取 Spark 的配置参数

参考文档:Monitoring and Instrumentation - Spark 3.3.1 Documentation(https://spark.apache.org/docs/3.3.1/monitoring.html)

# -*- coding: utf-8 -*-


import requests
from functools import reduce
import json
import csv
import os


# The set of applications to query.
def get_apps(spark_host, min_end_date, max_end_date):
    """Fetch the completed applications listed by the Spark REST API.

    Sends a GET request to ``/api/v1/applications`` on port 18080 of
    ``spark_host`` with a 30 second timeout.

    Args:
        spark_host: Host name or address of the server to query.
        min_end_date: Value sent as the ``minEndDate`` query parameter.
        max_end_date: Value sent as the ``maxEndDate`` query parameter.

    Returns:
        The decoded JSON body of the response.
    """
    endpoint = f"http://{spark_host}:18080/api/v1/applications"
    print('get_apps访问url:' + endpoint)
    query = dict(
        minEndDate=min_end_date,
        maxEndDate=max_end_date,
        status="completed",
    )
    response = requests.get(endpoint, params=query, timeout=30)
    return response.json()


# The spark_properties of a single application.
def get_spark_properties(spark_host, app_id, attempt_num):
    """Return the ``sparkProperties`` of one application as a dict.

    Args:
        spark_host: Host name or address of the server to query (port 18080).
        app_id: Application id inserted into the URL path.
        attempt_num: Attempt number. When it is not 0 the request goes to
            ``<attempt_num>/environment``; otherwise to ``environment``.

    Returns:
        A dict built from the ``sparkProperties`` entries of the response,
        mapping each entry's first element to its second element.
    """
    if attempt_num != 0:
        resource = f"{attempt_num!s}/environment"
    else:
        resource = "environment"
    endpoint = f"http://{spark_host}:18080/api/v1/applications/{app_id}/{resource}"
    payload = requests.get(endpoint, timeout=30).json()
    return {entry[0]: entry[1] for entry in payload['sparkProperties']}


# Application info of one cluster for one time window.
def get_emr_apps(spark_host, emr_id, emr_name, min_end_date, max_end_date, oss_path):
    """Dump per-application Spark properties to a CSV file and upload it.

    For every application returned by ``get_apps`` the attempt with the
    greatest ``startTimeEpoch`` is selected, its Spark properties are fetched
    with ``get_spark_properties``, and one row is written to a temporary CSV
    file under ``/tmp``. The file is then copied to ``oss_path`` with
    ``hdfs dfs -put -f``.

    Args:
        spark_host: Host passed through to ``get_apps`` and
            ``get_spark_properties``.
        emr_id: Cluster id; written to every row and used in the file name.
        emr_name: Cluster name; written to every row and used in the file name.
        min_end_date: Lower bound passed to ``get_apps``.
        max_end_date: Upper bound passed to ``get_apps``.
        oss_path: Destination directory for the ``hdfs dfs`` commands.

    Returns:
        None.
    """
    # NOTE(review): "${dh}" looks like a scheduler placeholder that is
    # substituted before the script runs. If it is not substituted, the
    # f-string evaluates "{dh}" and needs a global named ``dh``.
    tmp_path = f"/tmp/{emr_id}_{emr_name}_${dh}.csv"

    # The context manager closes the file even if a request or a key lookup
    # raises part-way through the export.
    with open(tmp_path, 'w+') as csvfile:
        writer = csv.writer(
            csvfile,
            delimiter='\u0001',
            lineterminator='\n',
            quoting=csv.QUOTE_NONE,
            escapechar='\\',
        )

        for app in get_apps(spark_host, min_end_date, max_end_date):
            app_id = app['id']
            app_name = app['name']
            attempts = app['attempts']
            # Attempts are decoded JSON objects (dicts), so the timestamp must
            # be read by key; attribute access raised AttributeError as soon as
            # an application had more than one attempt. On equal timestamps
            # the later list entry wins, as before.
            attempt = reduce(
                lambda x, y: x if x['startTimeEpoch'] > y['startTimeEpoch'] else y,
                attempts,
            )
            # Attempts without an 'attemptId' key are treated as attempt 0.
            attempt_num = int(attempt.get('attemptId', '0'))
            spark_properties = get_spark_properties(spark_host, app_id, attempt_num)
            # NOTE(review): spark_properties is a dict, so the csv writer
            # stores its str() form (Python repr, not JSON).
            writer.writerow([
                emr_id,
                emr_name,
                app_id,
                app_name,
                attempt_num,
                attempt['startTimeEpoch'],
                attempt['endTimeEpoch'],
                spark_properties,
            ])

    # Upload the finished file. NOTE(review): the exit status of the shell
    # command is not checked, so a failed upload goes unnoticed.
    bash_command = f"hdfs dfs -mkdir -p {oss_path} && hdfs dfs -put -f {tmp_path} {oss_path}"
    print(bash_command)
    os.system(bash_command)


# Job parameters and entry point: the export runs once when the script is executed.
# The 'xxx' values are placeholders to be replaced with the real host, cluster id and cluster name.
spark_host='xxx'
emr_id='xxx'
emr_name='xxx'
# NOTE(review): ${dt_f}, ${dh} and ${oss_path} look like scheduler template
# variables substituted before execution (presumably date, hour and target
# directory) - confirm against the scheduler configuration.
# The two bounds cover one hour, from :00:00.000 to :59:59.999, at GMT+08:00.
min_end_date='${dt_f}T${dh}:00:00.000GMT+08:00'
max_end_date='${dt_f}T${dh}:59:59.999GMT+08:00'
oss_path='${oss_path}'
get_emr_apps( spark_host, emr_id, emr_name, min_end_date, max_end_date, oss_path)

你可能感兴趣的:(spark,大数据,经验分享)