The SchedulerJob class is built on BaseJob. In Airflow a Job is not the same thing as a Task: a task instance is one scheduled run, while a Job groups a series of scheduled task instances and has its own state, start time and end time; every task instance points back to a Job through its job_id. There are three such Job types: SchedulerJob, LocalTaskJob and BackfillJob. BackfillJob re-runs tasks over a past time range and therefore launches multiple task instances.
SchedulerJob overrides _execute with its own logic: it instantiates DagFileProcessorManager, passing in the DAG directory, the list of files in that directory, the maximum number of parallel file processors (max_threads), the file-processing interval, the number of processing runs, and so on, and then hands the DagFileProcessorManager instance to _execute_helper.
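For orientation, here is a minimal sketch of how such a job is started, simplified from the `airflow scheduler` CLI entry point; the argument values are illustrative and the exact signature may differ between Airflow versions:

from airflow import jobs

# BaseJob.run() records the job row (state, start/end time, heartbeats) and
# then calls the subclass's _execute(), which for SchedulerJob is shown below.
job = jobs.SchedulerJob(
    subdir="/path/to/dags",  # DAG definition folder (illustrative path)
    num_runs=-1,             # parse each file at most this many times; -1 means no limit
    do_pickle=False)         # whether DAGs may be pickled for remote executors
job.run()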
The parsing work is split between two components:
DagFileProcessorManager periodically re-scans the DAG folder to find new and removed files; new files are handed to a DagFileProcessor child process for parsing, and records belonging to removed files are deleted from the database.
DagFileProcessor parses a single file and sends the DAGs it finds back to the parent process.
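The split of responsibilities can be modeled with a small stand-alone example; this is an illustrative sketch of the parent/child pattern, not Airflow's actual classes:

import multiprocessing

def parse_file(file_path, result_queue):
    # Child process: "parse" the file and send what it found back to the
    # parent, analogous to a DagFileProcessor returning SimpleDag objects.
    found_dags = ["dag_defined_in_" + file_path]  # placeholder for real parsing
    result_queue.put((file_path, found_dags))

if __name__ == "__main__":
    file_paths = ["a.py", "b.py"]  # analogous to known_file_paths
    result_queue = multiprocessing.Queue()
    # Parent: launch one child per file, analogous to what
    # DagFileProcessorManager.heartbeat() does in batches of max_threads.
    procs = [multiprocessing.Process(target=parse_file, args=(p, result_queue))
             for p in file_paths]
    for p in procs:
        p.start()
    results = [result_queue.get() for _ in file_paths]
    for p in procs:
        p.join()
    print(results)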
class SchedulerJob(BaseJob):
    """
    This SchedulerJob runs for a specific time interval and schedules the jobs
    that are ready to run. It checks whether the dependencies of the tasks
    waiting to run are met; if they are, it creates the TaskInstances and sends
    the run commands to the executor (workers). It does this for every task in
    every DAG and repeats.
    """

    def _execute(self):
        # Scheduler entry point
        self.log.info("Starting the scheduler")

        # Ping the database to make sure it is reachable
        pessimistic_connection_handling()

        # DAGs can be pickled for easier remote execution by some executors
        # (i.e. whether the DAGs should be serialized into pickle format)
        pickle_dags = False
        if self.do_pickle and self.executor.__class__ not in \
                (executors.LocalExecutor, executors.SequentialExecutor):
            pickle_dags = True

        # Use multiple processes to parse and generate tasks for the
        # DAGs in parallel. By processing them in separate processes,
        # we can get parallelism and isolation from potentially harmful
        # user code.
        # Log the concurrency settings
        self.log.info("Processing files using up to %s processes at a time", self.max_threads)
        self.log.info("Running execute loop for %s seconds", self.run_duration)
        self.log.info("Processing each file at most %s times", self.num_runs)
        self.log.info("Process each file at most once every %s seconds", self.file_process_interval)
        self.log.info("Checking for new files in %s every %s seconds", self.subdir, self.dag_dir_list_interval)

        # Build up a list of Python files that could contain DAGs
        # (collect the Python definition files under the DAG folder)
        self.log.info("Searching for files in %s", self.subdir)
        known_file_paths = list_py_file_paths(self.subdir)
        self.log.info("There are %s files in %s", len(known_file_paths), self.subdir)

        # Processor factory: builds the DagFileProcessor objects used below
        def processor_factory(file_path):
            # Each DagFileProcessor handles one file in a child process:
            #   file_path:   path of the file to process
            #   pickle_dags: whether the DAGs should be serialized (pickled)
            #   dag_ids:     if set, only these dag_ids are scheduled (a whitelist)
            return DagFileProcessor(file_path,
                                    pickle_dags,
                                    self.dag_ids)

        # Manager that spawns one child process per DAG definition file;
        # max_threads caps how many files are processed in parallel
        processor_manager = DagFileProcessorManager(self.subdir,
                                                    known_file_paths,
                                                    self.max_threads,
                                                    self.file_process_interval,
                                                    self.num_runs,
                                                    processor_factory)

        try:
            self._execute_helper(processor_manager)
        finally:
            self.log.info("Exited execute loop")
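
    # The values logged in _execute come from SchedulerJob.__init__, most of
    # them read from the [scheduler] section of airflow.cfg. An illustrative
    # sketch of how to inspect them (option names follow the Airflow
    # 1.8/1.9-era configuration and may differ in other versions):
    #
    #     from airflow import configuration as conf
    #     conf.getint('scheduler', 'max_threads')               # parallel file processors
    #     conf.getint('scheduler', 'min_file_process_interval') # default for file_process_interval
    #     conf.getint('scheduler', 'dag_dir_list_interval')     # how often to re-list the DAG folder
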
    # Important method: this defines the scheduler's main flow
    def _execute_helper(self, processor_manager):
        """
        :param processor_manager: manager to use
        :type processor_manager: DagFileProcessorManager
        :return: None

        Ties DAG loading/parsing, task scheduling and task execution together:
            processor_manager.heartbeat()
            self._execute_task_instances()
            self.executor.heartbeat()
            self._process_executor_events()
        """
        # Start the executor (e.g. a CeleryExecutor instance)
        self.executor.start()

        session = settings.Session()
        self.log.info("Resetting orphaned tasks for active dag runs")
        self.reset_state_for_orphaned_tasks(session=session)
        session.close()

        execute_start_time = datetime.utcnow()

        # Last time stats were printed
        last_stat_print_time = datetime(2000, 1, 1)
        # Last time that self.heartbeat() was called.
        last_self_heartbeat_time = datetime.utcnow()
        # Last time that the DAG dir was traversed to look for files
        last_dag_dir_refresh_time = datetime.utcnow()

        # Use this value initially
        known_file_paths = processor_manager.file_paths

        # For the execute duration, parse and schedule DAGs
        while (datetime.utcnow() - execute_start_time).total_seconds() < \
                self.run_duration or self.run_duration < 0:
            self.log.debug("Starting Loop...")
            loop_start_time = time.time()

            # Traverse the DAG directory for Python files containing DAGs
            # periodically
            elapsed_time_since_refresh = (datetime.utcnow() -
                                          last_dag_dir_refresh_time).total_seconds()
            if elapsed_time_since_refresh > self.dag_dir_list_interval:
                # The last refresh is older than the configured interval
                # Build up a list of Python files that could contain DAGs
                self.log.info("Searching for files in %s", self.subdir)
                # Re-list the DAG definition files
                known_file_paths = list_py_file_paths(self.subdir)
                last_dag_dir_refresh_time = datetime.utcnow()
                self.log.info("There are %s files in %s", len(known_file_paths), self.subdir)
                # Hand the refreshed file list to the manager
                processor_manager.set_file_paths(known_file_paths)

                self.log.debug("Removing old import errors")
                self.clear_nonexistent_import_errors(known_file_paths=known_file_paths)

            # Kick off new processes and collect results from finished ones
            self.log.info("Heartbeating the process manager")
            simple_dags = processor_manager.heartbeat()

            if self.using_sqlite:
                # For the sqlite case w/ 1 thread, wait until the processor
                # is finished to avoid concurrent access to the DB.
                self.log.debug("Waiting for processors to finish since we're using sqlite")
                processor_manager.wait_until_finished()

            # Send tasks for execution if available
            simple_dag_bag = SimpleDagBag(simple_dags)
            if len(simple_dags) > 0:
                # Handle cases where a DAG run state is set (perhaps manually) to
                # a non-running state. Handle task instances that belong to
                # DAG runs in those states.
                # If a task instance is up for retry but the corresponding DAG run
                # isn't running, mark the task instance as FAILED so we don't try
                # to re-run it.
                self._change_state_for_tis_without_dagrun(simple_dag_bag,
                                                          [State.UP_FOR_RETRY],
                                                          State.FAILED)
                # If a task instance is scheduled or queued, but the corresponding
                # DAG run isn't running, set the state to NONE so we don't try to
                # re-run it.
                self._change_state_for_tis_without_dagrun(simple_dag_bag,
                                                          [State.QUEUED,
                                                           State.SCHEDULED],
                                                          State.NONE)
                # Queue the runnable task instances with the executor
                self._execute_task_instances(simple_dag_bag,
                                             (State.SCHEDULED,))

            # Call heartbeats
            self.log.info("Heartbeating the executor")
            # Let the executor pick up the queued commands
            self.executor.heartbeat()

            # Process events from the executor.
            # This is where the scheduler reacts to finished tasks: if a task
            # succeeded and its state is consistent, nothing is done. If the
            # executor reports the task as done but its state is still queued,
            # the state was probably changed externally, so the failure handling
            # (emails, retries, ...) is triggered. It is mostly a safety net for
            # inconsistent states.
            self._process_executor_events(simple_dag_bag)

            # Heartbeat the scheduler periodically
            time_since_last_heartbeat = (datetime.utcnow() -
                                         last_self_heartbeat_time).total_seconds()
            if time_since_last_heartbeat > self.heartrate:
                self.log.info("Heartbeating the scheduler")
                self.heartbeat()
                last_self_heartbeat_time = datetime.utcnow()

            # Occasionally print out stats about how fast the files are getting processed
            if ((datetime.utcnow() - last_stat_print_time).total_seconds() >
                    self.print_stats_interval):
                if len(known_file_paths) > 0:
                    self._log_file_processing_stats(known_file_paths,
                                                    processor_manager)
                last_stat_print_time = datetime.utcnow()

            loop_end_time = time.time()
            self.log.debug("Ran scheduling loop in %.2f seconds", loop_end_time - loop_start_time)
            self.log.debug("Sleeping for %.2f seconds", self._processor_poll_interval)
            time.sleep(self._processor_poll_interval)

            # Exit early for a test mode
            if processor_manager.max_runs_reached():
                self.log.info("Exiting loop as all files have been processed %s times", self.num_runs)
                break

        # Stop any processors
        processor_manager.terminate()

        # Verify that all files were processed, and if so, deactivate DAGs that
        # haven't been touched by the scheduler as they likely have been
        # deleted.
        all_files_processed = True
        for file_path in known_file_paths:
            if processor_manager.get_last_finish_time(file_path) is None:
                all_files_processed = False
                break
        if all_files_processed:
            self.log.info(
                "Deactivating DAGs that haven't been touched since %s",
                execute_start_time.isoformat()
            )
            models.DAG.deactivate_stale_dags(execute_start_time)

        self.executor.end()
        settings.Session.remove()
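
    # _execute_helper gates three pieces of periodic work (re-listing the DAG
    # directory, heartbeating the scheduler, printing stats) with the same
    # "last time + interval" check inside a polling loop. A generic,
    # stand-alone sketch of that pattern (illustrative only, not Airflow code;
    # do_periodic_work is a hypothetical callback):
    #
    #     import time
    #     from datetime import datetime
    #
    #     last_run_time = datetime(2000, 1, 1)   # far in the past: run on the first pass
    #     interval = 30                          # seconds
    #     while True:
    #         if (datetime.utcnow() - last_run_time).total_seconds() > interval:
    #             do_periodic_work()
    #             last_run_time = datetime.utcnow()
    #         time.sleep(1)                      # like self._processor_poll_interval
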
    @provide_session
    def process_file(self, file_path, pickle_dags=False, session=None):
        """
        Process a Python file containing Airflow DAGs.

        This includes:

        1. Execute the file and look for DAG objects in the namespace.
        2. Pickle the DAG and save it to the DB (if necessary).
        3. For each DAG, see what tasks should run and create appropriate task
           instances in the DB.
        4. Record any errors importing the file into ORM.
        5. Kill (in ORM) any task instances belonging to the DAGs that haven't
           issued a heartbeat in a while.

        In short: extract the DAGs and their tasks, persist them to the
        database while checking which tasks meet their scheduling conditions,
        and kill task instances that have gone without a heartbeat for too
        long.
        """
        simple_dags = []

        dagbag = models.DagBag(file_path)

        # Save individual DAGs in the ORM and update DagModel.last_scheduled_time
        for dag in dagbag.dags.values():
            dag.sync_to_db()

        # Collect the dag_ids of the paused DAGs
        paused_dag_ids = [dag.dag_id for dag in dagbag.dags.values()
                          if dag.is_paused]

        # Pickle the DAGs (if necessary) and put them into a SimpleDag
        for dag_id in dagbag.dags:
            dag = dagbag.get_dag(dag_id)
            pickle_id = None
            if pickle_dags:
                # Pickle the DAG and keep its pickle id
                pickle_id = dag.pickle(session).id

            # Only return DAGs that are not paused
            if dag_id not in paused_dag_ids:
                simple_dags.append(SimpleDag(dag, pickle_id=pickle_id))

        if len(self.dag_ids) > 0:
            # Keep only whitelisted, non-paused DAGs
            dags = [dag for dag in dagbag.dags.values()
                    if dag.dag_id in self.dag_ids and
                    dag.dag_id not in paused_dag_ids]
        else:
            # No whitelist: keep non-paused DAGs that are not SubDAGs (no parent_dag)
            dags = [dag for dag in dagbag.dags.values()
                    if not dag.parent_dag and
                    dag.dag_id not in paused_dag_ids]

        ti_keys_to_schedule = []

        # Walk the parsed DAGs, persist DagRuns and task instances to the
        # database, and collect the keys of the task instances that should be
        # scheduled; simple_dags is returned at the end
        self._process_dags(dagbag, dags, ti_keys_to_schedule)

        for ti_key in ti_keys_to_schedule:
            dag = dagbag.dags[ti_key[0]]
            task = dag.get_task(ti_key[1])
            # Build a TaskInstance for the given execution date from the task
            # defined in the DAG file (it is persisted to the DB later)
            ti = models.TaskInstance(task, ti_key[2])

            dep_context = DepContext(deps=QUEUE_DEPS, ignore_task_deps=True)
            if ti.are_dependencies_met(
                    dep_context=dep_context,
                    session=session,
                    verbose=True):
                # Task starts out in the scheduled state. All tasks in the
                # scheduled state will be sent to the executor
                ti.state = State.SCHEDULED

        return simple_dags
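
The first step of process_file, loading a single file through models.DagBag, can be reproduced on its own, which is handy when debugging a DAG file the scheduler appears to ignore. A minimal sketch, assuming an initialized Airflow environment; the file path is illustrative:

from airflow import models

# Load one DAG definition file the same way process_file does.
dagbag = models.DagBag("/path/to/dags/example_dag.py")

# DAGs that were parsed successfully, keyed by dag_id.
print(list(dagbag.dags.keys()))

# Import errors are collected instead of raised, keyed by file path;
# process_file records these in the ORM (step 4 above).
print(dagbag.import_errors)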