从Job.py SchedulerJob类的_execute方法开始
def _execute(self):
self.log.info("Starting the scheduler")
# DAGs can be pickled for easier remote execution by some executors
pickle_dags = False
if self.do_pickle and self.executor.__class__ not in \
(executors.LocalExecutor, executors.SequentialExecutor):
pickle_dags = True
self.log.info("Running execute loop for %s seconds", self.run_duration)
self.log.info("Processing each file at most %s times", self.num_runs)
# Build up a list of Python files that could contain DAGs
self.log.info("Searching for files in %s", self.subdir)
known_file_paths = list_py_file_paths(self.subdir)
self.log.info("There are %s files in %s", len(known_file_paths), self.subdir)
// 以下方法返回真正进行dag解析的类
def processor_factory(file_path, zombies):
return DagFileProcessor(file_path,
pickle_dags,
self.dag_ids,
zombies)
# When using sqlite, we do not use async_mode
# so the scheduler job and DAG parser don't access the DB at the same time.
async_mode = not self.using_sqlite
// 以下为dag解析的入口类
self.processor_agent = DagFileProcessorAgent(self.subdir,
known_file_paths,
self.num_runs,
processor_factory,
async_mode)
try:
self._execute_helper()
finally:
self.processor_agent.end()
self.log.info("Exited execute loop")
到DagFileProcessorAgent类的start()方法:
"""
Launch DagFileProcessorManager processor and start DAG parsing loop in manager.
"""
self._process = self._launch_process(self._dag_directory,
self._file_paths,
self._max_runs,
self._processor_factory,
self._child_signal_conn,
self._stat_queue,
self._result_queue,
self._async_mode)
self.log.info("Launched DagFileProcessorManager with pid: {}"
.format(self._process.pid))
再到DagFileProcessorAgent类的_launch_process方法
@staticmethod
def _launch_process(dag_directory,
file_paths,
max_runs,
processor_factory,
signal_conn,
_stat_queue,
result_queue,
async_mode):
def helper():
# Reload configurations and settings to avoid collision with parent process.
# Because this process may need custom configurations that cannot be shared,
# e.g. RotatingFileHandler. And it can cause connection corruption if we
# do not recreate the SQLA connection pool.
os.environ['CONFIG_PROCESSOR_MANAGER_LOGGER'] = 'True'
# Replicating the behavior of how logging module was loaded
# in logging_config.py
reload_module(import_module(logging_class_path.rsplit('.', 1)[0]))
reload_module(airflow.settings)
del os.environ['CONFIG_PROCESSOR_MANAGER_LOGGER']
processor_manager = DagFileProcessorManager(dag_directory,
file_paths,
max_runs,
processor_factory,
signal_conn,
_stat_queue,
result_queue,
async_mode)
processor_manager.start()
p = multiprocessing.Process(target=helper,
args=(),
name="DagFileProcessorManager")
p.start()
return p
再到DagFileProcessorManager类的start_in_async方法:
def start_in_async(self):
"""
Parse DAG files repeatedly in a standalone loop.
"""
while True:
loop_start_time = time.time()
if self._signal_conn.poll():
agent_signal = self._signal_conn.recv()
if agent_signal == DagParsingSignal.TERMINATE_MANAGER:
self.terminate()
break
elif agent_signal == DagParsingSignal.END_MANAGER:
self.end()
sys.exit(os.EX_OK)
self._refresh_dag_dir()
simple_dags = self.heartbeat()
for simple_dag in simple_dags:
self._result_queue.put(simple_dag)
self._print_stat()
all_files_processed = all(self.get_last_finish_time(x) is not None
for x in self.file_paths)
max_runs_reached = self.max_runs_reached()
dag_parsing_stat = DagParsingStat(self._file_paths,
self.get_all_pids(),
max_runs_reached,
all_files_processed,
len(simple_dags))
self._stat_queue.put(dag_parsing_stat)
if max_runs_reached:
self.log.info("Exiting dag parsing loop as all files "
"have been processed %s times", self._max_runs)
break
loop_duration = time.time() - loop_start_time
if loop_duration < 1:
sleep_length = 1 - loop_duration
self.log.debug("Sleeping for {0:.2f} seconds "
"to prevent excessive logging".format(sleep_length))
time.sleep(sleep_length)
再到DagFileProcessorManager的heartbeat方法:
def heartbeat(self):
"""
This should be periodically called by the manager loop. This method will
kick off new processes to process DAG definition files and read the
results from the finished processors.
:return: a list of SimpleDags that were produced by processors that
have finished since the last time this was called
:rtype: list[SimpleDag]
"""
finished_processors = {}
""":type : dict[unicode, AbstractDagFileProcessor]"""
running_processors = {}
""":type : dict[unicode, AbstractDagFileProcessor]"""
for file_path, processor in self._processors.items():
if processor.done:
self.log.debug("Processor for %s finished", file_path)
now = timezone.utcnow()
finished_processors[file_path] = processor
self._last_runtime[file_path] = (now -
processor.start_time).total_seconds()
self._last_finish_time[file_path] = now
self._run_count[file_path] += 1
else:
running_processors[file_path] = processor
self._processors = running_processors
self.log.debug("%s/%s DAG parsing processes running",
len(self._processors), self._parallelism)
self.log.debug("%s file paths queued for processing",
len(self._file_path_queue))
# Collect all the DAGs that were found in the processed files
simple_dags = []
for file_path, processor in finished_processors.items():
if processor.result is None:
self.log.warning(
"Processor for %s exited with return code %s.",
processor.file_path, processor.exit_code
)
else:
for simple_dag in processor.result:
simple_dags.append(simple_dag)
# Generate more file paths to process if we processed all the files
# already.
if len(self._file_path_queue) == 0:
# If the file path is already being processed, or if a file was
# processed recently, wait until the next batch
file_paths_in_progress = self._processors.keys()
now = timezone.utcnow()
file_paths_recently_processed = []
for file_path in self._file_paths:
last_finish_time = self.get_last_finish_time(file_path)
if (last_finish_time is not None and
(now - last_finish_time).total_seconds() <
self._file_process_interval):
file_paths_recently_processed.append(file_path)
files_paths_at_run_limit = [file_path
for file_path, num_runs in self._run_count.items()
if num_runs == self._max_runs]
files_paths_to_queue = list(set(self._file_paths) -
set(file_paths_in_progress) -
set(file_paths_recently_processed) -
set(files_paths_at_run_limit))
for file_path, processor in self._processors.items():
self.log.debug(
"File path %s is still being processed (started: %s)",
processor.file_path, processor.start_time.isoformat()
)
self.log.debug(
"Queuing the following files for processing:\n\t%s",
"\n\t".join(files_paths_to_queue)
)
self._file_path_queue.extend(files_paths_to_queue)
zombies = self._find_zombies()
# Start more processors if we have enough slots and files to process
while (self._parallelism - len(self._processors) > 0 and
len(self._file_path_queue) > 0):
file_path = self._file_path_queue.pop(0)
processor = self._processor_factory(file_path, zombies)
processor.start()
self.log.debug(
"Started a process (PID: %s) to generate tasks for %s",
processor.pid, file_path
)
self._processors[file_path] = processor
# Update heartbeat count.
self._run_count[self._heart_beat_key] += 1
return simple_dags
再到DagFileProcessor类的start方法:
def start(self):
"""
Launch the process and start processing the DAG.
"""
self._process = DagFileProcessor._launch_process(
self._result_queue,
self.file_path,
self._pickle_dags,
self._dag_id_white_list,
"DagFileProcessor{}".format(self._instance_id),
self._zombies)
self._start_time = timezone.utcnow()
到DagFileProcessor类的_launch_process方法:
@staticmethod
def _launch_process(result_queue,
file_path,
pickle_dags,
dag_id_white_list,
thread_name,
zombies):
"""
Launch a process to process the given file.
:param result_queue: the queue to use for passing back the result
:type result_queue: multiprocessing.Queue
:param file_path: the file to process
:type file_path: unicode
:param pickle_dags: whether to pickle the DAGs found in the file and
save them to the DB
:type pickle_dags: bool
:param dag_id_white_list: if specified, only examine DAG ID's that are
in this list
:type dag_id_white_list: list[unicode]
:param thread_name: the name to use for the process that is launched
:type thread_name: unicode
:return: the process that was launched
:rtype: multiprocessing.Process
:param zombies: zombie task instances to kill
:type zombies: list[SimpleTaskInstance]
"""
def helper():
# This helper runs in the newly created process
log = logging.getLogger("airflow.processor")
stdout = StreamLogWriter(log, logging.INFO)
stderr = StreamLogWriter(log, logging.WARN)
set_context(log, file_path)
try:
# redirect stdout/stderr to log
sys.stdout = stdout
sys.stderr = stderr
# Re-configure the ORM engine as there are issues with multiple processes
settings.configure_orm()
# Change the thread name to differentiate log lines. This is
# really a separate process, but changing the name of the
# process doesn't work, so changing the thread name instead.
threading.current_thread().name = thread_name
start_time = time.time()
log.info("Started process (PID=%s) to work on %s",
os.getpid(), file_path)
scheduler_job = SchedulerJob(dag_ids=dag_id_white_list, log=log)
result = scheduler_job.process_file(file_path,
zombies,
pickle_dags)
result_queue.put(result)
end_time = time.time()
log.info(
"Processing %s took %.3f seconds", file_path, end_time - start_time
)
except Exception:
# Log exceptions through the logging framework.
log.exception("Got an exception! Propagating...")
raise
finally:
sys.stdout = sys.__stdout__
sys.stderr = sys.__stderr__
# We re-initialized the ORM within this Process above so we need to
# tear it down manually here
settings.dispose_orm()
p = multiprocessing.Process(target=helper,
args=(),
name="{}-Process".format(thread_name))
p.start()
return p
再到SchedulerJob类的process_file方法(绕了一圈又回到SchedulerJob了。。):
@provide_session
def process_file(self, file_path, zombies, pickle_dags=False, session=None):
"""
Process a Python file containing Airflow DAGs.
This includes:
1. Execute the file and look for DAG objects in the namespace.
2. Pickle the DAG and save it to the DB (if necessary).
3. For each DAG, see what tasks should run and create appropriate task
instances in the DB.
4. Record any errors importing the file into ORM
5. Kill (in ORM) any task instances belonging to the DAGs that haven't
issued a heartbeat in a while.
Returns a list of SimpleDag objects that represent the DAGs found in
the file
:param file_path: the path to the Python file that should be executed
:type file_path: unicode
:param zombies: zombie task instances to kill.
:type zombies: list[SimpleTaskInstance]
:param pickle_dags: whether serialize the DAGs found in the file and
save them to the db
:type pickle_dags: bool
:return: a list of SimpleDags made from the Dags found in the file
:rtype: list[SimpleDag]
"""
self.log.info("Processing file %s for tasks to queue", file_path)
# As DAGs are parsed from this file, they will be converted into SimpleDags
simple_dags = []
try:
// 以下行收集dagbag,从airflow home路径下面递归扫描dags
// 通过imp.load_source(mod_name, filepath)收集dag
// 整个dag图在执行import的时候通过python代码生成了
dagbag = models.DagBag(file_path, include_examples=False)
except Exception:
self.log.exception("Failed at reloading the DAG file %s", file_path)
Stats.incr('dag_file_refresh_error', 1, 1)
return []
if len(dagbag.dags) > 0:
self.log.info("DAG(s) %s retrieved from %s", dagbag.dags.keys(), file_path)
else:
self.log.warning("No viable dags retrieved from %s", file_path)
self.update_import_errors(session, dagbag)
return []
# Save individual DAGs in the ORM and update DagModel.last_scheduled_time
for dag in dagbag.dags.values():
dag.sync_to_db()
paused_dag_ids = [dag.dag_id for dag in dagbag.dags.values()
if dag.is_paused]
# Pickle the DAGs (if necessary) and put them into a SimpleDag
for dag_id in dagbag.dags:
dag = dagbag.get_dag(dag_id)
pickle_id = None
if pickle_dags:
pickle_id = dag.pickle(session).id
# Only return DAGs that are not paused
if dag_id not in paused_dag_ids:
simple_dags.append(SimpleDag(dag, pickle_id=pickle_id))
if len(self.dag_ids) > 0:
dags = [dag for dag in dagbag.dags.values()
if dag.dag_id in self.dag_ids and
dag.dag_id not in paused_dag_ids]
else:
dags = [dag for dag in dagbag.dags.values()
if not dag.parent_dag and
dag.dag_id not in paused_dag_ids]
# Not using multiprocessing.Queue() since it's no longer a separate
# process and due to some unusual behavior. (empty() incorrectly
# returns true?)
ti_keys_to_schedule = []
self._process_dags(dagbag, dags, ti_keys_to_schedule)
for ti_key in ti_keys_to_schedule:
dag = dagbag.dags[ti_key[0]]
task = dag.get_task(ti_key[1])
ti = models.TaskInstance(task, ti_key[2])
ti.refresh_from_db(session=session, lock_for_update=True)
# We can defer checking the task dependency checks to the worker themselves
# since they can be expensive to run in the scheduler.
dep_context = DepContext(deps=QUEUE_DEPS, ignore_task_deps=True)
# Only schedule tasks that have their dependencies met, e.g. to avoid
# a task that recently got its state changed to RUNNING from somewhere
# other than the scheduler from getting its state overwritten.
# TODO(aoen): It's not great that we have to check all the task instance
# dependencies twice; once to get the task scheduled, and again to actually
# run the task. We should try to come up with a way to only check them once.
if ti.are_dependencies_met(
dep_context=dep_context,
session=session,
verbose=True):
# Task starts out in the scheduled state. All tasks in the
# scheduled state will be sent to the executor
ti.state = State.SCHEDULED
# Also save this task instance to the DB.
self.log.info("Creating / updating %s in ORM", ti)
session.merge(ti)
# commit batch
session.commit()
# Record import errors into the ORM
try:
self.update_import_errors(session, dagbag)
except Exception:
self.log.exception("Error logging import errors!")
try:
dagbag.kill_zombies(zombies)
except Exception:
self.log.exception("Error killing zombies!")
return simple_dags
到SchedulerJob类的_process_dags方法:
def _process_dags(self, dagbag, dags, tis_out):
"""
Iterates over the dags and processes them. Processing includes:
1. Create appropriate DagRun(s) in the DB.
2. Create appropriate TaskInstance(s) in the DB.
3. Send emails for tasks that have missed SLAs.
:param dagbag: a collection of DAGs to process
:type dagbag: models.DagBag
:param dags: the DAGs from the DagBag to process
:type dags: DAG
:param tis_out: A queue to add generated TaskInstance objects
:type tis_out: multiprocessing.Queue[TaskInstance]
:return: None
"""
for dag in dags:
dag = dagbag.get_dag(dag.dag_id)
if dag.is_paused:
self.log.info("Not processing DAG %s since it's paused", dag.dag_id)
continue
if not dag:
self.log.error("DAG ID %s was not found in the DagBag", dag.dag_id)
continue
self.log.info("Processing %s", dag.dag_id)
// 下面一行生成最新周期的dag_run表
dag_run = self.create_dag_run(dag)
if dag_run:
self.log.info("Created %s", dag_run)
// 下面一行生成一个符合执行要求的任务队列
self._process_task_instances(dag, tis_out)
self.manage_slas(dag)
models.DagStat.update([d.dag_id for d in dags])
到SchedulerJob类的_process_task_instances方法:
@provide_session
def _process_task_instances(self, dag, queue, session=None):
"""
This method schedules the tasks for a single DAG by looking at the
active DAG runs and adding task instances that should run to the
queue.
"""
# update the state of the previously active dag runs
dag_runs = DagRun.find(dag_id=dag.dag_id, state=State.RUNNING, session=session)
active_dag_runs = []
for run in dag_runs:
self.log.info("Examining DAG run %s", run)
# don't consider runs that are executed in the future
if run.execution_date > timezone.utcnow():
self.log.error(
"Execution date is in future: %s",
run.execution_date
)
continue
if len(active_dag_runs) >= dag.max_active_runs:
self.log.info("Active dag runs > max_active_run.")
continue
# skip backfill dagruns for now as long as they are not really scheduled
if run.is_backfill:
continue
# todo: run.dag is transient but needs to be set
run.dag = dag
# todo: preferably the integrity check happens at dag collection time
run.verify_integrity(session=session)
run.update_state(session=session)
if run.state == State.RUNNING:
make_transient(run)
active_dag_runs.append(run)
for run in active_dag_runs:
self.log.debug("Examining active DAG run: %s", run)
# this needs a fresh session sometimes tis get detached
tis = run.get_task_instances(state=(State.NONE,
State.UP_FOR_RETRY,
State.UP_FOR_RESCHEDULE))
# this loop is quite slow as it uses are_dependencies_met for
# every task (in ti.is_runnable). This is also called in
# update_state above which has already checked these tasks
for ti in tis:
task = dag.get_task(ti.task_id)
# fixme: ti.task is transient but needs to be set
ti.task = task
# future: remove adhoc
if task.adhoc:
continue
// 以下执行各种依赖类决定某个Taskinstance是否要加入执行队列
if ti.are_dependencies_met(
dep_context=DepContext(flag_upstream_failed=True),
session=session):
self.log.debug('Queuing task: %s', ti)
queue.append(ti.key)