The LocalExecutor is an executor that runs on the same machine as the scheduler. It can execute tasks with a bounded degree of parallelism (a process pool) or with unbounded parallelism.
The implementation differs across versions; this article analyzes version 1.9, which controls concurrency by starting a configurable number of LocalWorker processes. The degree of parallelism is read from the configuration as `conf.getint('core', 'PARALLELISM')`, as sketched below.
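A minimal sketch of how that lookup resolves (the default of 32 is the stock 1.9 setting; treat the exact number as an assumption):

```python
# Sketch: how the executor's worker count is resolved in Airflow 1.9.
# PARALLELISM is read from the [core] section of airflow.cfg.
from airflow import configuration as conf

PARALLELISM = conf.getint('core', 'PARALLELISM')  # stock default: 32
# LocalExecutor will later spawn this many LocalWorker processes.
```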
The design is a producer-consumer model within a single server. Relying on the process-safety of the multiprocessing queues owned by the LocalExecutor process, it starts multiple worker processes: the LocalExecutor accepts TaskInstances and puts their commands onto self.queue, and the LocalWorkers consume elements from that queue concurrently and safely.
```python
class LocalExecutor(BaseExecutor):
    """
    LocalExecutor executes tasks locally in parallel. It uses the
    multiprocessing Python library and queues to parallelize the
    execution of tasks.
    """

    def start(self):
        self.queue = multiprocessing.JoinableQueue()
        self.result_queue = multiprocessing.Queue()
        # Start one LocalWorker per unit of configured parallelism
        self.workers = [
            LocalWorker(self.queue, self.result_queue)
            for _ in range(self.parallelism)
        ]
        for w in self.workers:
            w.start()

    def execute_async(self, key, command, queue=None):
        # Asynchronous execution: just put the command on the queue
        self.queue.put((key, command))

    def sync(self):
        while not self.result_queue.empty():
            # Drain results one by one and update the task state
            results = self.result_queue.get()
            self.change_state(*results)


class LocalWorker(multiprocessing.Process, LoggingMixin):

    def __init__(self, task_queue, result_queue):
        multiprocessing.Process.__init__(self)
        self.task_queue = task_queue
        self.result_queue = result_queue
        self.daemon = True

    def run(self):
        while True:
            # Loop forever, blocking until an element arrives on the queue
            key, command = self.task_queue.get()
            if key is None:
                # Received poison pill, no more tasks to run
                self.task_queue.task_done()
                break
            self.log.info("%s running %s", self.__class__.__name__, command)
            command = "exec bash -c '{0}'".format(command)
            try:
                # Run the command in a child process
                subprocess.check_call(command, shell=True)
                state = State.SUCCESS
            except subprocess.CalledProcessError as e:
                state = State.FAILED
                self.log.error("Failed to execute task %s.", str(e))
                # TODO: Why is this commented out?
                # raise e
            # Push the result onto result_queue
            self.result_queue.put((key, state))
            # Mark this queue item as done
            self.task_queue.task_done()
            time.sleep(1)  # Sleep 1s after each task to avoid hammering the CPU
```
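The excerpt above omits the shutdown path. For completeness, a sketch of how the companion end() method (not shown in the excerpt) terminates the workers by sending one poison pill per worker, matching the `key is None` check in LocalWorker.run(); this is close to the 1.9 implementation but reconstructed here:

```python
# LocalExecutor (continued) — sketch of the shutdown path
def end(self):
    # One poison pill per worker; each worker breaks out of its loop
    # when it dequeues a (None, None) tuple
    for _ in self.workers:
        self.queue.put((None, None))
    # Block until every queued command has been marked via task_done()
    self.queue.join()
    # Collect any remaining results
    self.sync()
```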
The job runner runs tasks as subprocesses, which is equivalent to typing a new command on the command line. It is the component, used by the scheduler, through which a generated job executes `airflow run`: this triggers the run command in the CLI, which in turn drives the executor to run the TaskInstance.

1. The runner uses a heartbeat mechanism to detect whether a job is alive and to intervene in it.
2. The actual task execution is carried out in subprocesses.

Because some tasks need to run for a long time, job state is written to the database so it can be monitored. The heartbeat callback also makes intervention possible: when the frontend sends a stop signal, the backend directly changes the job's state in the database; at the next heartbeat callback the job queries its state from the database and stops itself, as the sketch below illustrates.
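A hedged sketch of that intervention path: an external component (here a hypothetical `request_shutdown` helper, not part of Airflow) flips the row to SHUTDOWN, and the job's next heartbeat() sees the state and calls kill():

```python
from airflow import settings
from airflow.jobs import BaseJob
from airflow.utils.state import State

def request_shutdown(job_id):
    # Hypothetical helper: mark the job row as SHUTDOWN in the DB.
    # The running job notices this on its next heartbeat() and kills itself.
    session = settings.Session()
    job = session.query(BaseJob).filter(BaseJob.id == job_id).one()
    job.state = State.SHUTDOWN
    session.merge(job)
    session.commit()
    session.close()
```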
run() maintains the job's state in the database and calls self._execute() to actually run the job.
```python
class BaseJob(Base, LoggingMixin):
    # ORM mapping for the "job" table
    __tablename__ = "job"

    id = Column(Integer, primary_key=True)
    dag_id = Column(String(ID_LEN),)
    state = Column(String(20))
    job_type = Column(String(30))          # job type (polymorphic discriminator)
    start_date = Column(DateTime())
    end_date = Column(DateTime())
    latest_heartbeat = Column(DateTime())  # timestamp of the last heartbeat
    executor_class = Column(String(500))   # executor class name
    hostname = Column(String(500))         # host the job runs on
    unixname = Column(String(1000))        # unix user name

    __mapper_args__ = {
        'polymorphic_on': job_type,
        'polymorphic_identity': 'BaseJob'
    }

    __table_args__ = (
        Index('job_type_heart', job_type, latest_heartbeat),
    )

    def __init__(
            self,
            executor=executors.GetDefaultExecutor(),
            heartrate=conf.getfloat('scheduler', 'JOB_HEARTBEAT_SEC'),
            *args, **kwargs):
        self.hostname = socket.getfqdn()
        self.executor = executor
        # Record the executor's class name
        self.executor_class = executor.__class__.__name__
        self.start_date = datetime.utcnow()
        self.latest_heartbeat = datetime.utcnow()
        self.heartrate = heartrate
        self.unixname = getpass.getuser()
        super(BaseJob, self).__init__(*args, **kwargs)

    def is_alive(self):
        # Alive if the last heartbeat is within 2.1 heartbeat periods.
        # Note: timedelta.seconds wraps every 24 hours; total_seconds()
        # would be the safer choice here.
        return (
            (datetime.utcnow() - self.latest_heartbeat).seconds <
            (conf.getint('scheduler', 'JOB_HEARTBEAT_SEC') * 2.1)
        )

    def kill(self):
        self.on_kill()

    def on_kill(self):
        # Hook for subclasses to clean up when killed externally
        pass

    def heartbeat_callback(self, session=None):
        # Hook for subclasses, invoked on every heartbeat
        pass
```
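A quick illustration of the is_alive() window: with the stock `job_heartbeat_sec = 5`, a job counts as dead once no heartbeat has landed for more than 10.5 seconds. A standalone re-statement of the rule (names and the default value here are illustrative):

```python
from datetime import datetime, timedelta

JOB_HEARTBEAT_SEC = 5  # stock [scheduler] default

def is_alive(latest_heartbeat):
    # Same rule as BaseJob.is_alive(): alive within 2.1 heartbeat periods
    age = (datetime.utcnow() - latest_heartbeat).total_seconds()
    return age < JOB_HEARTBEAT_SEC * 2.1

print(is_alive(datetime.utcnow() - timedelta(seconds=4)))   # True
print(is_alive(datetime.utcnow() - timedelta(seconds=11)))  # False
```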
```python
# BaseJob (continued)
def heartbeat(self):
    '''
    Periodically update the heartbeat state.

    Heartbeats update the job's entry in the database with a timestamp
    for latest_heartbeat and allow the job to be killed externally.
    This makes it possible to monitor at the system level what is
    actually active. For instance, an old heartbeat for SchedulerJob
    would mean something is wrong. This also allows any job to be
    killed externally, regardless of who is running it or on which
    machine it is running.

    Note that if your heartbeat is set to 60 seconds and you call this
    method after 10 seconds of processing since the last heartbeat, it
    will sleep 50 seconds to complete the 60 seconds and keep a steady
    heart rate. If you go over 60 seconds before calling it, it won't
    sleep at all.
    '''
    # Reload this job's row by id
    session = settings.Session()
    job = session.query(BaseJob).filter_by(id=self.id).one()
    make_transient(job)
    session.commit()
    session.close()

    # If the state was set to SHUTDOWN externally, kill the job
    if job.state == State.SHUTDOWN:
        self.kill()

    # Figure out how long to sleep for
    sleep_for = 0
    if job.latest_heartbeat:
        sleep_for = max(
            0,
            self.heartrate - (datetime.utcnow() - job.latest_heartbeat).total_seconds())

    # Don't keep the session open while sleeping as it leaves a connection open
    session.close()
    sleep(sleep_for)

    # Update the last heartbeat time
    session = settings.Session()
    job = session.query(BaseJob).filter(BaseJob.id == self.id).first()
    job.latest_heartbeat = datetime.utcnow()
    session.merge(job)
    session.commit()

    # Invoke the subclass's heartbeat callback
    self.heartbeat_callback(session=session)
    session.close()
    self.log.debug('[heart] Boom.')
```
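The docstring's pacing rule in numbers: with heartrate = 60 and 10 seconds of work since the last beat, heartbeat() sleeps for the remaining 50 seconds; once more than 60 seconds have elapsed it does not sleep at all:

```python
# Worked example of the sleep_for computation in heartbeat()
heartrate = 60.0
for elapsed in (10.0, 59.0, 75.0):
    sleep_for = max(0, heartrate - elapsed)
    print(elapsed, '->', sleep_for)  # 10 -> 50.0, 59 -> 1.0, 75 -> 0
```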
```python
# BaseJob (continued)
def run(self):
    # Template method: run() updates the database record (mainly the
    # job state) before and after _execute() does the real work.
    # Bump a counter marking the job start
    Stats.incr(self.__class__.__name__.lower() + '_start', 1, 1)

    # Adding an entry in the DB: open a session and save the job row
    session = settings.Session()
    self.state = State.RUNNING
    session.add(self)
    session.commit()
    id_ = self.id
    # Detach the object from the session, breaking its mapping to the
    # DB so it can be passed across threads and processes
    make_transient(self)
    self.id = id_

    # Run the actual job
    self._execute()

    # Marking the success in the DB
    self.end_date = datetime.utcnow()
    self.state = State.SUCCESS
    session.merge(self)
    session.commit()
    session.close()

    # Bump a counter marking the job end
    Stats.incr(self.__class__.__name__.lower() + '_end', 1, 1)

def _execute(self):
    raise NotImplementedError("This method needs to be overridden")
```
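To make the template-method contract concrete, here is a minimal hypothetical subclass: run() does all the DB bookkeeping shown above, so a subclass only needs to fill in _execute():

```python
class HelloJob(BaseJob):
    """Hypothetical job type, for illustration only."""
    __mapper_args__ = {'polymorphic_identity': 'HelloJob'}

    def _execute(self):
        self.log.info("doing the actual work here")

# HelloJob().run()  # row goes RUNNING -> (work) -> SUCCESS in the job table
```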
```python
# BaseJob (continued)
@provide_session
def reset_state_for_orphaned_tasks(self, filter_by_dag_run=None, session=None):
    """
    If a task instance is SCHEDULED or QUEUED in the database but the
    executor knows nothing about it, reset its state to None so the
    scheduler can put it back on the task queue.
    """
    # Tasks queued in the executor
    queued_tis = self.executor.queued_tasks
    # Also consider running, as the state might not have changed in the DB yet
    running_tis = self.executor.running

    # DB states that are eligible for a reset
    resettable_states = [State.SCHEDULED, State.QUEUED]
    TI = models.TaskInstance
    DR = models.DagRun
    if filter_by_dag_run is None:
        # Query the DB for task instances of running, non-backfill DagRuns
        resettable_tis = (
            session
            .query(TI)
            .join(
                DR,
                and_(
                    TI.dag_id == DR.dag_id,
                    TI.execution_date == DR.execution_date))
            .filter(
                DR.state == State.RUNNING,
                DR.external_trigger == False,
                DR.run_id.notlike(BackfillJob.ID_PREFIX + '%'),
                TI.state.in_(resettable_states))).all()
    else:
        # Get the task instances from the given DagRun
        resettable_tis = filter_by_dag_run.get_task_instances(state=resettable_states,
                                                              session=session)

    # Tasks to reset: the DB says scheduled or queued, but the executor
    # is not actually aware of them
    tis_to_reset = []
    # Can't use an UPDATE here since it doesn't support joins
    for ti in resettable_tis:
        # Neither queued nor running in the executor
        if ti.key not in queued_tis and ti.key not in running_tis:
            tis_to_reset.append(ti)

    # Build the SQL filter for the orphaned tasks
    filter_for_tis = ([and_(TI.dag_id == ti.dag_id,
                            TI.task_id == ti.task_id,
                            TI.execution_date == ti.execution_date)
                       for ti in tis_to_reset])
    if len(tis_to_reset) == 0:
        return []

    reset_tis = (
        session
        .query(TI)
        .filter(or_(*filter_for_tis), TI.state.in_(resettable_states))
        .with_for_update()
        .all())

    # Reset the state to None
    for ti in reset_tis:
        ti.state = State.NONE
        session.merge(ti)

    task_instance_str = '\n\t'.join(
        ["{}".format(x) for x in reset_tis])
    session.commit()
    self.log.info(
        "Reset the following %s TaskInstances:\n\t%s",
        len(reset_tis), task_instance_str
    )
    return reset_tis
```
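For context, a job would typically call this at startup to recover tasks orphaned by a previous crash. A hedged sketch of such a call site, assuming a SchedulerJob-like subclass (the class and its placement are illustrative, not the exact 1.9 code):

```python
class SchedulerJobSketch(BaseJob):
    """Illustrative only; the real SchedulerJob is more involved."""

    def _execute(self):
        self.executor.start()
        # Put SCHEDULED/QUEUED rows the executor doesn't know about
        # back to state None so they get scheduled again
        self.reset_state_for_orphaned_tasks()
        # ... main scheduling loop would follow here ...
```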