airflow源码精读 八

LocalExecutor 本地执行器

本地执行器是和调度器在同一台机器上一起执行的执行器。本地执行器可以有限并行度(进程池)执行,也可以以无限并行度的方式执行。

![image-20230309214349385](/Users/linglingdai/Library/Application Support/typora-user-images/image-20230309214349385.png)

  • 不同的版本实现不一样,本文依据1.9版本分析:该版本根据配置的并行度启动相应数量的LocalWorker来控制并发

  • 并行度由配置项 core.PARALLELISM 决定,通过 conf.getint('core', 'PARALLELISM') 读取

基于一种生产者-消费者模型:在一台服务器内,LocalExecutor启动多个LocalWorker子进程(multiprocessing.Process,是多进程而非多线程),LocalExecutor接收taskinstance并放入self.queue队列,多个LocalWorker通过进程安全的multiprocessing队列并发消费队列中的元素

class LocalExecutor(BaseExecutor):
    """
    Executor that runs tasks in parallel on the local machine.

    Commands are handed to ``LocalWorker`` processes through a
    ``multiprocessing.JoinableQueue``; the workers report their outcome
    through a ``multiprocessing.Queue``.
    """

    def start(self):
        """Create the task/result queues and launch ``self.parallelism`` workers."""
        self.queue = multiprocessing.JoinableQueue()
        self.result_queue = multiprocessing.Queue()

        # One worker per unit of configured parallelism.
        workers = []
        for _ in range(self.parallelism):
            workers.append(LocalWorker(self.queue, self.result_queue))
        self.workers = workers

        for worker in workers:
            worker.start()

    def execute_async(self, key, command, queue=None):
        """
        Enqueue ``(key, command)`` for a worker to pick up.

        The ``queue`` argument is accepted but not used here.
        """
        task = (key, command)
        self.queue.put(task)

    def sync(self):
        """Drain the result queue, passing each result to ``change_state``."""
        while True:
            if self.result_queue.empty():
                return
            outcome = self.result_queue.get()
            self.change_state(*outcome)

class LocalWorker(multiprocessing.Process, LoggingMixin):
    """
    Worker process that consumes ``(key, command)`` pairs from a task queue,
    runs each command with ``bash -c`` and puts ``(key, state)`` on a result
    queue.

    :param task_queue: joinable queue supplying ``(key, command)`` tuples; a
        tuple whose key is ``None`` is the poison pill that stops the worker
    :param result_queue: queue that receives ``(key, state)`` tuples
    """

    def __init__(self, task_queue, result_queue):
        multiprocessing.Process.__init__(self)
        self.task_queue = task_queue
        self.result_queue = result_queue
        self.daemon = True

    def run(self):
        """Process tasks until a poison pill (``key is None``) is received."""
        while True:
            # Blocks until an item is available on the queue.
            key, command = self.task_queue.get()
            if key is None:
                # Received poison pill, no more tasks to run
                self.task_queue.task_done()
                break
            self.log.info("%s running %s", self.__class__.__name__, command)
            try:
                # Pass the command to bash as a single argv element instead of
                # splicing it into a single-quoted shell string: a command that
                # itself contains a single quote would otherwise break the
                # quoting and be re-split by the outer shell.
                subprocess.check_call(["bash", "-c", command])
                state = State.SUCCESS
            except subprocess.CalledProcessError as e:
                # A non-zero exit status is reported through the result queue
                # as FAILED; it is deliberately not re-raised so the worker
                # keeps serving the queue.
                state = State.FAILED
                self.log.error("Failed to execute task %s.", str(e))
            # Publish the outcome, then mark the queue item as processed.
            self.result_queue.put((key, state))
            self.task_queue.task_done()
            # Pause one second after each task (original note: to avoid
            # excessive CPU consumption).
            time.sleep(1)
            

任务调度器

任务运行器以子进程的方式运行任务,相当于在命令行中输入一行新的命令。

它是供scheduler调度器使用的组件:生成job并执行 airflow run,触发cli中的run命令,并进一步驱动executor执行TaskInstance。

1,任务运行器通过心跳机制,检测任务的存活以及对任务进行干预。

2,任务的执行通过子进程方式展开。

  • 由于有的任务需要长期执行,因此需要将任务的状态写入数据库,以方便监控;

  • 通过心跳回调,可以对任务进行干预

  • 前端发送一个停止任务的信号,则可以后端直接修改数据库中任务的状态,当发生心跳回调时,查询数据库中任务的状态,然后停止任务

BaseJob 作业基础类

run()维护数据库内job状态,并实际调用self._execute(),触发作业job执行

class BaseJob(Base, LoggingMixin):
    """
    ORM-mapped base class for jobs, stored in the ``job`` table.

    ``run()`` records the job in the database, delegates the actual work to
    ``_execute()`` (which subclasses must override) and then records the
    final state. ``heartbeat()`` refreshes ``latest_heartbeat`` and calls
    ``kill()`` when the job's database state is ``State.SHUTDOWN``.
    """
    # Table name
    __tablename__ = "job"

    id = Column(Integer, primary_key=True)
    dag_id = Column(String(ID_LEN),)
    state = Column(String(20))
    job_type = Column(String(30))  # job type (polymorphic discriminator)
    start_date = Column(DateTime())
    end_date = Column(DateTime())
    latest_heartbeat = Column(DateTime())  # timestamp of the last heartbeat
    executor_class = Column(String(500))  # class name of the executor
    hostname = Column(String(500))  # host the job runs on
    unixname = Column(String(1000))  # user name

    __mapper_args__ = {
        'polymorphic_on': job_type,
        'polymorphic_identity': 'BaseJob'
    }

    __table_args__ = (
        Index('job_type_heart', job_type, latest_heartbeat),
    )

    # NOTE: both defaults below are evaluated once, when this method is
    # defined, not on every call.
    def __init__(
            self,
            executor=executors.GetDefaultExecutor(),
            heartrate=conf.getfloat('scheduler', 'JOB_HEARTBEAT_SEC'),
            *args, **kwargs):
        """
        :param executor: executor instance used by this job
        :param heartrate: target number of seconds between heartbeats
        """
        self.hostname = socket.getfqdn()
        self.executor = executor
        # Class name of the executor
        self.executor_class = executor.__class__.__name__
        self.start_date = datetime.utcnow()
        self.latest_heartbeat = datetime.utcnow()
        self.heartrate = heartrate
        self.unixname = getpass.getuser()
        super(BaseJob, self).__init__(*args, **kwargs)

    def is_alive(self):
        """
        Return True if the last heartbeat is more recent than 2.1 times the
        configured ``JOB_HEARTBEAT_SEC``.
        """
        # total_seconds() is required here: timedelta.seconds discards the
        # days component, so a heartbeat more than a day old could otherwise
        # look recent.
        return (
            (datetime.utcnow() - self.latest_heartbeat).total_seconds() <
            (conf.getint('scheduler', 'JOB_HEARTBEAT_SEC') * 2.1)
        )

    def kill(self):
        """Invoke the ``on_kill`` hook."""
        self.on_kill()

    def on_kill(self):
        """Hook called by ``kill()``; does nothing by default."""
        pass

    def heartbeat_callback(self, session=None):
        """Hook called at the end of every ``heartbeat()``; no-op by default."""
        pass

    def heartbeat(self):
        '''
        Periodically update the heartbeat status.

        Heartbeats update the job's entry in the database with a timestamp
        for the latest_heartbeat and allows for the job to be killed
        externally. This allows at the system level to monitor what is
        actually active.

        For instance, an old heartbeat for SchedulerJob would mean something
        is wrong.

        This also allows for any job to be killed externally, regardless
        of who is running it or on which machine it is running.

        Note that if your heartbeat is set to 60 seconds and you call this
        method after 10 seconds of processing since the last heartbeat, it
        will sleep 50 seconds to complete the 60 seconds and keep a steady
        heart rate. If you go over 60 seconds before calling it, it won't
        sleep at all.
        '''
        # Load this job's row by id. The session is closed before sleeping so
        # no connection is held open during the sleep.
        session = settings.Session()
        try:
            job = session.query(BaseJob).filter_by(id=self.id).one()
            make_transient(job)
            session.commit()
        finally:
            session.close()

        # Kill the job if it has been marked SHUTDOWN in the database
        if job.state == State.SHUTDOWN:
            self.kill()

        # Figure out how long to sleep for
        sleep_for = 0
        if job.latest_heartbeat:
            elapsed = (datetime.utcnow() - job.latest_heartbeat).total_seconds()
            sleep_for = max(0, self.heartrate - elapsed)

        sleep(sleep_for)

        # Update last heartbeat time
        session = settings.Session()
        try:
            job = session.query(BaseJob).filter(BaseJob.id == self.id).first()
            job.latest_heartbeat = datetime.utcnow()
            session.merge(job)
            session.commit()
            # Run the subclass-provided heartbeat callback
            self.heartbeat_callback(session=session)
        finally:
            session.close()
        self.log.debug('[heart] Boom.')

    def run(self):
        """
        Template method: persist the job as RUNNING, call ``_execute()``,
        then persist it as SUCCESS with an end date.

        An exception raised by ``_execute()`` propagates to the caller; in
        that case the success state is not written.
        """
        # Increment the "<jobclass>_start" counter.
        Stats.incr(self.__class__.__name__.lower() + '_start', 1, 1)
        # Adding an entry in the DB
        session = settings.Session()
        try:
            self.state = State.RUNNING
            session.add(self)
            session.commit()
            id_ = self.id
            # Detach the object from the session; the id is restored
            # explicitly afterwards.
            make_transient(self)
            self.id = id_

            # Run the actual job
            self._execute()

            # Marking the success in the DB
            self.end_date = datetime.utcnow()
            self.state = State.SUCCESS
            session.merge(self)
            session.commit()
        finally:
            # Always release the session, even when _execute() raises.
            session.close()
        # Increment the "<jobclass>_end" counter.
        Stats.incr(self.__class__.__name__.lower() + '_end', 1, 1)

    def _execute(self):
        """Perform the job's work; must be overridden by subclasses."""
        raise NotImplementedError("This method needs to be overridden")

    @provide_session
    def reset_state_for_orphaned_tasks(self, filter_by_dag_run=None, session=None):
        """
        Reset task instances whose database state is SCHEDULED or QUEUED but
        which the executor neither has queued nor is running. Their state is
        set to ``State.NONE`` so that the scheduler can queue them again.

        :param filter_by_dag_run: if given, only task instances of this DAG
            run are considered; otherwise task instances of all running,
            non-externally-triggered, non-backfill DAG runs are considered
        :param session: database session
        :return: the task instances that were reset (empty list if none)
        """
        # Tasks queued in the executor
        queued_tis = self.executor.queued_tasks
        # also consider running as the state might not have changed in the db yet
        running_tis = self.executor.running

        resettable_states = [State.SCHEDULED, State.QUEUED]
        TI = models.TaskInstance
        DR = models.DagRun
        if filter_by_dag_run is None:
            # Task instances in a resettable state that belong to running,
            # non-externally-triggered, non-backfill DAG runs.
            resettable_tis = (
                session
                .query(TI)
                .join(
                    DR,
                    and_(
                        TI.dag_id == DR.dag_id,
                        TI.execution_date == DR.execution_date))
                .filter(
                    DR.state == State.RUNNING,
                    # "== False" is intentional: it builds a SQL expression.
                    DR.external_trigger == False,  # noqa: E712
                    DR.run_id.notlike(BackfillJob.ID_PREFIX + '%'),
                    TI.state.in_(resettable_states))).all()
        else:
            # Fetch the task instances from the given DagRun
            resettable_tis = filter_by_dag_run.get_task_instances(state=resettable_states,
                                                                  session=session)

        # Keep only the instances the executor does not know about: neither
        # queued nor running.
        # Can't use an update here since it doesn't support joins
        tis_to_reset = [
            ti for ti in resettable_tis
            if ti.key not in queued_tis and ti.key not in running_tis
        ]
        if not tis_to_reset:
            return []

        # One (dag_id, task_id, execution_date) condition per instance
        filter_for_tis = [and_(TI.dag_id == ti.dag_id,
                               TI.task_id == ti.task_id,
                               TI.execution_date == ti.execution_date)
                          for ti in tis_to_reset]
        reset_tis = (
            session
            .query(TI)
            .filter(or_(*filter_for_tis), TI.state.in_(resettable_states))
            .with_for_update()
            .all())
        # Reset the state
        for ti in reset_tis:
            ti.state = State.NONE
            session.merge(ti)
        task_instance_str = '\n\t'.join(
            ["{}".format(x) for x in reset_tis])
        session.commit()

        self.log.info(
            "Reset the following %s TaskInstances:\n\t%s",
            len(reset_tis), task_instance_str
        )
        return reset_tis

你可能感兴趣的:(调度系统,python)