I. A Brief Introduction to APScheduler
APScheduler is a simple, single-node task scheduling framework. It supports one-off (date) jobs, interval jobs, and cron-style jobs.
It consists of the following four types of components:
1. triggers: a trigger encapsulates the scheduling policy; every job has its own trigger, which determines when the job should run next. Triggers support fixed dates, fixed intervals, and cron expressions.
2. job stores: where jobs are persisted. The default is in memory; database backends such as SQLAlchemy, Redis and MongoDB are also supported.
3. executors: where jobs are actually run. An executor submits the job's callable to a thread pool (or process pool) for execution, and when the job finishes it notifies the registered event listeners (a minimal listener sketch follows this list).
4. schedulers: the scheduler ties everything together. It holds the user-configured triggers, job stores and executors,
asks each job's trigger to compute the job's next run time, and hands due jobs to the executors for execution.
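Executor results are delivered through the scheduler's event system. As a minimal sketch using only the public APScheduler API (the names job_listener and heartbeat are just illustrative choices), a listener can be attached like this:
import datetime
from apscheduler.events import EVENT_JOB_ERROR, EVENT_JOB_EXECUTED
from apscheduler.schedulers.blocking import BlockingScheduler

def job_listener(event):
    # event.job_id identifies the job; event.exception is set when the job raised
    if event.exception:
        print('job %s raised: %r' % (event.job_id, event.exception))
    else:
        print('job %s finished, returned %r' % (event.job_id, event.retval))

def heartbeat():
    return datetime.datetime.now()

scheduler = BlockingScheduler()
scheduler.add_job(func=heartbeat, trigger='interval', seconds=5, id='heartbeat')
# Subscribe only to execution/error events; omitting the mask delivers all events
scheduler.add_listener(job_listener, EVENT_JOB_EXECUTED | EVENT_JOB_ERROR)
scheduler.start()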
II. A Simple Example
1. Using the default configuration
from apscheduler.schedulers.blocking import BlockingScheduler
import datetime

def aps_test():
    print(datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S'))

scheduler = BlockingScheduler()
scheduler.add_job(func=aps_test, trigger='cron', second='*/5')
scheduler.start()
2. Using a user-supplied configuration
import datetime
import time

from pytz import utc
from apscheduler.schedulers.background import BackgroundScheduler
from apscheduler.jobstores.sqlalchemy import SQLAlchemyJobStore
from apscheduler.executors.pool import ProcessPoolExecutor

jobstores = {
    'default': SQLAlchemyJobStore(url='sqlite:///jobs.sqlite')
}
executors = {
    'default': {'type': 'threadpool', 'max_workers': 20},
    'processpool': ProcessPoolExecutor(max_workers=5)
}
job_defaults = {
    'coalesce': False,
    'max_instances': 3
}

scheduler = BackgroundScheduler()
scheduler.configure(jobstores=jobstores, executors=executors, job_defaults=job_defaults, timezone=utc)

def aps_test():
    print(datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S'))

scheduler.add_job(func=aps_test, trigger='cron', second='*/5')
scheduler.start()

# BackgroundScheduler runs in a daemon thread, so keep the main thread alive
try:
    while True:
        time.sleep(1)
except (KeyboardInterrupt, SystemExit):
    scheduler.shutdown()
III. A Brief Look at the APScheduler Source Code (based on version 3.3.0, with the SQLAlchemy job store backend)
1. scheduler.add_job
add_job(func, trigger=None, args=None, kwargs=None, id=None, \
        name=None, misfire_grace_time=undefined, coalesce=undefined, \
        max_instances=undefined, next_run_time=undefined, \
        jobstore='default', executor='default', \
        replace_existing=False, **trigger_args)
"""
misfire_grace_time: the grace period, in seconds, after the scheduled time during which a late run may still fire.
    For example, with misfire_grace_time=60 and a run scheduled for 3:00 that gets delayed for some reason,
    the job still runs as long as it can start before 3:01; after that the run is skipped.
coalesce: whether a backlog of missed runs is merged. If the job missed several runs, e.g. because the
    process was down, coalesce=True runs it only once for the whole backlog, while coalesce=False
    runs it once for every missed time.
max_instances: the maximum number of instances of the same job that may run concurrently in the executor.
"""
# Normalize the job's execution parameters
job_kwargs = {
    'trigger': self._create_trigger(trigger, trigger_args),
    'executor': executor,
    'func': func,
    'args': tuple(args) if args is not None else (),
    'kwargs': dict(kwargs) if kwargs is not None else {},
    'id': id,
    'name': name,
    'misfire_grace_time': misfire_grace_time,
    'coalesce': coalesce,
    'max_instances': max_instances,
    'next_run_time': next_run_time
}
job_kwargs = dict((key, value) for key, value in six.iteritems(job_kwargs) if
                  value is not undefined)
job = Job(self, **job_kwargs)

# Don't really add jobs to job stores before the scheduler is up and running
with self._jobstores_lock:
    if self.state == STATE_STOPPED:
        self._pending_jobs.append((job, jobstore, replace_existing))
        self._logger.info('Adding job tentatively -- it will be properly scheduled when '
                          'the scheduler starts')
    else:
        self._real_add_job(job, jobstore, replace_existing)
return job
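As a small illustration of the parameters described in the docstring above (the job function tick and the job id tick_job are just placeholder names, not from the original post), they might be passed like this:
from datetime import datetime
from apscheduler.schedulers.blocking import BlockingScheduler

def tick():
    print(datetime.now())

scheduler = BlockingScheduler()
scheduler.add_job(func=tick, trigger='cron', minute='*',
                  id='tick_job', name='tick every minute',
                  misfire_grace_time=60,    # a late run may still fire within 60 seconds
                  coalesce=True,            # collapse a backlog of missed runs into a single run
                  max_instances=1,          # never run two instances of this job concurrently
                  replace_existing=True)    # overwrite a stored job with the same id on restart
scheduler.start()
add_job then hands the constructed Job over to _real_add_job: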
def _real_add_job(self, job, jobstore_alias, replace_existing):
    """
    Add the job to the specified job store backend.
    """
    # Fill in undefined values with defaults
    replacements = {}
    # self._job_defaults holds the following data:
    # self._job_defaults = {
    #     'misfire_grace_time': asint(job_defaults.get('misfire_grace_time', 1)),
    #     'coalesce': asbool(job_defaults.get('coalesce', True)),
    #     'max_instances': asint(job_defaults.get('max_instances', 1))
    # }
    for key, value in six.iteritems(self._job_defaults):
        if not hasattr(job, key):
            replacements[key] = value
    # Attributes the user did not set on the job fall back to these system defaults
    if not hasattr(job, 'next_run_time'):
        now = datetime.now(self.timezone)
        # A freshly added job has no next_run_time yet; compute it here via get_next_fire_time
        replacements['next_run_time'] = job.trigger.get_next_fire_time(None, now)
    job._modify(**replacements)

    # Look up the job store backend for this job. jobstore_alias can be default, memory,
    # sqlalchemy, mongodb, rethinkdb, redis or zookeeper; the available backends are declared
    # as entry points in setup.py in the source tree:
    # 'apscheduler.jobstores': [
    #     'memory = apscheduler.jobstores.memory:MemoryJobStore',
    #     'sqlalchemy = apscheduler.jobstores.sqlalchemy:SQLAlchemyJobStore [sqlalchemy]',
    #     'mongodb = apscheduler.jobstores.mongodb:MongoDBJobStore [mongodb]',
    #     'rethinkdb = apscheduler.jobstores.rethinkdb:RethinkDBJobStore [rethinkdb]',
    #     'redis = apscheduler.jobstores.redis:RedisJobStore [redis]',
    #     'zookeeper = apscheduler.jobstores.zookeeper:ZookeeperJobStore [zookeeper]'
    # ]
    store = self._lookup_jobstore(jobstore_alias)
    try:
        # With the SQLAlchemy backend the job is persisted to the database.
        # The jobs table has three columns: id, next_run_time, job_state.
        # job_state is the pickled binary form of the Job object;
        # next_run_time is the job's next scheduled run time.
        store.add_job(job)
    except ConflictingIdError:
        if replace_existing:
            store.update_job(job)
        else:
            raise
    job._jobstore_alias = jobstore_alias

    # If the user registered listeners, every change in a job's state is dispatched to them.
    # A listener may subscribe to specific event types; without a mask it receives all events.
    event = JobEvent(EVENT_JOB_ADDED, job.id, jobstore_alias)
    self._dispatch_event(event)

    # Notify the scheduler about the new job
    if self.state == STATE_RUNNING:
        self.wakeup()
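To see what the SQLAlchemy store actually writes, the table can be inspected directly. The sketch below assumes the sqlite URL from the earlier example (sqlite:///jobs.sqlite) and the store's default table name apscheduler_jobs, and relies on job_state being a pickled Job state dict; adjust it to your own configuration:
import pickle
import sqlite3

conn = sqlite3.connect('jobs.sqlite')
for job_id, next_run_time, job_state in conn.execute(
        'SELECT id, next_run_time, job_state FROM apscheduler_jobs'):
    # next_run_time is stored as a UTC timestamp; job_state is the pickled Job state dict
    state = pickle.loads(job_state)
    print(job_id, next_run_time, state['trigger'])
conn.close()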
2. scheduler.start: _main_loop()
# The core scheduling method called from start() is _main_loop().
# It blocks on a threading.Event: the loop wakes up either when wait() times out
# or when someone calls wakeup() to set the event (e.g. after a job is added),
# which avoids recomputing job run times more often than necessary.
def _main_loop(self):
    wait_seconds = TIMEOUT_MAX
    while self.state != STATE_STOPPED:
        self._event.wait(wait_seconds)
        self._event.clear()
        # _process_jobs returns the number of seconds to wait until the next job is due
        wait_seconds = self._process_jobs()
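The wait/wake pattern above is simply a threading.Event used as an interruptible sleep. A minimal standalone sketch of the same idea (not APScheduler code; the names loop and event are illustrative):
import threading
import time

event = threading.Event()

def loop():
    wait_seconds = 5.0                          # analogous to the value returned by _process_jobs()
    while True:
        woke_early = event.wait(wait_seconds)   # sleeps, but can be interrupted by event.set()
        event.clear()
        print('woken early' if woke_early else 'timed out', time.time())

threading.Thread(target=loop, daemon=True).start()
time.sleep(2)
event.set()                                     # analogous to scheduler.wakeup() after add_job()
time.sleep(4)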
3. _process_jobs
def _process_jobs(self):
    if self.state == STATE_PAUSED:
        self._logger.debug('Scheduler is paused -- not processing jobs')
        return None

    self._logger.debug('Looking for jobs to run')
    now = datetime.now(self.timezone)
    next_wakeup_time = None
    events = []

    with self._jobstores_lock:
        for jobstore_alias, jobstore in six.iteritems(self._jobstores):
            try:
                # Fetch the jobs that are due at the current time; with the SQLAlchemy store
                # this is every job whose next_run_time is no later than now
                due_jobs = jobstore.get_due_jobs(now)
            except Exception as e:
                # Schedule a wakeup at least in jobstore_retry_interval seconds
                self._logger.warning('Error getting due jobs from job store %r: %s',
                                     jobstore_alias, e)
                retry_wakeup_time = now + timedelta(seconds=self.jobstore_retry_interval)
                if not next_wakeup_time or next_wakeup_time > retry_wakeup_time:
                    next_wakeup_time = retry_wakeup_time
                continue

            for job in due_jobs:
                # Look up the job's executor
                try:
                    executor = self._lookup_executor(job.executor)
                except:
                    self._logger.error(
                        'Executor lookup ("%s") failed for job "%s" -- removing it from the '
                        'job store', job.executor, job)
                    self.remove_job(job.id, jobstore_alias)
                    continue

                # Compute the run times that are due for this job; _get_run_times returns a list
                # (there may be more than one if runs were missed, e.g. because the process
                # crashed and was restarted)
                run_times = job._get_run_times(now)
                # coalesce=True: skip the accumulated backlog and run only the most recent time
                # coalesce=False: run the job once for every missed time
                run_times = run_times[-1:] if run_times and job.coalesce else run_times
                if run_times:
                    try:
                        executor.submit_job(job, run_times)
                    except MaxInstancesReachedError:
                        self._logger.warning(
                            'Execution of job "%s" skipped: maximum number of running '
                            'instances reached (%d)', job, job.max_instances)
                        event = JobSubmissionEvent(EVENT_JOB_MAX_INSTANCES, job.id,
                                                   jobstore_alias, run_times)
                        events.append(event)
                    except:
                        self._logger.exception('Error submitting job "%s" to executor "%s"',
                                               job, job.executor)
                    else:
                        event = JobSubmissionEvent(EVENT_JOB_SUBMITTED, job.id, jobstore_alias,
                                                   run_times)
                        events.append(event)

                    # Compute and persist the job's next run time
                    job_next_run = job.trigger.get_next_fire_time(run_times[-1], now)
                    if job_next_run:
                        job._modify(next_run_time=job_next_run)
                        jobstore.update_job(job)
                    else:
                        self.remove_job(job.id, jobstore_alias)

            # Determine the earliest next run time in this job store,
            # i.e. the smallest next_run_time stored in the database
            jobstore_next_run_time = jobstore.get_next_run_time()
            if jobstore_next_run_time and (next_wakeup_time is None or
                                           jobstore_next_run_time < next_wakeup_time):
                next_wakeup_time = jobstore_next_run_time.astimezone(self.timezone)

    # Dispatch collected events
    for event in events:
        self._dispatch_event(event)

    # Determine the delay until this method should be called again
    if self.state == STATE_PAUSED:
        wait_seconds = None
        self._logger.debug('Scheduler is paused; waiting until resume() is called')
    elif next_wakeup_time is None:
        wait_seconds = None
        self._logger.debug('No jobs; waiting until a job is added')
    else:
        wait_seconds = max(timedelta_seconds(next_wakeup_time - now), 0)
        self._logger.debug('Next wakeup is due at %s (in %f seconds)', next_wakeup_time,
                           wait_seconds)

    # Return the number of seconds until the scheduler should wake up again
    return wait_seconds
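The backlog handling above ultimately rests on trigger.get_next_fire_time(previous_fire_time, now), which _get_run_times calls repeatedly to collect every missed time up to now. A rough standalone sketch of that accumulation (an illustration of the mechanism, not the library's exact code):
from datetime import datetime, timedelta
from apscheduler.triggers.cron import CronTrigger
from pytz import utc

trigger = CronTrigger(second='*/5', timezone=utc)
now = datetime.now(utc)
# Pretend the scheduler was down for the last 20 seconds
previous = now - timedelta(seconds=20)

run_times = []
next_time = trigger.get_next_fire_time(previous, now)
while next_time is not None and next_time <= now:
    run_times.append(next_time)
    next_time = trigger.get_next_fire_time(next_time, now)

print(run_times)   # all the missed times; with coalesce=True only run_times[-1:] would be submitted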
IV. References:
http://apscheduler.readthedocs.io/en/3.0/userguide.html
http://www.cnblogs.com/quijote/p/4385774.html
APScheduler extensions:
http://blog.csdn.net/chosen0ne/article/details/7925979
Source code:
https://github.com/agronholm/apscheduler/tree/v3.3.0