本文主要介绍on_applied、on_commit、on_applied_sync、on_all_commit、on_all_applied在数据IO处理流程中的回调代码梳理。写的比较简单,不过关键点都已经整理。以filestore为例:
OSD端详细流程分析:https://blog.51cto.com/wendashuai/2497104
主端:
PrimaryLogPG::execute_ctx()->issue_repop(repop, ctx)->pgbackend->submit_transaction()->issue_op(); parent->queue_transactions()
1.
// Primary: build the three completion callbacks for a client write and hand
// the transaction to the PG backend (excerpt, body abridged):
//  - onapplied_sync : run synchronously once the local apply finishes
//  - on_all_applied : run once every shard (primary included) has applied
//  - on_all_commit  : run once every shard (primary included) has committed
void PrimaryLogPG::issue_repop(RepGather *repop, OpContext *ctx)
{
  Context *onapplied_sync = new C_OSD_OndiskWriteUnlock();  // fixed: missing ';'
  Context *on_all_applied = new C_OSD_RepopApplied(this, repop);
  Context *on_all_commit = new C_OSD_RepopCommit(this, repop);
  // -> ReplicatedBackend::submit_transaction(..., on_local_applied_sync,
  //                                          on_all_acked, on_all_commit, ...)
  pgbackend->submit_transaction(
    soid,
    ctx->delta_stats,
    ctx->at_version,
    std::move(ctx->op_t),
    pg_trim_to,
    min_last_complete_ondisk,
    ctx->log,
    ctx->updated_hset_history,
    onapplied_sync,
    on_all_applied,
    on_all_commit,
    repop->rep_tid,
    ctx->reqid,
    ctx->op);
}
2.
void ReplicatedBackend::submit_transaction(
const hobject_t &soid,
const object_stat_sum_t &delta_stats,
const eversion_t &at_version,
PGTransactionUPtr &&_t,
const eversion_t &trim_to,
const eversion_t &roll_forward_to,
const vector &_log_entries,
boost::optional &hset_history,
Context *on_local_applied_sync,
Context *on_all_acked,
Context *on_all_commit,
ceph_tid_t tid,
osd_reqid_t reqid,
OpRequestRef orig_op)
{
InProgressOp &op = in_progress_ops.insert(
make_pair(
tid,
InProgressOp(
tid, on_all_commit, on_all_acked,
orig_op, at_version)
)
).first->second;
op.waiting_for_applied.insert(
parent->get_actingbackfill_shards().begin(),
parent->get_actingbackfill_shards().end());
op.waiting_for_commit.insert(
parent->get_actingbackfill_shards().begin(),
parent->get_actingbackfill_shards().end());
//issue_op将ops的信息封装成message发送给replica osd副本的。这个操作就是在封装message,这里就不再多说了
issue_op(
soid,
at_version,
tid,
reqid,
trim_to,
at_version,
added.size() ? *(added.begin()) : hobject_t(),
removed.size() ? *(removed.begin()) : hobject_t(),
log_entries,
hset_history,
&op,
op_t);
op_t.register_on_applied_sync(on_local_applied_sync); --->on_applied_sync
op_t.register_on_applied( --->on_applied
parent->bless_context(
new C_OSD_OnOpApplied(this, &op)));
op_t.register_on_commit( --->on_commit
parent->bless_context(
new C_OSD_OnOpCommit(this, &op)));
parent->queue_transactions(tls, op.op);//int FileStore::queue_transactions()
}
3.
void ReplicatedBackend::issue_op(
const hobject_t &soid,
const eversion_t &at_version,
ceph_tid_t tid,
osd_reqid_t reqid,
eversion_t pg_trim_to,
eversion_t pg_roll_forward_to,
hobject_t new_temp_oid,
hobject_t discard_temp_oid,
const vector &log_entries,
boost::optional &hset_hist,
InProgressOp *op,
ObjectStore::Transaction &op_t)
{
get_parent()->send_message_osd_cluster(peer.osd, wr, get_osdmap()->get_epoch());//go
}
4.
副本端:
ReplicatedBackend::handle_message()--->sub_op_modify(op)--->queue_transactions()
// sub op modify 当pg的从副本接收到MSG_OSD_REPOP,调用该函数,完成本地对象的数据写入
// Replica: invoked when this PG replica receives MSG_OSD_REPOP; performs the
// local object write and registers the callbacks that report back to the
// primary. (Excerpt — body abridged to the callback registration.)
void ReplicatedBackend::sub_op_modify(OpRequestRef op)
{
// Fires once the replica's commit (journal write) completes.
rm->opt.register_on_commit(
parent->bless_context(
new C_OSD_RepModifyCommit(this, rm)));
// Fires once the replica's apply (data write) completes.
rm->localt.register_on_applied(
parent->bless_context(
new C_OSD_RepModifyApply(this, rm)));
parent->queue_transactions(tls, op);// ->int FileStore::queue_transactions()
}
5.
主回调:
class C_OSD_OnOpCommit : public Context {
ReplicatedBackend *pg;
ReplicatedBackend::InProgressOp *op;
public:
C_OSD_OnOpCommit(ReplicatedBackend *pg, ReplicatedBackend::InProgressOp *op)
: pg(pg), op(op) {}
void finish(int) override {
pg->op_commit(op);
}
};
class C_OSD_OnOpApplied : public Context {
ReplicatedBackend *pg;
ReplicatedBackend::InProgressOp *op;
public:
C_OSD_OnOpApplied(ReplicatedBackend *pg, ReplicatedBackend::InProgressOp *op)
: pg(pg), op(op) {}
void finish(int) override {
pg->op_applied(op);
}
};
6.
副本回调:
// Replica-side callback: fired when the local data write has been applied;
// forwards to ReplicatedBackend::repop_applied().
struct ReplicatedBackend::C_OSD_RepModifyApply : public Context {
  ReplicatedBackend *backend;
  RepModifyRef mod;

  C_OSD_RepModifyApply(ReplicatedBackend *pg, RepModifyRef r)
      : backend(pg), mod(r) {}

  void finish(int r) override { backend->repop_applied(mod); }
};
// Replica-side callback: fired when the local transaction has committed
// (journal write done); forwards to ReplicatedBackend::repop_commit().
struct ReplicatedBackend::C_OSD_RepModifyCommit : public Context {
  ReplicatedBackend *backend;
  RepModifyRef mod;

  C_OSD_RepModifyCommit(ReplicatedBackend *pg, RepModifyRef r)
      : backend(pg), mod(r) {}

  void finish(int r) override { backend->repop_commit(mod); }
};
7.filestore端
int FileStore::queue_transactions(Sequencer *posr, vector& tls,
TrackedOpRef osd_op,
ThreadPool::TPHandle *handle)
{
Context *onreadable;
Context *ondisk;
Context *onreadable_sync;
ObjectStore::Transaction::collect_contexts(
tls, &onreadable, &ondisk, &onreadable_sync);
}
static void collect_contexts(
vector& t,
Context **out_on_applied,
Context **out_on_commit,
Context **out_on_applied_sync)
{
list on_applied, on_commit, on_applied_sync;
for (vector::iterator i = t.begin();i != t.end();++i)
{
on_applied.splice(on_applied.end(), (*i).on_applied);
on_commit.splice(on_commit.end(), (*i).on_commit);
on_applied_sync.splice(on_applied_sync.end(), (*i).on_applied_sync);
}
*out_on_applied = C_Contexts::list_to_context(on_applied);
*out_on_commit = C_Contexts::list_to_context(on_commit);
*out_on_applied_sync = C_Contexts::list_to_context(on_applied_sync);
}
8.写journal和data回调关系
_op_journal_transactions(tbl, orig_len, o->op,new C_JournaledAhead(this, osr, o, ondisk),osd_op); ->onjournal ->oncommit
写完journal,回调ondisk,Finisher线程ondisk_finishers ---> *Finisher::finisher_thread_entry() --->complete() ---> finish()
if (ondisk) {
dout(10) << " queueing ondisk " << ondisk << dendl;
ondisk_finishers[osr->id % m_ondisk_finisher_num]->queue(ondisk);
}
写完data,回调onreadable,Finisher线程apply_finishers ---> *Finisher::finisher_thread_entry() --->complete() ---> finish()
if (o->onreadable) {//写完filestore后,数据开始可读。(此时可能写到page cache了)
apply_finishers[osr->id % m_apply_finisher_num]->queue(o->onreadable);
}
onreadable ---> out_on_applied --->on_applied
ondisk ---> out_on_commit --->on_commit
onreadable_sync --->out_on_applied_sync --->on_applied_sync
总结:
主完成了journal的写入: C_OSD_OnOpCommit pg->op_commit(op); ondisk ondisk_finishers; 此时可继续写?
主完成data写入: C_OSD_OnOpApplied pg->op_applied(op) onreadable apply_finishers; 此时写入的数据可读?
副本完成了journal的写入: C_OSD_RepModifyCommit pg->repop_commit(rm) send_message_osd_cluster发送到主
副本完成data写入: C_OSD_RepModifyApply pg->repop_applied(rm) send_message_osd_cluster发送到主
完成所有副本journal写入:all_committed on_all_commit C_OSD_RepopCommit repop_all_committed waiting_for_ondisk //called when all commit
完成所有副本data写入: all_applied on_all_applied C_OSD_RepopApplied repop_all_applied waiting_for_ack //called when all acked
Context *on_all_commit = new C_OSD_RepopCommit(this, repop);//on_all_commit
Context *on_all_applied = new C_OSD_RepopApplied(this, repop);//on_all_acked
Context *onapplied_sync = new C_OSD_OndiskWriteUnlock(xxx); //on_local_applied_sync 本地端sync完成
所有端完成了journal的写入后,此时数据已经写到journal盘,会处理waiting_for_ondisk list,ondisk状态。环境一旦崩溃,可以journal replay方式回放恢复;
所有端完成了data的写入后,即写入到了filestore层,此时代表apply成功,会处理waiting_for_ack list,向client端发送ack通知写完成,此时数据处于可读状态onreadable;