PG 的 backfill 请求同样会被投递到 OSD 的 work queue 中,与业务 IO 一起竞争。
// PGRecovery::run -- executes one queued recovery item for a PG.
//
// Forwards the PG to OSD::do_recovery() together with the epoch at which the
// item was queued (epoch_queued) and the number of pushes reserved for it
// (reserved_pushes), then unlocks the PG.
//
// Parameters:
//   osd    - OSD that performs the recovery work.
//   sdata  - shard data; not referenced in the body shown here.
//   pg     - PG to recover.
//   handle - thread-pool handle, passed through to do_recovery().
//
// NOTE(review): only unlock() appears here, so the PG is presumably already
// locked when run() is entered -- confirm against the dequeue path.
57 void PGRecovery::run(
58 OSD *osd,
59 OSDShard *sdata,
60 PGRef& pg,
61 ThreadPool::TPHandle &handle)
62 {
63 osd->do_recovery(pg.get(), epoch_queued, reserved_pushes, handle);
64 pg->unlock();
65 }
// OSD::do_recovery -- runs one round of recovery work for `pg`.
//
// This is an excerpt: every "..." line stands for source lines that were
// elided from the listing, so only the steps visible below are described.
//
// Parameters:
//   pg              - PG to run recovery on.
//   queued          - epoch at which the recovery item was queued
//                     (not used in the lines shown).
//   reserved_pushes - number of pushes reserved for this round; passed to
//                     start_recovery_ops() and released at the end.
//   handle          - thread-pool handle, passed to start_recovery_ops().
10018 void OSD::do_recovery(
10019 PG *pg, epoch_t queued, uint64_t reserved_pushes,
10020 ThreadPool::TPHandle &handle)
10021 {
...
// Fetch the recovery sleep value; how it is applied is in the elided lines.
10031 float recovery_sleep = get_osd_recovery_sleep();
...
10070 dout(10) << "do_recovery starting " << reserved_pushes << " " << *pg << dendl;
...
// Ask the PG to start recovery ops; the number actually started is written
// to `started` (declared in the elided lines).
10075 bool do_unfound = pg->start_recovery_ops(reserved_pushes, handle, &started);
10076 dout(10) << "do_recovery started " << started << "/" << reserved_pushes
10077 << " on " << *pg << dendl;
...
// Give back the pushes that were reserved for this round.
10089 service.release_reserved_pushes(reserved_pushes);
10090 }
12452 bool PrimaryLogPG::start_recovery_ops(
12453 uint64_t max,
12454 ThreadPool::TPHandle &handle,
12455 uint64_t *ops_started)
12456 {
...
12461 ceph_assert(is_primary());
...
12465 ceph_assert(recovery_queued);
12466 recovery_queued = false; // recovery op已经得到,pg的下一轮恢复需要重新入队检查
...
12502 if (recovering.empty() &&
12503 state_test(PG_STATE_BACKFILLING) &&
12504 !backfill_targets.empty() && started < max &&
12505 missing.num_missing() == 0 &&
12506 waiting_on_backfill.empty()) {
12507 if (get_osdmap()->test_flag(CEPH_OSDMAP_NOBACKFILL)) {
12508 dout(10) << "deferring backfill due to NOBACKFILL" << dendl;
12509 deferred_backfill = true;
12510 } else if (get_osdmap()->test_flag(CEPH_OSDMAP_NOREBALANCE) &&
12511 !is_degraded()) {
12512 dout(10) << "deferring backfill due to NOREBALANCE" << dendl;
12513 deferred_backfill = true;
12514 } else if (!backfill_reserved) {
12515 dout(10) << "deferring backfill due to !backfill_reserved" << dendl;
12516 if (!backfill_reserving) {
12517 dout(10) << "queueing RequestBackfill" << dendl;
12518 backfill_reserving = true;
12519 queue_peering_event(
12520 PGPeeringEventRef(
12521 std::make_shared(
12522 get_osdmap_epoch(),
12523 get_osdmap_epoch(),
12524 RequestBackfill())));
12525 }
12526 deferred_backfill = true;
12527 } else {
12528 started += recover_backfill(max - started, handle, &work_in_progress);
12530 }
12531
12532 dout(10) << " started " << started << dendl;
Backfill targets 是需要回填的目标 pg shard。例如:
[1,28,120,278,90,17,210,322,184,186]/[1,121,192,278,106,17,60,322,31,53] 分别是某个 pg 当前的 up set 和 acting set。由于扩容导致 up set 与 acting set 不一致,需要进行 backfill。在本例中,backfill target 就是 up set 中与 acting set 对应位置不同的那些 OSD(括号内为 shard 编号,即该 OSD 在 up set 中的下标):
[28(1),90(4),120(2),184(8),186(9),210(6)]
recover_backfill 的每一轮循环会先取出 check obj(即 earliest_peer_backfill() 的返回值);有了 check obj 之后,各个 backfill target 的 pg shard 都要以它为基准进行比对、向 check obj 看齐:
13049 uint64_t PrimaryLogPG::recover_backfill(
13050 uint64_t max,
13051 ThreadPool::TPHandle &handle, bool *work_started)
13052 {
...
13061 if (new_backfill) { // pg第一次执行backfill,需要把peer_backfill_info、backfill_info、backfills_in_flight重置
13062 // on_activate() was called prior to getting here
13063 ceph_assert(last_backfill_started == earliest_backfill());
13064 new_backfill = false;
13065
13066 // initialize BackfillIntervals
13067 for (set::iterator i = backfill_targets.begin();
13068 i != backfill_targets.end();
13069 ++i) {
13070 peer_backfill_info[*i].reset(peer_info[*i].last_backfill);
13071 }
13072 backfill_info.reset(last_backfill_started);
13073
13074 backfills_in_flight.clear();
13075 pending_backfill_updates.clear();
13076 }
13077
...
13089 // update our local interval to cope with recent changes
13090 backfill_info.begin = last_backfill_started; // 更新begin对象为last_backfill_started
13091 update_range(&backfill_info, handle);
13092
13093 unsigned ops = 0;
13094 vector > to_remove; // 放置要删除的obj
13095 set add_to_stat;
13096
13097 for (set::iterator i = backfill_targets.begin(); // 遍历backfill targets,更新peer_backfill_info的begin对象
13098 i != backfill_targets.end();
13099 ++i) {
13100 peer_backfill_info[*i].trim_to(
13101 std::max(peer_info[*i].last_backfill, last_backfill_started));
13102 }
13103 backfill_info.trim_to(last_backfill_started);
13104
13105 PGBackend::RecoveryHandle *h = pgbackend->open_recovery_op();
13106 while (ops < max) {
...
13116 dout(20) << " my backfill interval " << backfill_info << dendl;
...
13148 if (backfill_info.empty() && all_peer_done()) {
13149 dout(10) << " reached end for both local and all peers" << dendl;
13150 break;
13151 }
13152
13153 // Get object within set of peers to operate on and
13154 // the set of targets for which that object applies.
13155 hobject_t check = earliest_peer_backfill();
13156
13157 if (check < backfill_info.begin) {
...
13189 } else {
13190 eversion_t& obj_v = backfill_info.objects.begin()->second;
13191
13192 vector need_ver_targs, missing_targs, keep_ver_targs, skip_targs;
13193 for (set::iterator i = backfill_targets.begin();
13194 i != backfill_targets.end();
13195 ++i) {
13196 pg_shard_t bt = *i;
13197 BackfillInterval& pbi = peer_backfill_info[bt];
13198 // Find all check peers that have the wrong version
13199 if (check == backfill_info.begin && check == pbi.begin) {
13200 if (pbi.objects.begin()->second != obj_v) {
13201 need_ver_targs.push_back(bt);
13202 } else {
13203 keep_ver_targs.push_back(bt);
13204 }
13205 } else {
13206 pg_info_t& pinfo = peer_info[bt];
13207
13208 // Only include peers that we've caught up to their backfill line
13209 // otherwise, they only appear to be missing this object
13210 // because their pbi.begin > backfill_info.begin.
13211 if (backfill_info.begin > pinfo.last_backfill)
13212 missing_targs.push_back(bt);
13213 else
13214 skip_targs.push_back(bt);
13215 }
13216 }
...
13225 if (!need_ver_targs.empty() || !missing_targs.empty()) {
13227 ceph_assert(obc);
13228 if (obc->get_recovery_read()) {
13229 if (!need_ver_targs.empty()) {
13230 dout(20) << " BACKFILL replacing " << check
13231 << " with ver " << obj_v
13232 << " to peers " << need_ver_targs << dendl;
13233 }
13234 if (!missing_targs.empty()) {
13235 dout(20) << " BACKFILL pushing " << backfill_info.begin
13236 << " with ver " << obj_v
13237 << " to peers " << missing_targs << dendl;
13238 }
13239 vector all_push = need_ver_targs;
13240 all_push.insert(all_push.end(), missing_targs.begin(), missing_targs.end()); // 记录所有要推送的target shard
13241
13242 handle.reset_tp_timeout(); // 防止线程心跳超时
13243 int r = prep_backfill_object_push(backfill_info.begin, obj_v, obc, all_push, h);
...
13249 ops++;
...
13276 }
...
13315 pgbackend->run_recovery_op(h, get_recovery_op_priority());
// ECBackend::run_recovery_op -- starts every recovery op collected in the
// handle and sends out the resulting messages.
//
// Parameters:
//   _h       - recovery handle holding the ops and deletes to run; it is
//              deleted at the end of this function, so the caller must not
//              use it afterwards.
//   priority - priority passed to the message dispatch and delete calls.
715 void ECBackend::run_recovery_op(
716 RecoveryHandle *_h,
717 int priority)
718 {
// NOTE(review): the template arguments of static_cast and list below appear
// to have been lost in this listing (presumably static_cast<ECRecoveryHandle*>
// and a list of RecoveryOp) -- confirm against the upstream source.
719 ECRecoveryHandle *h = static_cast(_h);
// Messages produced while advancing the ops are accumulated in `m`.
720 RecoveryMessages m;
721 for (list::iterator i = h->ops.begin();
722 i != h->ops.end();
723 ++i) {
724 dout(10) << __func__ << ": starting " << *i << dendl;
// An object must not already have a recovery op registered.
725 ceph_assert(!recovery_ops.count(i->hoid));
// Register a copy of the op keyed by its object id and work on that copy.
726 RecoveryOp &op = recovery_ops.insert(make_pair(i->hoid, *i)).first->second;
727 continue_recovery_op(op, &m);
728 }
729
// Send everything gathered above, then the deletes carried by the handle.
730 dispatch_recovery_messages(m, priority);
731 send_recovery_deletes(priority, h->deletes);
732 delete _h;
733 }
单个 obj 的恢复过程表示为一个状态机,由 continue_recovery_op 根据 op.state 驱动(下面只摘录了 IDLE 状态的处理):
567 void ECBackend::continue_recovery_op(
568 RecoveryOp &op,
569 RecoveryMessages *m)
570 {
571 dout(10) << __func__ << ": continuing " << op << dendl;
572 while (1) {
573 switch (op.state) {
574 case RecoveryOp::IDLE: {
575 // start read
576 op.state = RecoveryOp::READING;
577 ceph_assert(!op.recovery_progress.data_complete);
578 set want(op.missing_on_shards.begin(), op.missing_on_shards.end());
579 uint64_t from = op.recovery_progress.data_recovered_to;
580 uint64_t amount = get_recovery_chunk_size();
581
582 if (op.recovery_progress.first && op.obc) {
583 /* We've got the attrs and the hinfo, might as well use them */
584 op.hinfo = get_hash_info(op.hoid);
585 ceph_assert(op.hinfo);
586 op.xattrs = op.obc->attr_cache;
587 encode(*(op.hinfo), op.xattrs[ECUtil::get_hinfo_key()]);
588 }
589
590 map>> to_read;
591 int r = get_min_avail_to_read_shards(
592 op.hoid, want, true, false, &to_read);
593 if (r != 0) {
594 // we must have lost a recovery source
595 ceph_assert(!op.recovery_progress.first);
596 dout(10) << __func__ << ": canceling recovery op for obj " << op.hoid
597 << dendl;
598 get_parent()->cancel_pull(op.hoid);
599 recovery_ops.erase(op.hoid);
600 return;
601 }
602 m->read(
603 this,
604 op.hoid,
605 op.recovery_progress.data_recovered_to,
606 amount,
607 std::move(want),
608 to_read,
609 op.recovery_progress.first && !op.obc);
610 op.extent_requested = make_pair(
611 from,
612 amount);
613 dout(10) << __func__ << ": IDLE return " << op << dendl;
614 return;
615 }
后面就是等待读取请求的回复:收到所有的回复之后,发起 push;收到所有的 push reply 之后,说明该 obj 完成了 backfill。如此重复,直到 pg 中所有需要 backfill 的 obj 都处理完成。