Ceph Crush算法是Ceph分布式系统中用于数据分布(定位)的核心算法,其核心组件有crush rule、bucket algorithm。crush rule是可以自定义的选择过程,bucket algorithm是从bucket选取item时使用的算法,该算法需要的主要参数有:placement seed(pgid)、crush map、副本数等。本文将简要介绍Ceph Crush算法的实现。
{
//devices
"devices": [
{
"id": 0,
"name": "osd.0"
},
{
"id": 1,
"name": "osd.1"
},
...
{
"id": 9,
"name": "osd.9"
}
],
//type
"types": [
{
"type_id": 0,
"name": "osd"
},
{
"type_id": 1,
"name": "host"
},
...
{
"type_id": 10,
"name": "root"
}
],
//buckets
"buckets": [
{
"id": -1,
"name": "default",
"type_id": 10,
"type_name": "root",
"weight": 821160,
"alg": "straw",
"hash": "rjenkins1",
"items": [
{
"id": -2,
"weight": 142868,
"pos": 0
},
{
"id": -3,
"weight": 142868,
"pos": 1
},
{
"id": -8,
"weight": 178910,
"pos": 2
},
{
"id": -10,
"weight": 356514,
"pos": 3
}
]
},
{
"id": -2,
"name": "ceph-osd-240",
"type_id": 1,
"type_name": "host",
"weight": 142868,
"alg": "straw",
"hash": "rjenkins1",
"items": [
{
"id": 0,
"weight": 142868,
"pos": 0
}
]
},
{
"id": -3,
"name": "ceph-osd-241",
"type_id": 1,
"type_name": "host",
"weight": 142868,
"alg": "straw",
"hash": "rjenkins1",
"items": [
{
"id": 1,
"weight": 142868,
"pos": 0
}
]
},
{
"id": -8,
"name": "ceph-osd-66",
"type_id": 1,
"type_name": "host",
"weight": 178910,
"alg": "straw",
"hash": "rjenkins1",
"items": [
{
"id": 2,
"weight": 36044,
"pos": 0
},
{
"id": 3,
"weight": 32768,
"pos": 1
},
{
"id": 4,
"weight": 3276,
"pos": 2
},
{
"id": 5,
"weight": 34078,
"pos": 3
},
{
"id": 6,
"weight": 36044,
"pos": 4
},
{
"id": 7,
"weight": 36700,
"pos": 5
}
]
},
{
"id": -10,
"name": "ceph-osd-253",
"type_id": 1,
"type_name": "host",
"weight": 356514,
"alg": "straw",
"hash": "rjenkins1",
"items": [
{
"id": 8,
"weight": 178257,
"pos": 0
},
{
"id": 9,
"weight": 178257,
"pos": 1
}
]
}
],
//crush rule
"rules": [
{
"rule_id": 0,
"rule_name": "replicated_ruleset",
"ruleset": 0,
"type": 1,
"min_size": 1,
"max_size": 10,
"steps": [
{
"op": "take",
"item": -1,
"item_name": "default"
},
{
"op": "chooseleaf_firstn",
"num": 0,
"type": "host"
},
{
"op": "emit"
}
]
}
],
//相关的可调控的配置参数
"tunables": {
"choose_local_tries": 0,
"choose_local_fallback_tries": 0,
"choose_total_tries": 50,
"chooseleaf_descend_once": 1,
"chooseleaf_vary_r": 0,
"straw_calc_version": 1,
"allowed_bucket_algs": 22,
"profile": "unknown",
"optimal_tunables": 0,
"legacy_tunables": 0,
"require_feature_tunables": 1,
"require_feature_tunables2": 1,
"require_feature_tunables3": 0,
"has_v2_rules": 0,
"has_v3_rules": 0,
"has_v4_buckets": 0
}
}
crush rule中的step op codes
/*
 * Step op codes: the operation carried by each step of a CRUSH rule
 * (stored in crush_rule_step.op).  The value 5 is not assigned here.
 */
enum {
    CRUSH_RULE_NOOP = 0,
    CRUSH_RULE_TAKE = 1,          /* arg1 = value to start with */
    CRUSH_RULE_CHOOSE_FIRSTN = 2, /* arg1 = num items to pick */
                                  /* arg2 = type */
    CRUSH_RULE_CHOOSE_INDEP = 3,  /* same */
    CRUSH_RULE_EMIT = 4,          /* no args */
    CRUSH_RULE_CHOOSELEAF_FIRSTN = 6,
    CRUSH_RULE_CHOOSELEAF_INDEP = 7,
    CRUSH_RULE_SET_CHOOSE_TRIES = 8,     /* override choose_total_tries */
    CRUSH_RULE_SET_CHOOSELEAF_TRIES = 9, /* override chooseleaf_descend_once */
    CRUSH_RULE_SET_CHOOSE_LOCAL_TRIES = 10,
    CRUSH_RULE_SET_CHOOSE_LOCAL_FALLBACK_TRIES = 11,
    CRUSH_RULE_SET_CHOOSELEAF_VARY_R = 12
};
crush_map结构体
/*
 * CRUSH map includes all buckets, rules, etc.
 *
 * It also carries the tunables that crush_do_rule() reads to decide how
 * often a choice is retried.
 */
struct crush_map {
/* bucket table; the bucket with (negative) id b is stored at index -1-b */
struct crush_bucket **buckets;
/* rule table, indexed by rule number */
struct crush_rule **rules;
/* upper bound used when validating a bucket index (-1-id) */
__s32 max_buckets;
/* rule numbers >= max_rules are rejected by crush_do_rule() */
__u32 max_rules;
/* device ids are accepted in the range [0, max_devices) */
__s32 max_devices;
/* choose local retries before re-descent */
__u32 choose_local_tries;
/* choose local attempts using a fallback permutation before
 * re-descent */
__u32 choose_local_fallback_tries;
/* choose attempts before giving up */
__u32 choose_total_tries;
/* attempt chooseleaf inner descent once for firstn mode; on
 * reject retry outer descent. Note that this does *not*
 * apply to a collision: in that case we will retry as we used
 * to. */
__u32 chooseleaf_descend_once;
/* if non-zero, feed r into chooseleaf, bit-shifted right by
 * (chooseleaf_vary_r - 1) bits. a value of 1 is best for new
 * clusters. for legacy clusters that want to limit reshuffling, a
 * value of 3 or 4 will make the mappings line up a bit better with
 * previous mappings. */
__u8 chooseleaf_vary_r;
/*
 * version 0 (original) of straw_calc has various flaws. version 1
 * fixes a few of them.
 */
__u8 straw_calc_version;
/*
 * allowed bucket algs is a bitmask, here the bit positions
 * are CRUSH_BUCKET_*. note that these are *bits* and
 * CRUSH_BUCKET_* values are not, so we need to or together (1
 * << CRUSH_BUCKET_WHATEVER). The 0th bit is not used to
 * minimize confusion (bucket type values start at 1).
 */
__u32 allowed_bucket_algs;
/*
 * optional counters: when non-NULL, crush_choose_firstn() increments
 * choose_tries[ftotal] for every item it places, as long as
 * ftotal <= choose_total_tries.
 */
__u32 *choose_tries;
};
/*
 * CRUSH uses user-defined "rules" to describe how inputs should be
 * mapped to devices. A rule consists of sequence of steps to perform
 * to generate the set of output devices.
 *
 * One step of a rule: an op code plus up to two arguments whose
 * meaning depends on the op (see the CRUSH_RULE_* step op codes).
 */
struct crush_rule_step {
/* one of the CRUSH_RULE_* step op codes */
__u32 op;
/* TAKE: item to start from; CHOOSE*: number of items to pick; SET_*: new value */
__s32 arg1;
/* CHOOSE*: type of item to pick */
__s32 arg2;
};
/*
 * The rule mask is used to describe what the rule is intended for.
 * Given a ruleset and size of output set, we search through the
 * rule list for a matching rule_mask.
 */
struct crush_rule_mask {
/* ruleset this rule belongs to */
__u8 ruleset;
/* rule type -- presumably matched against the pool type; confirm in find_rule */
__u8 type;
/* presumably the smallest output-set size this rule applies to -- TODO confirm */
__u8 min_size;
/* presumably the largest output-set size this rule applies to -- TODO confirm */
__u8 max_size;
};
crush_bucket结构体:
/*
 * Generic bucket header.  Algorithm-specific bucket structs (for example
 * struct crush_bucket_straw) embed it as their first member, and
 * crush_bucket_choose() casts a crush_bucket pointer to the struct
 * selected by @alg.
 */
struct crush_bucket {
__s32 id; /* this'll be negative */
__u16 type; /* non-zero; type=0 is reserved for devices */
__u8 alg; /* one of CRUSH_BUCKET_* */
__u8 hash; /* which hash function to use, CRUSH_HASH_* */
__u32 weight; /* 16-bit fixed point */
__u32 size; /* num items */
/* ids of the contained items: negative for buckets, non-negative for devices */
__s32 *items;
/*
 * cached random permutation: used for uniform bucket and for
 * the linear search fallback for the other bucket types.
 */
__u32 perm_x; /* @x for which *perm is defined */
__u32 perm_n; /* num elements of *perm that are permuted/defined */
__u32 *perm;
};
/*
 * A CRUSH rule: the policy used to map a pg onto devices.
 *
 * The steps are stored directly after the fixed header; crush_do_rule()
 * walks steps[0] .. steps[len - 1] in order.
 */
struct crush_rule {
    __u32 len;                      /* number of entries in steps[] */
    struct crush_rule_mask mask;    /* what this rule is intended for */
    struct crush_rule_step steps[]; /* C99 flexible array member */
};
/*
 * Straw bucket: the generic bucket header plus two arrays indexed like
 * h.items[].  bucket_straw_choose() multiplies each item's 16-bit hash
 * draw by straws[i] and picks the item with the largest product.
 */
struct crush_bucket_straw {
struct crush_bucket h;
__u32 *item_weights; /* 16-bit fixed point */
__u32 *straws; /* 16-bit fixed point */
};
/*
 * Turn a raw pg (full precision ps) into a placement seed.  The pool id
 * is mixed into the value so that different pools do not end up using
 * the same seeds.
 */
ps_t pg_pool_t::raw_pg_to_pps(pg_t pg) const
{
  if (!(flags & FLAG_HASHPSPOOL)) {
    // Legacy behavior: simply add ps and pool.  Not a great idea, since
    // the PGs of every pool then pile up on top of each other:
    // 0.5 == 1.4 == 2.3 == ...
    return ceph_stable_mod(pg.ps(), pgp_num, pgp_num_mask) + pg.pool();
  }

  // Hash the pool id together with the ps so that the PGs of different
  // pools do not overlap.
  return crush_hash32_2(CRUSH_HASH_RJENKINS1,
                        ceph_stable_mod(pg.ps(), pgp_num, pgp_num_mask),
                        pg.pool());
}
//将PG映射到一组OSDS
int OSDMap::_pg_to_osds(const pg_pool_t& pool, pg_t pg,
vector<int> *osds, int *primary,
ps_t *ppps) const
{
// map to osds[]
ps_t pps = pool.raw_pg_to_pps(pg); // placement ps
//获取pool的replicated size
unsigned size = pool.get_size();
// what crush rule? 获取pool使用crush rule
int ruleno = crush->find_rule(pool.get_crush_ruleset(), pool.get_type(), size);
if (ruleno >= 0)
crush->do_rule(ruleno, pps, *osds, size, osd_weight);
//删除不存在的osd
_remove_nonexistent_osds(pool, *osds);
*primary = -1;
//选取primary osd(第一个作为primary osd)
for (unsigned i = 0; i < osds->size(); ++i) {
if ((*osds)[i] != CRUSH_ITEM_NONE) {
*primary = (*osds)[i];
break;
}
}
if (ppps)
*ppps = pps;
return osds->size();
}
void do_rule(int rule, int x, vector<int>& out, int maxout,
¦ ¦ ¦ ¦const vector<__u32>& weight) const {
¦ Mutex::Locker l(mapper_lock);
¦ int rawout[maxout];
¦ int scratch[maxout * 3];
//开始crush过程:
//crush: crush map; rule:ruleset;x:placement seed; maxout:副本数;rawout:存放结果的数据
¦ int numrep = crush_do_rule(crush, rule, x, rawout, maxout, &weight[0], weight.size(), scratch);
¦ if (numrep < 0)
¦ ¦ numrep = 0;
¦ out.resize(numrep);
¦ for (int i=0; i
/**
 * crush_do_rule - calculate a mapping with the given input and rule
 * @map: the crush_map (buckets, rules and tunables)
 * @ruleno: the rule id (index into @map->rules)
 * @x: hash input (the placement seed)
 * @result: pointer to result vector; receives the selected items
 * @result_max: maximum result size (number of items wanted)
 * @weight: weight vector (for map leaves)
 * @weight_max: size of weight vector
 * @scratch: scratch vector for private use; must be >= 3 * result_max
 *
 * Returns the number of items stored in @result, or 0 when @ruleno is
 * out of range.
 */
int crush_do_rule(const struct crush_map *map,
                  int ruleno, int x, int *result, int result_max,
                  const __u32 *weight, int weight_max,
                  int *scratch)
{
    int result_len;
    /* scratch is split into three regions of result_max ints each */
    int *a = scratch;
    int *b = scratch + result_max;
    int *c = scratch + result_max*2;
    int recurse_to_leaf;
    int *w;
    int wsize = 0;
    int *o;
    int osize;
    int *tmp;
    struct crush_rule *rule;
    __u32 step;
    int i, j;
    int numrep;
    int out_size;
    /*
     * the original choose_total_tries value was off by one (it
     * counted "retries" and not "tries").  add one.
     */
    int choose_tries = map->choose_total_tries + 1;
    int choose_leaf_tries = 0;
    /*
     * the local tries values were counted as "retries", though,
     * and need no adjustment
     */
    int choose_local_retries = map->choose_local_tries;
    int choose_local_fallback_retries = map->choose_local_fallback_tries;
    int vary_r = map->chooseleaf_vary_r;

    if ((__u32)ruleno >= map->max_rules) {
        dprintk(" bad ruleno %d\n", ruleno);
        return 0;
    }

    /* the rule to execute */
    rule = map->rules[ruleno];
    result_len = 0;
    /* w is the working (input) set of the current step, o its output set */
    w = a;
    o = b;

    for (step = 0; step < rule->len; step++) {
        int firstn = 0;
        struct crush_rule_step *curstep = &rule->steps[step];

        switch (curstep->op) {
        case CRUSH_RULE_TAKE:
            /*
             * the start item is either a device (id >= 0) or an
             * existing bucket (negative id, stored at -1-id)
             */
            if ((curstep->arg1 >= 0 &&
                 curstep->arg1 < map->max_devices) ||
                (-1-curstep->arg1 >= 0 &&
                 -1-curstep->arg1 < map->max_buckets &&
                 map->buckets[-1-curstep->arg1])) {
                w[0] = curstep->arg1;
                wsize = 1;
            } else {
                dprintk(" bad take value %d\n", curstep->arg1);
            }
            break;

        case CRUSH_RULE_SET_CHOOSE_TRIES:
            if (curstep->arg1 > 0)
                choose_tries = curstep->arg1;
            break;

        case CRUSH_RULE_SET_CHOOSELEAF_TRIES:
            if (curstep->arg1 > 0)
                choose_leaf_tries = curstep->arg1;
            break;

        case CRUSH_RULE_SET_CHOOSE_LOCAL_TRIES:
            if (curstep->arg1 >= 0)
                choose_local_retries = curstep->arg1;
            break;

        case CRUSH_RULE_SET_CHOOSE_LOCAL_FALLBACK_TRIES:
            if (curstep->arg1 >= 0)
                choose_local_fallback_retries = curstep->arg1;
            break;

        case CRUSH_RULE_SET_CHOOSELEAF_VARY_R:
            if (curstep->arg1 >= 0)
                vary_r = curstep->arg1;
            break;

        case CRUSH_RULE_CHOOSELEAF_FIRSTN:
        case CRUSH_RULE_CHOOSE_FIRSTN:
            firstn = 1;
            /* fall through */
        case CRUSH_RULE_CHOOSELEAF_INDEP:
        case CRUSH_RULE_CHOOSE_INDEP:
            if (wsize == 0)
                break;

            /*
             * the chooseleaf ops additionally descend to a leaf
             * (device) below every item of the requested type
             */
            recurse_to_leaf =
                curstep->op ==
                 CRUSH_RULE_CHOOSELEAF_FIRSTN ||
                curstep->op ==
                 CRUSH_RULE_CHOOSELEAF_INDEP;

            /* reset output */
            osize = 0;

            for (i = 0; i < wsize; i++) {
                int bno;
                /*
                 * see CRUSH_N, CRUSH_N_MINUS macros.
                 * basically, numrep <= 0 means relative to
                 * the provided result_max
                 */
                /*
                 * number of items this step selects: a positive
                 * arg1 is used as is, otherwise result_max is
                 * added; if the sum is still <= 0 this input
                 * item is skipped
                 */
                numrep = curstep->arg1;
                if (numrep <= 0) {
                    numrep += result_max;
                    if (numrep <= 0)
                        continue;
                }
                j = 0;
                /* make sure bucket id is valid */
                bno = -1 - w[i];
                if (bno < 0 || bno >= map->max_buckets) {
                    /* w[i] is probably CRUSH_ITEM_NONE */
                    dprintk("  bad w[i] %d\n", w[i]);
                    continue;
                }
                if (firstn) {
                    /*
                     * number of tries handed to the recursive
                     * chooseleaf descent of
                     * crush_choose_firstn()
                     */
                    int recurse_tries;
                    if (choose_leaf_tries)
                        recurse_tries =
                            choose_leaf_tries;
                    else if (map->chooseleaf_descend_once)
                        recurse_tries = 1;
                    else
                        recurse_tries = choose_tries;
                    /* select numrep items below this bucket */
                    osize += crush_choose_firstn(
                        map,
                        map->buckets[bno],
                        weight, weight_max,
                        x, numrep,
                        curstep->arg2,
                        o+osize, j,
                        result_max-osize,
                        choose_tries,
                        recurse_tries,
                        choose_local_retries,
                        choose_local_fallback_retries,
                        recurse_to_leaf,
                        vary_r,
                        c+osize,
                        0);
                } else {
                    out_size = ((numrep < (result_max-osize)) ?
                                numrep : (result_max-osize));
                    crush_choose_indep(
                        map,
                        map->buckets[bno],
                        weight, weight_max,
                        x, out_size, numrep,
                        curstep->arg2,
                        o+osize, j,
                        choose_tries,
                        choose_leaf_tries ?
                           choose_leaf_tries : 1,
                        recurse_to_leaf,
                        c+osize,
                        0);
                    osize += out_size;
                }
            }

            /*
             * for chooseleaf the leaves were collected in c; they
             * replace the intermediate items in o
             */
            if (recurse_to_leaf)
                /* copy final _leaf_ values to output set */
                memcpy(o, c, osize*sizeof(*o));

            /* swap o and w arrays: this step's output is the next step's input */
            tmp = o;
            o = w;
            w = tmp;
            wsize = osize;
            break;

        /* emit: append the working set to the result vector */
        case CRUSH_RULE_EMIT:
            for (i = 0; i < wsize && result_len < result_max; i++) {
                result[result_len] = w[i];
                result_len++;
            }
            wsize = 0;
            break;

        default:
            dprintk(" unknown op %d at step %d\n",
                    curstep->op, step);
            break;
        }
    }
    return result_len;
}
就上文crush map实例中的rule规则结合代码实现过程,可以知道:首先第一步take,从default开始选择,其id为-1;然后进入第二步chooseleaf_firstn,相应地调用crush_choose_firstn函数,在default之下继续,该步选择bucket的类型为host,选择的item数为0(如果是0,则选择副本数个item;如果大于0,则选择指定个数的item;小于0则与副本数求和,其和作为item的个数,如果和小于等于0则跳过该次选择),并且recurse_to_leaf会被置为true,表示会递归地选择到osd device为止;最后一步emit是rule的结束标志,将最终的结果保存到result vector中。 与chooseleaf_firstn非常相似的是choose_firstn,该step只会选择指定个数、指定类型的bucket/device。
注:scratch参数被分成三部分(每部分result_max个元素,即以副本数等分)用于不同的逻辑:前两部分(a、b)交替作为每个step的输入集合w和输出集合o,take的结果先放入w,每个choose step结束后o与w互换;第三部分(c)作为out2传给crush_choose_firstn,用于存放chooseleaf递归选出的leaf(osd),step结束时再拷贝回o。
/**
 * crush_choose_firstn - choose numrep distinct items of given type
 * @map: the crush_map
 * @bucket: the bucket we are choose an item from
 * @weight: weight vector, handed to is_out() for device items
 * @weight_max: size of @weight
 * @x: crush input value
 * @numrep: the number of items to choose
 * @type: the type of item to choose
 * @out: pointer to output vector
 * @outpos: our position in that vector
 * @out_size: size of the out vector
 * @tries: number of attempts to make
 * @recurse_tries: number of attempts to have recursive chooseleaf make
 * @local_retries: localized retries
 * @local_fallback_retries: localized fallback retries
 * @recurse_to_leaf: true if we want one device under each item of given type (chooseleaf instead of choose)
 * @vary_r: pass r to recursive calls
 * @out2: second output vector for leaf items (if @recurse_to_leaf)
 * @parent_r: r value passed from the parent
 *
 * Returns the updated @outpos, i.e. the position after the last item
 * stored in @out.
 */
static int crush_choose_firstn(const struct crush_map *map,
                               struct crush_bucket *bucket,
                               const __u32 *weight, int weight_max,
                               int x, int numrep, int type,
                               int *out, int outpos,
                               int out_size,
                               unsigned int tries,
                               unsigned int recurse_tries,
                               unsigned int local_retries,
                               unsigned int local_fallback_retries,
                               int recurse_to_leaf,
                               unsigned int vary_r,
                               int *out2,
                               int parent_r)
{
    int rep;
    unsigned int ftotal, flocal;
    int retry_descent, retry_bucket, skip_rep;
    struct crush_bucket *in = bucket;
    int r;
    int i;
    int item = 0;
    int itemtype;
    int collide, reject;
    int count = out_size;

    dprintk("CHOOSE%s bucket %d x %d outpos %d numrep %d tries %d recurse_tries %d local_retries %d local_fallback_retries %d parent_r %d\n",
            recurse_to_leaf ? "_LEAF" : "",
            bucket->id, x, outpos, numrep,
            tries, recurse_tries, local_retries, local_fallback_retries,
            parent_r);

    /* one iteration per replica, while there is room left in out */
    for (rep = outpos; rep < numrep && count > 0 ; rep++) {
        /* keep trying until we get a non-out, non-colliding item */
        ftotal = 0;
        skip_rep = 0;
        do {
            retry_descent = 0;
            in = bucket;              /* initial bucket */

            /* choose through intervening buckets */
            flocal = 0;
            do {
                collide = 0;
                retry_bucket = 0;
                r = rep + parent_r;
                /* r' = r + f_total */
                r += ftotal;

                /* bucket choose */
                if (in->size == 0) {
                    reject = 1;
                    goto reject;
                }
                if (local_fallback_retries > 0 &&
                    flocal >= (in->size>>1) &&
                    flocal > local_fallback_retries)
                    item = bucket_perm_choose(in, x, r);
                else
                    /* pick one item out of bucket "in" */
                    item = crush_bucket_choose(in, x, r);
                /* make sure the chosen item is a valid id */
                if (item >= map->max_devices) {
                    dprintk("   bad item %d\n", item);
                    skip_rep = 1;
                    break;
                }

                /* desired type? */
                if (item < 0)
                    /* a bucket: look up its type */
                    itemtype = map->buckets[-1-item]->type;
                else
                    itemtype = 0;
                dprintk("  item %d type %d\n", item, itemtype);

                /* keep going? */
                /*
                 * wrong type: a device (or an out-of-range
                 * bucket id) cannot be descended into, so
                 * this rep is given up; otherwise descend
                 * into the chosen bucket and choose again
                 */
                if (itemtype != type) {
                    if (item >= 0 ||
                        (-1-item) >= map->max_buckets) {
                        dprintk("   bad item type %d\n", type);
                        skip_rep = 1;
                        break;
                    }
                    in = map->buckets[-1-item];
                    retry_bucket = 1;
                    continue;
                }

                /* collision? (same item already chosen earlier) */
                for (i = 0; i < outpos; i++) {
                    if (out[i] == item) {
                        collide = 1;
                        break;
                    }
                }

                reject = 0;
                /*
                 * for chooseleaf, a non-colliding item must
                 * also yield a leaf below it
                 */
                if (!collide && recurse_to_leaf) {
                    /* a bucket: recurse to find one leaf in it */
                    if (item < 0) {
                        int sub_r;
                        if (vary_r)
                            sub_r = r >> (vary_r-1);
                        else
                            sub_r = 0;
                        if (crush_choose_firstn(map,
                                 map->buckets[-1-item],
                                 weight, weight_max,
                                 x, outpos+1, 0,
                                 out2, outpos, count,
                                 recurse_tries, 0,
                                 local_retries,
                                 local_fallback_retries,
                                 0,
                                 vary_r,
                                 NULL,
                                 sub_r) <= outpos)
                            /* didn't get leaf */
                            reject = 1;
                    } else {
                        /* we already have a leaf! */
                        out2[outpos] = item;
                    }
                }

                if (!reject) {
                    /* out? */
                    if (itemtype == 0)
                        /* a device: reject it when is_out() says so */
                        reject = is_out(map, weight,
                                        weight_max,
                                        item, x);
                    else
                        reject = 0;
                }

reject:
                /* no usable item this time: decide how to retry */
                if (reject || collide) {
                    ftotal++;
                    flocal++;

                    if (collide && flocal <= local_retries)
                        /* retry locally a few times */
                        retry_bucket = 1;
                    else if (local_fallback_retries > 0 &&
                             flocal <= in->size + local_fallback_retries)
                        /* exhaustive bucket search */
                        retry_bucket = 1;
                    else if (ftotal < tries)
                        /* then retry descent */
                        retry_descent = 1;
                    else
                        /* else give up */
                        skip_rep = 1;
                    dprintk("  reject %d  collide %d  "
                            "ftotal %u  flocal %u\n",
                            reject, collide, ftotal,
                            flocal);
                }
            } while (retry_bucket); /* choose again within "in" */
        } while (retry_descent);

        if (skip_rep) {
            dprintk("skip rep\n");
            continue;
        }

        dprintk("CHOOSE got %d\n", item);
        out[outpos] = item;
        outpos++;
        count--;

        /* optional statistics: count how many failures this item needed */
        if (map->choose_tries && ftotal <= map->choose_total_tries)
            map->choose_tries[ftotal]++;
    }

    dprintk("CHOOSE returns %d\n", outpos);
    return outpos;
}
该函数简单的说就是调用crush_bucket_choose(…)函数从指定的bucket中选择合适的item,放入到out vector中,如果选中的item的类型不是期望的类型,且不是device,则基于当前的bucket继续调用crush_bucket_choose(…);如果当前的step是chooseleaf_firstn,则递归调用crush_choose_firstn(…),递归调用选中的osd将临时存放到out2 vector中,跳出递归后再复制给out。
注:recurse_tries参数表示递归尝试choose leaf的次数;在chooseleaf递归调用crush_choose_firstn(…)时,它被作为tries参数传入,用于限制在选中的bucket之下选择leaf的重试次数。
/*
 * Choose one item from bucket @in for input @x and replica rank @r by
 * dispatching to the chooser that matches the bucket's algorithm.  For an
 * unknown algorithm the first item of the bucket is returned.
 */
static int crush_bucket_choose(struct crush_bucket *in, int x, int r)
{
    int picked;

    dprintk(" crush_bucket_choose %d x=%d r=%d\n", in->id, x, r);
    BUG_ON(in->size == 0);

    switch (in->alg) {
    case CRUSH_BUCKET_UNIFORM:
        picked = bucket_uniform_choose((struct crush_bucket_uniform *)in,
                                       x, r);
        break;
    case CRUSH_BUCKET_LIST:
        picked = bucket_list_choose((struct crush_bucket_list *)in,
                                    x, r);
        break;
    case CRUSH_BUCKET_TREE:
        picked = bucket_tree_choose((struct crush_bucket_tree *)in,
                                    x, r);
        break;
    case CRUSH_BUCKET_STRAW:
        picked = bucket_straw_choose((struct crush_bucket_straw *)in,
                                     x, r);
        break;
    case CRUSH_BUCKET_STRAW2:
        picked = bucket_straw2_choose((struct crush_bucket_straw2 *)in,
                                      x, r);
        break;
    default:
        dprintk("unknown bucket %d alg %d\n", in->id, in->alg);
        picked = in->items[0];
        break;
    }

    return picked;
}
/*
 * straw
 *
 * Choose one item from a straw bucket for input @x and replica rank @r.
 * Every item draws a hash of (x, item id, r), reduced to its low 16 bits
 * and multiplied by the item's straw value; the item with the largest
 * product wins.  On equal products the earlier item is kept.
 */
static int bucket_straw_choose(struct crush_bucket_straw *bucket,
                               int x, int r)
{
    __u32 i;
    int high = 0;
    __u64 high_draw = 0;
    __u64 draw;

    for (i = 0; i < bucket->h.size; i++) {
        draw = crush_hash32_3(bucket->h.hash, x, bucket->h.items[i], r);
        draw &= 0xffff;
        draw *= bucket->straws[i];
        if (i == 0 || draw > high_draw) {
            high = i;
            high_draw = draw;
        }
    }
    return bucket->h.items[high];
}
ceph crush中提供了多种bucket algorithm,不同的bucket algorithm适应不同的场景(这里不做介绍)其中bucket algorithm有:UNIFORM、LIST、TREE、STRAW、STRAW2;上文以STRAW为例,基本过程是求出item的哈希值,然后与相应的straw相乘,乘积最大的item会被选中。
注:item的straw值,是在crush map初始化的时候就完成的,这里就不介绍了,item的hash值的计算也不在该文中介绍。(后续单独分析)
Bucket algorithm 复杂度:
A bucket is a named container of other items (either devices or
other buckets). Items within a bucket are chosen using one of a
few different algorithms. The table summarizes how the speed of
each option measures up against mapping stability when items are
added or removed.
Bucket Alg Speed Additions Removals
------------------------------------------------
uniform O(1) poor poor
list O(n) optimal poor
tree O(log n) good good
straw O(n) better better
straw2 O(n) optimal optimal
crush 算法实现的基本还是比较清晰,这里只是简单分析(备忘),熟悉该算法非常有益于制定满足实际场景的crush rule。