- builder.c
- builder.h
- crush.c
- crush.h
- CrushWrapper.cc
- CrushWrapper.h
- CrushWrapper.i
- grammar.h
- hash.c
- hash.h
- mapper.c
- mapper.h
- sample.txt
- types.h
<span style="font-size:12px;">struct crush_bucket {
	__s32 id;        /* this'll be negative */
	__u16 type;      /* non-zero; type=0 is reserved for devices */
	__u8 alg;        /* one of CRUSH_BUCKET_* */
	__u8 hash;       /* which hash function to use, CRUSH_HASH_* */
	__u32 weight;    /* 16-bit fixed point */
	__u32 size;      /* num items */
	//假如size为0,说明它不包含item。
	__s32 *items;
	//数组,它包含item的id,这些id可能都是负数,也可能都是自然数。
	//假如是负数,表示它包含的item都是bucket,假如是自然数,表示它的item都是device。

	/*
	 * cached random permutation: used for uniform bucket and for
	 * the linear search fallback for the other bucket types.
	 */
	__u32 perm_x;  /* @x for which *perm is defined */
	__u32 perm_n;  /* num elements of *perm that are permuted/defined */
	__u32 *perm;
};
</span>
构造、操作 crush_map、rule、bucket。
使用Robert Jenkins的HASH算法,地址是http://burtleburtle.net/bob/hash/evahash.html
我们的bucket有三种: root、rack、host。root包含的item是rack,root的结构是straw。rack包含的item是host,rack的结构是tree。host包括的item是device,host的结构是uniform。这是因为每个host包括的device的数量和权重是一定的,不会改变,因此要为host选择uniform结构,这样计算速度最快。
# crushtool --num_osds 24 -o crushmap --build host uniform 2 rack tree 4 root straw 0
# crushtool -d crushmap -o flat.txt
root@ceph-01:~# cat flat.txt
<span style="font-size:12px;"># begin crush map

# devices
device 0 device0
device 1 device1
device 2 device2
device 3 device3
device 4 device4
device 5 device5
device 6 device6
device 7 device7
device 8 device8
device 9 device9
device 10 device10
device 11 device11
device 12 device12
device 13 device13
device 14 device14
device 15 device15
device 16 device16
device 17 device17
device 18 device18
device 19 device19
device 20 device20
device 21 device21
device 22 device22
device 23 device23

# types
type 0 device
type 1 host
type 2 rack
type 3 root

# buckets
host host0 {
	id -1		# do not change unnecessarily
	# weight 2.000
	alg uniform	# do not change bucket size (2) unnecessarily
	hash 0	# rjenkins1
	item device0 weight 1.000 pos 0
	item device1 weight 1.000 pos 1
}
host host1 {
	id -2		# do not change unnecessarily
	# weight 2.000
	alg uniform	# do not change bucket size (2) unnecessarily
	hash 0	# rjenkins1
	item device2 weight 1.000 pos 0
	item device3 weight 1.000 pos 1
}
host host2 {
	id -3		# do not change unnecessarily
	# weight 2.000
	alg uniform	# do not change bucket size (2) unnecessarily
	hash 0	# rjenkins1
	item device4 weight 1.000 pos 0
	item device5 weight 1.000 pos 1
}
host host3 {
	id -4		# do not change unnecessarily
	# weight 2.000
	alg uniform	# do not change bucket size (2) unnecessarily
	hash 0	# rjenkins1
	item device6 weight 1.000 pos 0
	item device7 weight 1.000 pos 1
}
host host4 {
	id -5		# do not change unnecessarily
	# weight 2.000
	alg uniform	# do not change bucket size (2) unnecessarily
	hash 0	# rjenkins1
	item device8 weight 1.000 pos 0
	item device9 weight 1.000 pos 1
}
host host5 {
	id -6		# do not change unnecessarily
	# weight 2.000
	alg uniform	# do not change bucket size (2) unnecessarily
	hash 0	# rjenkins1
	item device10 weight 1.000 pos 0
	item device11 weight 1.000 pos 1
}
host host6 {
	id -7		# do not change unnecessarily
	# weight 2.000
	alg uniform	# do not change bucket size (2) unnecessarily
	hash 0	# rjenkins1
	item device12 weight 1.000 pos 0
	item device13 weight 1.000 pos 1
}
host host7 {
	id -8		# do not change unnecessarily
	# weight 2.000
	alg uniform	# do not change bucket size (2) unnecessarily
	hash 0	# rjenkins1
	item device14 weight 1.000 pos 0
	item device15 weight 1.000 pos 1
}
host host8 {
	id -9		# do not change unnecessarily
	# weight 2.000
	alg uniform	# do not change bucket size (2) unnecessarily
	hash 0	# rjenkins1
	item device16 weight 1.000 pos 0
	item device17 weight 1.000 pos 1
}
host host9 {
	id -10		# do not change unnecessarily
	# weight 2.000
	alg uniform	# do not change bucket size (2) unnecessarily
	hash 0	# rjenkins1
	item device18 weight 1.000 pos 0
	item device19 weight 1.000 pos 1
}
host host10 {
	id -11		# do not change unnecessarily
	# weight 2.000
	alg uniform	# do not change bucket size (2) unnecessarily
	hash 0	# rjenkins1
	item device20 weight 1.000 pos 0
	item device21 weight 1.000 pos 1
}
host host11 {
	id -12		# do not change unnecessarily
	# weight 2.000
	alg uniform	# do not change bucket size (2) unnecessarily
	hash 0	# rjenkins1
	item device22 weight 1.000 pos 0
	item device23 weight 1.000 pos 1
}
rack rack0 {
	id -13		# do not change unnecessarily
	# weight 8.000
	alg tree	# do not change pos for existing items unnecessarily
	hash 0	# rjenkins1
	item host0 weight 2.000 pos 0
	item host1 weight 2.000 pos 1
	item host2 weight 2.000 pos 2
	item host3 weight 2.000 pos 3
}
rack rack1 {
	id -14		# do not change unnecessarily
	# weight 8.000
	alg tree	# do not change pos for existing items unnecessarily
	hash 0	# rjenkins1
	item host4 weight 2.000 pos 0
	item host5 weight 2.000 pos 1
	item host6 weight 2.000 pos 2
	item host7 weight 2.000 pos 3
}
rack rack2 {
	id -15		# do not change unnecessarily
	# weight 8.000
	alg tree	# do not change pos for existing items unnecessarily
	hash 0	# rjenkins1
	item host8 weight 2.000 pos 0
	item host9 weight 2.000 pos 1
	item host10 weight 2.000 pos 2
	item host11 weight 2.000 pos 3
}
root root {
	id -16		# do not change unnecessarily
	# weight 24.000
	alg straw
	hash 0	# rjenkins1
	item rack0 weight 8.000
	item rack1 weight 8.000
	item rack2 weight 8.000
}

# rules
rule data {
	ruleset 1
	type replicated
	min_size 2
	max_size 2
	step take root
	step chooseleaf firstn 0 type host
	step emit
}

# end crush map
</span>
crush_map结构中的buckets成员是bucket结构指针数组,buckets成员保存了上面这些bucket结构的指针。上面这些bucket结构的指针在buckets中的下标是 [-1-id]。buckets数组的元素如下所示。
{ &host0, &host1, &host2, … , &host11, &rack0, &rack1, &rack2, &root}
pos | 0 | 1 | … | 11 | 12 | 13 | 14 | 15 |
&bucket | &host0 | &host1 | … | &host11 | &rack0 | &rack1 | &rack2 | &root |
bucket_id | -1 | -2 | … | -12 | -13 | -14 | -15 | -16 |
bucket的id使用负数是为了和device区分,因为bucket的item可以是device,也可以是bucket。比如host0的item数组中包含的元素是{0, 1},它们是device0、device1。而rack2的item数组中包含的元素是{-9, -10, -11, -12},它们是host8、host9、host10、host11。
step take root
step chooseleaf firstn 0 type host
step emit
第一个step是“step take root”,因此CRUSH会执行512~523行。由于“root”所对应的id是-16,所以w[0] = -16,wsize = 1。
然后CRUSH执行第二条 “step chooseleaf firstn 0 type host”, CRUSH会执行525~588行代码。CRUSH执行到541行时,firstn = 1, recurse_to_leaf = 1。
因为wsize = 1, 因此542行的循环只执行一次。
执行到568行时,numrep = 2, j = 0, i = 0 。
执行到569行时,会调用crush_choose函数。map->buckets[-1-w[i]] = &root。
当crush_choose第一次执行到356行时,in是&root bucket,r = 0。调用crush_bucket_choose函数。
CRUSH第二次执行365行时,in是&root bucket,r = 1。这次过程不再复述,假设CRUSH在root中选择了rack1,在rack1中选择了host6,在host6中选择了device13。
返回crush_do_rule函数,执行579~588行,则w数组中的元素是{4, 13}。
最后CRUSH会执行第三个 “step emit”,执行591~597行,把w数组复制到result数组上。
{4,13}代表device4和device13,这表明x对应的设备是{device4, device13}。
CRUSH算法完成了x到{device4, device13}的映射。