lifecycle policy的存储机制
当我们为一个bucket配置lifecycle policy时,lifecycle相关的数据会存储在2个位置:
- 在bucket.instance对象的xattr中写入key为user.rgw.lc、value为LifecycleConfig的属性(真正的lifecycle rules列表)。
- 在lc.0 - lc.31共32个对象(这个值可配置)中选择其中一个,向其omap写入lifecycle的状态信息。但其实在该omap对应的header中也有lc相关的信息,比如用于记录当前omap的lifecycle遍历进度的marker,但这些数据不是在设置lc时设置的。
下面我们验证下:
首先,我们通过boto向一个名为bucket1的bucket配置lifecycle policy:
# Look up the target bucket through the existing boto S3 connection.
bucket1 = conn.get_bucket('bucket1')

# Build a lifecycle configuration holding a single rule: keys under the
# "test/" prefix get an Expiration of 1 day.
expir = Expiration(days=1)
lc = Lifecycle()
lc.add_rule(prefix="test/", expiration=expir)

# Send the lifecycle configuration to the gateway for this bucket.
bucket1.configure_lifecycle(lc)
之后,去该bucket1对应的bucket.instance对象的xattr中查看。
$ rados -p default.rgw.meta ls --namespace root
bucket1
.bucket.meta.bucket1:38d08ed7-3883-49de-ab89-0dea7c8c960f.4162.1
$ rados -p default.rgw.meta --namespace root listxattr .bucket.meta.bucket1:38d08ed7-3883-49de-ab89-0dea7c8c960f.4162.1
ceph.objclass.version
user.rgw.acl
user.rgw.lc
user.rgw.lc对应的value就是该bucket的lifecycle rule列表。
然后,再去查看lc.xx对象
$ rados -p default.rgw.log --namespace=lc ls
lc.6
lc.14
lc.29
lc.8
lc.10
lc.26
lc.22
lc.17
lc.27
lc.4
lc.11
lc.18
lc.20
lc.7
lc.2
lc.13
lc.16
lc.12
lc.30
lc.24
lc.9
lc.15
lc.19
lc.21
lc.23
lc.31
lc.25
lc.5
lc.3
lc.28
lc.1
lc.0
RGWPutLC::execute()代码
lifecycle的组织方式也可以在put lc操作的代码中窥见一斑。
void RGWPutLC::execute()
{
bufferlist bl;
RGWLifecycleConfiguration_S3 *config = NULL;
RGWLCXMLParser_S3 parser(s->cct);
RGWLifecycleConfiguration_S3 new_config(s->cct);
// 从http header中取出md5到content_md5
content_md5 = s->info.env->get("HTTP_CONTENT_MD5");
if (content_md5 == nullptr) {
op_ret = -ERR_INVALID_REQUEST;
s->err.message = "Missing required header for this request: Content-MD5";
ldout(s->cct, 5) << s->err.message << dendl;
return;
}
// 将取出的md5从base64解码到content_md5_bin
std::string content_md5_bin;
try {
content_md5_bin = rgw::from_base64(boost::string_view(content_md5));
} catch (...) {
s->err.message = "Request header Content-MD5 contains character "
"that is not base64 encoded.";
ldout(s->cct, 5) << s->err.message << dendl;
op_ret = -ERR_BAD_DIGEST;
return;
}
if (!parser.init()) {
op_ret = -EINVAL;
return;
}
// 从req_state中解析出put lc所需的参数存入RGWPutLC.data长度为RGWPutLC.len
op_ret = get_params();
if (op_ret < 0)
return;
ldout(s->cct, 15) << "read len=" << len << " data=" << (data ? data : "") << dendl;
// 计算params的MD5
MD5 data_hash;
unsigned char data_hash_res[CEPH_CRYPTO_MD5_DIGESTSIZE];
data_hash.Update(reinterpret_cast(data), len);
data_hash.Final(data_hash_res);
// 比较计算出的md5和客户端传入的md5是否一致,以判断数据是否损坏
if (memcmp(data_hash_res, content_md5_bin.c_str(), CEPH_CRYPTO_MD5_DIGESTSIZE) != 0) {
op_ret = -ERR_BAD_DIGEST;
s->err.message = "The Content-MD5 you specified did not match what we received.";
ldout(s->cct, 5) << s->err.message
<< " Specified content md5: " << content_md5
<< ", calculated content md5: " << data_hash_res
<< dendl;
return;
}
// 将data中的参数数据解析到parser对象中
if (!parser.parse(data, len, 1)) {
op_ret = -ERR_MALFORMED_XML;
return;
}
// 解析出的xml对象是一颗树结构
/*
class XMLObj
{
XMLObj *parent;
......
multimap children;
......
}
*/
// 如上,每一个标签作为一个节点,分别包含指向其父节点的指针和孩子节点的指针列表
config = static_cast(parser.find_first("LifecycleConfiguration"));
if (!config) {
op_ret = -ERR_MALFORMED_XML;
return;
}
// 将config中的rule_map中的rule转存到new_config的rule_map和prefix_map中
op_ret = config->rebuild(store, new_config);
if (op_ret < 0)
return;
if (s->cct->_conf->subsys.should_gather(ceph_subsys_rgw, 15)) {
ldout(s->cct, 15) << "New LifecycleConfiguration:";
new_config.to_xml(*_dout);
*_dout << dendl;
}
// 将rule_map编码存入bl,并copy一个attrs map,增加RGW_ATTR_LC->bl项,
new_config.encode(bl);
map attrs;
attrs = s->bucket_attrs;
attrs[RGW_ATTR_LC] = bl;
// 将新的attrs写入bucket.instance对象的xattr中,
op_ret = rgw_bucket_set_attrs(store, s->bucket_info, attrs, &s->bucket_info.objv_tracker);
if (op_ret < 0)
return;
string shard_id = s->bucket.tenant + ':' + s->bucket.name + ':' + s->bucket.bucket_id;
string oid;
// 从default.rgw.log pool中的32个lc.xx对象中选择一个,构造oid,xx表示一个0-31的整数
// get_lc_oid代码如下:
/*
static void get_lc_oid(struct req_state *s, string& oid){
string shard_id = s->bucket.name + ':' +s->bucket.bucket_id;
int max_objs = (s->cct->_conf->rgw_lc_max_objs > HASH_PRIME)?HASH_PRIME:s->cct->_conf->rgw_lc_max_objs;
int index = ceph_str_hash_linux(shard_id.c_str(), shard_id.size()) % HASH_PRIME % max_objs;
oid = lc_oid_prefix;
char buf[32];
snprintf(buf, 32, ".%d", index);
oid.append(buf);
return;
}
*/
get_lc_oid(s, oid);
// 构造要写入omap的entry内容
pair entry(shard_id, lc_uninitial);
int max_lock_secs = s->cct->_conf->rgw_lc_lock_max_time;
rados::cls::lock::Lock l(lc_index_lock_name);
utime_t time(max_lock_secs, 0);
l.set_duration(time);
l.set_cookie(cookie);
librados::IoCtx *ctx = store->get_lc_pool_ctx();
do {
op_ret = l.lock_exclusive(ctx, oid);
if (op_ret == -EBUSY) {
dout(0) << "RGWLC::RGWPutLC() failed to acquire lock on, sleep 5, try again" << oid << dendl;
sleep(5);
continue;
}
if (op_ret < 0) {
dout(0) << "RGWLC::RGWPutLC() failed to acquire lock " << oid << op_ret << dendl;
break;
}
// 在lc.xx对象关联的omap中写入entry
op_ret = cls_rgw_lc_set_entry(*ctx, oid, entry);
if (op_ret < 0) {
dout(0) << "RGWLC::RGWPutLC() failed to set entry " << oid << op_ret << dendl;
}
break;
}while(1);
l.unlock(ctx, oid);
return;
}
lifecycle的作用机制
RGWLC类是负责执行lc的类,它会根据用户的配置开启1个或多个worker线程,这些worker线程的任务是在一个无限循环中,每隔一段时间(生产环境是一天一次,测试环境比较频繁)判断一下当前是否应该执行lifecycle的遍历工作,如果是的话,调用RGWLC类的process方法,随机选择32个lc.xx对象中的一个,根据其header中的标记,取出其未遍历的下一个omap entry,更新header中的标记,更新该entry的状态为processing,然后处理该entry,遍历该条entry对应的bucket中的所有对象,根据lc规则删除或转换bucket中过期的object,并写日志。
代码追踪如下:
下面这个函数是worker线程的执行内容,可以看到,它在一个while循环中,每隔一段时间判断should_work,如果通过的话,那么就调用lc->process()函数进行遍历,然后设置下一次被唤醒的时间,进入阻塞状态。
void *RGWLC::LCWorker::entry() {
do {
utime_t start = ceph_clock_now();
if (should_work(start)) {
dout(2) << "life cycle: start" << dendl;
int r = lc->process();
if (r < 0) {
dout(0) << "ERROR: do life cycle process() returned error r=" << r << dendl;
}
dout(2) << "life cycle: stop" << dendl;
}
if (lc->going_down())
break;
utime_t end = ceph_clock_now();
int secs = schedule_next_start_time(start, end);
utime_t next;
next.set_from_double(end + secs);
dout(5) << "schedule life cycle next start time: " << rgw_to_asctime(next) <going_down());
return NULL;
}
在RGWLC::process函数中,主要做了以下几件事:
1.从lc.xx对象的header中获得omap中要遍历的下一个entry
2.将拿到的entry的状态设为processing(正在处理)
3.更新header中记录的下一个entry
4.调用bucket_lc_process函数处理当前的entry对应的lc规则
int RGWLC::process(int index, int max_lock_secs)
{
rados::cls::lock::Lock l(lc_index_lock_name);
do {
utime_t now = ceph_clock_now();
pair entry;//string = bucket_name:bucket_id ,int = LC_BUCKET_STATUS
if (max_lock_secs <= 0)
return -EAGAIN;
utime_t time(max_lock_secs, 0);
l.set_duration(time);
int ret = l.lock_exclusive(&store->lc_pool_ctx, obj_names[index]);
if (ret == -EBUSY) { /* already locked by another lc processor */
dout(0) << "RGWLC::process() failed to acquire lock on, sleep 5, try again" << obj_names[index] << dendl;
sleep(5);
continue;
}
if (ret < 0)
return 0;
// 读取lc.xx对象的head
string marker;
cls_rgw_lc_obj_head head;
ret = cls_rgw_lc_get_head(store->lc_pool_ctx, obj_names[index], head);
if (ret < 0) {
dout(0) << "RGWLC::process() failed to get obj head " << obj_names[index] << ret << dendl;
goto exit;
}
if(!if_already_run_today(head.start_date)) {
head.start_date = now;
head.marker.clear();
ret = bucket_lc_prepare(index);
if (ret < 0) {
dout(0) << "RGWLC::process() failed to update lc object " << obj_names[index] << ret << dendl;
goto exit;
}
}
// 从lc.xx对象的header中获取下一个要遍历的omap entry
ret = cls_rgw_lc_get_next_entry(store->lc_pool_ctx, obj_names[index], head.marker, entry);
if (ret < 0) {
dout(0) << "RGWLC::process() failed to get obj entry " << obj_names[index] << dendl;
goto exit;
}
if (entry.first.empty())
goto exit;
// 将该entry的状态设为processing
entry.second = lc_processing;
ret = cls_rgw_lc_set_entry(store->lc_pool_ctx, obj_names[index], entry);
if (ret < 0) {
dout(0) << "RGWLC::process() failed to set obj entry " << obj_names[index] << entry.first << entry.second << dendl;
goto exit;
}
// 更新header中的下一个entry标记
head.marker = entry.first;
ret = cls_rgw_lc_put_head(store->lc_pool_ctx, obj_names[index], head);
if (ret < 0) {
dout(0) << "RGWLC::process() failed to put head " << obj_names[index] << dendl;
goto exit;
}
l.unlock(&store->lc_pool_ctx, obj_names[index]);
// 处理当前的entry对应的lc规则
ret = bucket_lc_process(entry.first);
bucket_lc_post(index, max_lock_secs, entry, ret);
}while(1);
exit:
l.unlock(&store->lc_pool_ctx, obj_names[index]);
return 0;
}
而bucket_lc_process函数做的则就是最终的处理工作了:遍历某条lc规则对应的bucket的所有objects,根据prefix和tagging找到lc规则作用的object,然后判断这些objects是否过期,如果过期,做对应的删除处理。
要注意的是,目前L版本的ceph仅支持到期删除的lifecycle,也就是Expiration。不支持Transition。