作者: Yang Honggang
本文基于 ceph-10.2.2/Centos7.2 介绍如何使用 lttng trace ceph osd 的性能.
# yum install lttng-tools lttng-ust
// 查看 trace 结果的工具
# yum install babeltrace
// 我们只是trace user space 程序,关闭对kernel的trace
# lttng-sessiond -d --no-kernel
// 这是xtao ceph的rpm编译方式,不是官网编译方式
# ./configure --with-lttng --with-rpmcleanbuild
# cd extras/LinuxRPM
# make cephrpms
也可以强制打开 WITH_LTTNG 宏
#define WITH_LTTNG
修改集群配置文件
[osd.59]
...
osd tracing = true
osd objectstore tracing = true
rados tracing = true
启动ceph-osd
# export LD_PRELOAD=/usr/lib64/liblttng-ust-fork.so && ceph-osd -i 59 -c /etc/ceph/xtao.conf --cluster xtao
[root@xt5 ~]# lttng list -u
UST events:
-------------
...
PID: 13400 - Name: ceph-osd
pg:queue_op (loglevel: TRACE_DEBUG_LINE (13)) (type: tracepoint)
osd:do_osd_op_post (loglevel: TRACE_DEBUG_LINE (13)) (type: tracepoint)
osd:do_osd_op_pre_unknown (loglevel: TRACE_DEBUG_LINE (13)) (type: tracepoint)
osd:do_osd_op_pre_copy_from (loglevel: TRACE_DEBUG_LINE (13)) (type: tracepoint)
osd:do_osd_op_pre_copy_get (loglevel: TRACE_DEBUG_LINE (13)) (type: tracepoint)
osd:do_osd_op_pre_copy_get_classic (loglevel: TRACE_DEBUG_LINE (13)) (type: tracepoint)
osd:do_osd_op_pre_omaprmkeys (loglevel: TRACE_DEBUG_LINE (13)) (type: tracepoint)
osd:do_osd_op_pre_omapclear (loglevel: TRACE_DEBUG_LINE (13)) (type: tracepoint)
osd:do_osd_op_pre_omapsetheader (loglevel: TRACE_DEBUG_LINE (13)) (type: tracepoint)
osd:do_osd_op_pre_omapsetvals (loglevel: TRACE_DEBUG_LINE (13)) (type: tracepoint)
osd:do_osd_op_pre_omap_cmp (loglevel: TRACE_DEBUG_LINE (13)) (type: tracepoint)
osd:do_osd_op_pre_omapgetvalsbykeys (loglevel: TRACE_DEBUG_LINE (13)) (type: tracepoint)
osd:do_osd_op_pre_omapgetheader (loglevel: TRACE_DEBUG_LINE (13)) (type: tracepoint)
osd:do_osd_op_pre_omapgetvals (loglevel: TRACE_DEBUG_LINE (13)) (type: tracepoint)
osd:do_osd_op_pre_omapgetkeys (loglevel: TRACE_DEBUG_LINE (13)) (type: tracepoint)
osd:do_osd_op_pre_tmap2omap (loglevel: TRACE_DEBUG_LINE (13)) (type: tracepoint)
osd:do_osd_op_pre_tmapup (loglevel: TRACE_DEBUG_LINE (13)) (type: tracepoint)
osd:do_osd_op_pre_tmapput (loglevel: TRACE_DEBUG_LINE (13)) (type: tracepoint)
osd:do_osd_op_pre_tmapget (loglevel: TRACE_DEBUG_LINE (13)) (type: tracepoint)
osd:do_osd_op_pre_startsync (loglevel: TRACE_DEBUG_LINE (13)) (type: tracepoint)
osd:do_osd_op_pre_append (loglevel: TRACE_DEBUG_LINE (13)) (type: tracepoint)
osd:do_osd_op_pre_rmxattr (loglevel: TRACE_DEBUG_LINE (13)) (type: tracepoint)
....
[root@xt5 ~]# lttng create osd.59.s1
Session osd.59.s1 created.
Traces will be written in /root/lttng-traces/osd.59.s1-20171109-103424
// 使能 osd/pg/objectstore的 tracepoints
[root@xt5 ~]# lttng enable-event -u objectstore:*
UST event objectstore:* created in channel channel0
[root@xt5 ~]# lttng enable-event -u osd:*
UST event osd:* created in channel channel0
[root@xt5 ~]# lttng enable-event -u pg:*
UST event pg:* created in channel channel0
[root@xt5 ~]# lttng start
Tracing started for session osd.59.s1
// rgw 写操作
[root@xt8 cos]# sh cli.sh submit conf/write/64m.conf
[root@xt5 ~]# lttng stop
Waiting for data availability.
Tracing stopped for session osd.59.s1
查看生成的文件
[root@xt5 ~]# find lttng-traces/
lttng-traces/
lttng-traces/osd.59.s1-20171109-103424
lttng-traces/osd.59.s1-20171109-103424/ust
lttng-traces/osd.59.s1-20171109-103424/ust/uid
lttng-traces/osd.59.s1-20171109-103424/ust/uid/0
lttng-traces/osd.59.s1-20171109-103424/ust/uid/0/64-bit
lttng-traces/osd.59.s1-20171109-103424/ust/uid/0/64-bit/channel0_0
lttng-traces/osd.59.s1-20171109-103424/ust/uid/0/64-bit/index
lttng-traces/osd.59.s1-20171109-103424/ust/uid/0/64-bit/index/channel0_0.idx
lttng-traces/osd.59.s1-20171109-103424/ust/uid/0/64-bit/index/channel0_1.idx
lttng-traces/osd.59.s1-20171109-103424/ust/uid/0/64-bit/index/channel0_2.idx
lttng-traces/osd.59.s1-20171109-103424/ust/uid/0/64-bit/index/channel0_3.idx
lttng-traces/osd.59.s1-20171109-103424/ust/uid/0/64-bit/index/channel0_4.idx
lttng-traces/osd.59.s1-20171109-103424/ust/uid/0/64-bit/index/channel0_5.idx
lttng-traces/osd.59.s1-20171109-103424/ust/uid/0/64-bit/index/channel0_6.idx
lttng-traces/osd.59.s1-20171109-103424/ust/uid/0/64-bit/index/channel0_7.idx
lttng-traces/osd.59.s1-20171109-103424/ust/uid/0/64-bit/index/channel0_8.idx
lttng-traces/osd.59.s1-20171109-103424/ust/uid/0/64-bit/index/channel0_9.idx
lttng-traces/osd.59.s1-20171109-103424/ust/uid/0/64-bit/index/channel0_10.idx
lttng-traces/osd.59.s1-20171109-103424/ust/uid/0/64-bit/index/channel0_11.idx
lttng-traces/osd.59.s1-20171109-103424/ust/uid/0/64-bit/index/channel0_12.idx
lttng-traces/osd.59.s1-20171109-103424/ust/uid/0/64-bit/index/channel0_13.idx
lttng-traces/osd.59.s1-20171109-103424/ust/uid/0/64-bit/index/channel0_14.idx
lttng-traces/osd.59.s1-20171109-103424/ust/uid/0/64-bit/index/channel0_15.idx
lttng-traces/osd.59.s1-20171109-103424/ust/uid/0/64-bit/channel0_1
...
// 不会删除生成的trace数据
# lttng destroy
Session osd.59.s1 destroyed
// 或者 # lttng view > result.all
[root@xt5 ~]# babeltrace lttng-traces > result.all
[root@xt5 ~]# du -sh result.all
35M result.all
$ sudo rm /usr/bin/python -rf
$ sudo ln -s /usr/bin/python3 /usr/bin/python
$ python -V
如果 yum 因此不能使用了,需要修改 yum 脚本的解释器:
// #!/usr/bin/python 改为 #!/usr/bin/python2.7
$ sudo vim /usr/bin/yum
Note: 对于 Ubuntu 和 Debian,可以直接安装 python3-babeltrace
$ wget http://www.efficios.com/files/babeltrace/babeltrace-1.5.3.tar.bz2
$ tar -jxf babeltrace-1.5.3.tar.bz2
$ cd babeltrace-1.5.3/
$ ./configure --enable-python-bindings
$ make -j4
$ sudo make install
python 包安装到了 /usr/local/lib64/python3.4/site-packages/babeltrace
export PYTHONPATH=/usr/local/lib64/python3.4/site-packages/babeltrace:$PYTHONPATH
库文件安装到了 /usr/local/lib 目录
$ sudo vim /etc/ld.so.conf.d/sb.conf
$ sudo ldconfig
$ cat /etc/ld.so.conf.d/sb.conf
/usr/local/lib
无需修改代码就可以增加的 trace 上下文(context)字段有
# lttng add-context -u --list
pid
procname
prio
nice
vpid
tid
pthread_id
vtid
ppid
vppid
hostname
ip
interruptible
preemptible
need_reschedule
migratable
perf:cpu:cpu-cycles
perf:cpu:cycles
...
这里以 FileStore::_do_transaction 中的 enter/exit tracepoint 为例,增加 pthread_id 上下文,用于将同一线程的 enter 事件与 exit 事件配对。
// 没有 pthread_id 时,无法对下面交错出现的 enter/exit 事件进行配对
[10:42:03.143188578] (+0.000009326) xt5 objectstore:write_enter: { cpu_id = 0 }, { osr_name = "97.15s3", offset = 1398016, length = 1376 }
[10:42:03.143478705] (+0.000009134) xt5 objectstore:write_enter: { cpu_id = 3 }, { osr_name = "97.2bcs2", offset = 0, length = 1398016 }
[10:42:03.143607910] (+0.000129205) xt5 objectstore:write_exit: { cpu_id = 10 }, { retval = 1376 }
[10:42:03.144973679] (+0.001220396) xt5 objectstore:write_exit: { cpu_id = 3 }, { retval = 1398016 }
增加该context后,再次trace
# lttng add-context -u -t pthread_id
UST context pthread_id added to all channels
[09:33:50.192905016] (+0.000008206) xt5 objectstore:write_enter: { cpu_id = 14 }, { pthread_id = 139909850027776 }, { osr_name = "97.319s2", offset = 0, length = 1398016 }
[09:33:50.193288708] (+0.000004744) xt5 objectstore:write_enter: { cpu_id = 11 }, { pthread_id = 139909858420480 }, { osr_name = "97.35es2", offset = 1398016, length = 1376 }
[09:33:50.193309999] (+0.000021291) xt5 objectstore:write_exit: { cpu_id = 11 }, { pthread_id = 139909858420480 }, { retval = 1376 }
[09:33:50.193558676] (+0.000006137) xt5 objectstore:write_enter: { cpu_id = 11 }, { pthread_id = 139909858420480 }, { osr_name = "97.68s2", offset = 1398016, length = 1376 }
[09:33:50.193576254] (+0.000017578) xt5 objectstore:write_exit: { cpu_id = 11 }, { pthread_id = 139909858420480 }, { retval = 1376 }
[09:33:50.194999952] (+0.000134932) xt5 objectstore:write_exit: { cpu_id = 14 }, { pthread_id = 139909850027776 }, { retval = 1398016 }
需要自己写脚本来处理trace的结果。下面给出一个示例:
objectstore.py
#!/usr/bin/env python
# -*- coding: utf-8 -*
# vi:set tw=0 ts=4 sw=4 nowrap fdm=indent
# python dump.py ../data/ltt.752M/osd.59.s1-20171109-103424/ust/uid/0/64-bit/
# Yang Honggang
import json
import sys
from babeltrace import *
# all events
g_events = set()
g_summary = {}
def update_summary(evt_name, cost):
    """Fold one latency sample into the running statistics for an event.

    Args:
        evt_name: Name of the operation the sample belongs to; used as the
            key in the module-level ``g_summary`` dict and recorded in
            ``g_events``.
        cost: The sample value. The caller in this script passes the exit
            timestamp minus the enter timestamp.

    Side effects:
        Creates or updates ``g_summary[evt_name]`` with the keys ``max``,
        ``min``, ``sum`` and ``count``.
    """
    g_events.add(evt_name)

    stats = g_summary.get(evt_name)
    if stats is None:
        # First sample for this event: it is simultaneously max, min and sum.
        g_summary[evt_name] = {
            'max': cost,
            'min': cost,
            'sum': cost,
            'count': 1
        }
        return

    stats['max'] = max(stats['max'], cost)
    stats['min'] = min(stats['min'], cost)
    stats['sum'] += cost
    stats['count'] += 1
def update():
    """Finalize ``g_summary``: add an ``avg`` entry and rescale the values.

    For every event in the module-level ``g_summary`` dict this computes
    ``avg`` as ``sum`` floor-divided by ``count`` (so the average is
    truncated to a whole unit before scaling), then divides ``avg``,
    ``min``, ``max`` and ``sum`` by 1e9. ``count`` is left untouched.

    NOTE(review): the 1e9 divisor presumably converts nanosecond timestamps
    to seconds -- confirm against the babeltrace ``Event.timestamp`` docs.

    The function rescales in place, so it must be called exactly once after
    all samples have been collected.
    """
    scale = float(1000000000)
    for stats in g_summary.values():
        # Floor division is intentional here: it matches the sample output
        # shown for this script (average truncated before scaling).
        stats['avg'] = float(stats['sum']) // stats['count']
        for field in ('avg', 'min', 'max', 'sum'):
            stats[field] /= scale
if __name__ == '__main__':
    # Usage: python objectstore.py <ctf-trace-dir> [event-filter]
    #
    # Pairs every objectstore:*_enter event with the objectstore:*_exit event
    # that carries the same pthread_id, prints the elapsed time of each pair,
    # and finally dumps the per-event statistics as JSON. The trace must have
    # been recorded with the pthread_id context enabled, because the pairing
    # key is built from event['pthread_id'].
    if len(sys.argv) < 2:
        sys.stderr.write('usage: %s <ctf-trace-dir> [event-filter]\n' % sys.argv[0])
        sys.exit(1)

    traces = TraceCollection()
    # add_trace() hands back None when the trace cannot be opened; stop here
    # instead of silently iterating over an empty collection.
    if traces.add_trace(sys.argv[1], "ctf") is None:
        sys.stderr.write('failed to open trace: %s\n' % sys.argv[1])
        sys.exit(1)

    # Optional substring filter on the event name; 'all' disables filtering.
    event_type = 'all'
    if len(sys.argv) == 3:
        event_type = str(sys.argv[2])

    # Pending enter events: "<enter event name><pthread_id>" -> {'ts': ...}
    events_enter = {}

    for event in traces.events:
        name = event.name

        # filter
        if event_type != 'all' and event_type not in name:
            continue
        if not name.startswith('objectstore:'):
            continue

        if name.endswith('_enter'):
            op_tag = name + str(event['pthread_id'])
            # A second enter on the same thread before the matching exit
            # would make the pairing ambiguous.
            assert (op_tag not in events_enter), ("dump events_enter_set: %s" % str(set(events_enter)))
            events_enter[op_tag] = {
                'ts': event.timestamp,
            }
        elif name.endswith('_exit'):
            op_name = name[:-5]  # strip the trailing '_exit'
            op_tag = op_name + '_enter' + str(event['pthread_id'])
            pending = events_enter.pop(op_tag, None)
            if pending is None:
                # Exit without a recorded enter (e.g. tracing started in the
                # middle of the operation, or the enter was filtered out).
                print ('drop %s' % str(event))
                continue
            cost = event.timestamp - pending['ts']
            update_summary(op_name, cost)
            print ('%s %d' % (op_name, cost))

    # update avg
    update()
    print (json.dumps(g_summary, indent=4))
// all 表示展示所有objectstore事件
// 也可以指定 rmkeys 等其他具体事件
$ python objectstore.py ../data/pid/osd.59.s2-20171110-092920/ust/uid/0/64-bit/ all
...
objectstore:getattr 147437
objectstore:omap_setkeys 102982
objectstore:coll_move_rename 10100543
{
"objectstore:omap_rmkeys": {
"max": 0.001172804,
"min": 0.000141518,
"count": 15,
"avg": 0.000486413,
"sum": 0.007296197
},
...
#!/usr/bin/env python
# -*- coding: utf-8 -*
# vi:set tw=0 ts=4 sw=4 nowrap fdm=indent
# python dump.py ../data/ltt.752M/osd.59.s1-20171109-103424/ust/uid/0/64-bit/
# Yang Honggang
import json
import sys
from babeltrace import *
# Usage: python dump.py <ctf-trace-dir>
# Prints every event of the trace as an indented JSON object with the keys
# 'ts' (event timestamp), 'name' (event name) and 'body' (all event fields).
if len(sys.argv) < 2:
    sys.stderr.write('usage: %s <ctf-trace-dir>\n' % sys.argv[0])
    sys.exit(1)

traces = TraceCollection()
# add_trace() hands back None when the trace cannot be opened.
if traces.add_trace(sys.argv[1], "ctf") is None:
    sys.stderr.write('failed to open trace: %s\n' % sys.argv[1])
    sys.exit(1)

for event in traces.events:
    body = dict(event)
    # The uuid field is stringified before serialization, as the original
    # script did; events without that field are passed through unchanged.
    if 'uuid' in body:
        body['uuid'] = str(body['uuid'])
    item = {
        'ts': event.timestamp,
        'name': event.name,
        'body': body,
    }
    print (json.dumps(item, indent=4))
上面是 dump.py 脚本,它把每个事件以 JSON 形式打印出来。运行示例:
$ python dump.py ../data/pid/osd.59.s2-20171110-092920/ust/uid/0/64-bit/ | more
{
"name": "objectstore:remove_enter",
"ts": 1510277629825390663,
"body": {
"v": {
"timestamp": 165143897895942,
"id": 89
},
"osr_name": "97.1b6s1",
"cpu_id": 6,
"content_size": 1061576,
"timestamp_end": 166030605917153,
"packet_size": 1081344,
"timestamp_begin": 164884122908956,
"pthread_id": 139909858420480,
"id": "extended",
"uuid": "[115, 96, 191, 41, 224, 55, 68, 67, 152, 213, 166, 29, 106, 229, 239, 70]",
"events_discarded": 0,
"magic": 3254525889,
"stream_id": 0,
"packet_seq_num": 0,
"stream_instance_id": 6
}
}
[1] tracing ceph with lttng https://nwat.io/blog/2014/06/01/tracing-ceph-with-lttng/
[2] tracing your own user application http://lttng.org/docs/v2.10/#doc-tracing-your-own-user-application
[3] LTTng installation on CentOS 7.2 http://frederic-wou.net/lttng/
[4] 端到端trace ceph http://victoraraujo.me/babeltrace-zipkin/
[5] Tracing Ceph With BlkKin http://docs.ceph.com/docs/master/dev/blkin/
[6] Unable to see LTTng tracepoints in Ceph http://ceph-users.ceph.narkive.com/8gg0Rt9H/unable-to-see-lttng-tracepoints-in-ceph
[7] babeltrace python bindings 的 api http://diamon.org/babeltrace/docs/python/reader