title: Troubleshooting an odd error when deploying an OSD
Symptom
When deploying an OSD with ceph-deploy, everything works if the journal partition is not specified manually and ceph-deploy is left to create it by itself; specifying the journal partition manually, however, makes the deployment fail. The two command forms are sketched just below, and the full prepare/activate output follows:
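Roughly, the two invocations look like this. The whole-disk form, where ceph-disk partitions the device and creates the journal itself, is my illustration of the "automatic" case; the second form is the one actually used in this run:

# let ceph-disk create the data and journal partitions itself -- works
ceph-deploy osd prepare wz_node86:/dev/sdq

# specify the data and journal partitions manually -- prepare succeeds, activate fails
ceph-deploy osd prepare wz_node86:/dev/sdq2:/dev/sdq1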
prepare
The prepare step completes without error:
[root@wz_node86 ceph-deploy]# ceph-deploy osd prepare wz_node86:/dev/sdq2:/dev/sdq1
[ceph_deploy.conf][DEBUG ] found configuration file at: /root/.cephdeploy.conf
[ceph_deploy.cli][INFO ] Invoked (1.5.33): /usr/bin/ceph-deploy osd prepare wz_node86:/dev/sdq2:/dev/sdq1
[ceph_deploy.cli][INFO ] ceph-deploy options:
[ceph_deploy.cli][INFO ] username : None
[ceph_deploy.cli][INFO ] disk : [('wz_node86', '/dev/sdq2', '/dev/sdq1')]
[ceph_deploy.cli][INFO ] dmcrypt : False
[ceph_deploy.cli][INFO ] verbose : False
[ceph_deploy.cli][INFO ] bluestore : None
[ceph_deploy.cli][INFO ] pushy :
[ceph_deploy.cli][INFO ] overwrite_conf : False
[ceph_deploy.cli][INFO ] subcommand : prepare
[ceph_deploy.cli][INFO ] dmcrypt_key_dir : /etc/ceph/dmcrypt-keys
[ceph_deploy.cli][INFO ] quiet : False
[ceph_deploy.cli][INFO ] cd_conf :
[ceph_deploy.cli][INFO ] cluster : ceph
[ceph_deploy.cli][INFO ] fs_type : xfs
[ceph_deploy.cli][INFO ] func :
[ceph_deploy.cli][INFO ] ceph_conf : None
[ceph_deploy.cli][INFO ] default_release : False
[ceph_deploy.cli][INFO ] zap_disk : False
[ceph_deploy.osd][DEBUG ] Preparing cluster ceph disks wz_node86:/dev/sdq2:/dev/sdq1
[wz_node86][DEBUG ] connected to host: wz_node86
[wz_node86][DEBUG ] detect platform information from remote host
[wz_node86][DEBUG ] detect machine type
[wz_node86][DEBUG ] find the location of an executable
[ceph_deploy.osd][INFO ] Distro info: CentOS 7.2.1511 Core
[ceph_deploy.osd][DEBUG ] Deploying osd to wz_node86
[wz_node86][DEBUG ] write cluster configuration to /etc/ceph/{cluster}.conf
[ceph_deploy.osd][DEBUG ] Preparing host wz_node86 disk /dev/sdq2 journal /dev/sdq1 activate False
[wz_node86][DEBUG ] find the location of an executable
[wz_node86][INFO ] Running command: /usr/sbin/ceph-disk -v prepare --cluster ceph --fs-type xfs -- /dev/sdq2 /dev/sdq1
[wz_node86][WARNIN] command: Running command: /usr/bin/ceph-osd --cluster=ceph --show-config-value=fsid
[wz_node86][WARNIN] command: Running command: /usr/bin/ceph-osd --check-allows-journal -i 0 --cluster ceph --setuser ceph --setgroup ceph
[wz_node86][WARNIN] command: Running command: /usr/bin/ceph-osd --check-wants-journal -i 0 --cluster ceph --setuser ceph --setgroup ceph
[wz_node86][WARNIN] command: Running command: /usr/bin/ceph-osd --check-needs-journal -i 0 --cluster ceph --setuser ceph --setgroup ceph
[wz_node86][WARNIN] get_dm_uuid: get_dm_uuid /dev/sdq2 uuid path is /sys/dev/block/65:2/dm/uuid
[wz_node86][WARNIN] command: Running command: /usr/bin/ceph-osd --cluster=ceph --show-config-value=osd_journal_size
[wz_node86][WARNIN] get_dm_uuid: get_dm_uuid /dev/sdq2 uuid path is /sys/dev/block/65:2/dm/uuid
[wz_node86][WARNIN] get_dm_uuid: get_dm_uuid /dev/sdq2 uuid path is /sys/dev/block/65:2/dm/uuid
[wz_node86][WARNIN] command: Running command: /usr/bin/ceph-conf --cluster=ceph --name=osd. --lookup osd_mkfs_options_xfs
[wz_node86][WARNIN] command: Running command: /usr/bin/ceph-conf --cluster=ceph --name=osd. --lookup osd_mount_options_xfs
[wz_node86][WARNIN] get_dm_uuid: get_dm_uuid /dev/sdq1 uuid path is /sys/dev/block/65:1/dm/uuid
[wz_node86][WARNIN] prepare_device: Journal /dev/sdq1 is a partition
[wz_node86][WARNIN] get_dm_uuid: get_dm_uuid /dev/sdq1 uuid path is /sys/dev/block/65:1/dm/uuid
[wz_node86][WARNIN] prepare_device: OSD will not be hot-swappable if journal is not the same device as the osd data
[wz_node86][WARNIN] command: Running command: /usr/sbin/blkid -o udev -p /dev/sdq1
[wz_node86][WARNIN] prepare_device: Journal /dev/sdq1 was not prepared with ceph-disk. Symlinking directly.
[wz_node86][WARNIN] get_dm_uuid: get_dm_uuid /dev/sdq2 uuid path is /sys/dev/block/65:2/dm/uuid
[wz_node86][WARNIN] set_data_partition: OSD data device /dev/sdq2 is a partition
[wz_node86][WARNIN] get_dm_uuid: get_dm_uuid /dev/sdq2 uuid path is /sys/dev/block/65:2/dm/uuid
[wz_node86][WARNIN] command: Running command: /usr/sbin/blkid -o udev -p /dev/sdq2
[wz_node86][WARNIN] set_data_partition: incorrect partition UUID: 0fc63daf-8483-4772-8e79-3d69d8477de4, expected ['4fbd7e29-9d25-41b8-afd0-5ec00ceff05d', '4fbd7e29-9d25-41b8-afd0-062c0ceff05d', '4fbd7e29-8ae0-4982-bf9d-5a8d867af560', '4fbd7e29-9d25-41b8-afd0-35865ceff05d']
[wz_node86][WARNIN] populate_data_path_device: Creating xfs fs on /dev/sdq2
[wz_node86][WARNIN] command_check_call: Running command: /usr/sbin/mkfs -t xfs -f -i size=2048 -f -- /dev/sdq2
[wz_node86][DEBUG ] meta-data=/dev/sdq2 isize=2048 agcount=6, agsize=268435455 blks
[wz_node86][DEBUG ] = sectsz=4096 attr=2, projid32bit=1
[wz_node86][DEBUG ] = crc=0 finobt=0
[wz_node86][DEBUG ] data = bsize=4096 blocks=1463819665, imaxpct=5
[wz_node86][DEBUG ] = sunit=0 swidth=0 blks
[wz_node86][DEBUG ] naming =version 2 bsize=4096 ascii-ci=0 ftype=0
[wz_node86][DEBUG ] log =internal log bsize=4096 blocks=521728, version=2
[wz_node86][DEBUG ] = sectsz=4096 sunit=1 blks, lazy-count=1
[wz_node86][DEBUG ] realtime =none extsz=4096 blocks=0, rtextents=0
[wz_node86][WARNIN] mount: Mounting /dev/sdq2 on /var/lib/ceph/tmp/mnt.qlgXjP with options rw,noatime,inode64
[wz_node86][WARNIN] command_check_call: Running command: /usr/bin/mount -t xfs -o rw,noatime,inode64 -- /dev/sdq2 /var/lib/ceph/tmp/mnt.qlgXjP
[wz_node86][WARNIN] command: Running command: /usr/sbin/restorecon /var/lib/ceph/tmp/mnt.qlgXjP
[wz_node86][WARNIN] populate_data_path: Preparing osd data dir /var/lib/ceph/tmp/mnt.qlgXjP
[wz_node86][WARNIN] command: Running command: /usr/sbin/restorecon -R /var/lib/ceph/tmp/mnt.qlgXjP/ceph_fsid.671356.tmp
[wz_node86][WARNIN] command: Running command: /usr/bin/chown -R ceph:ceph /var/lib/ceph/tmp/mnt.qlgXjP/ceph_fsid.671356.tmp
[wz_node86][WARNIN] command: Running command: /usr/sbin/restorecon -R /var/lib/ceph/tmp/mnt.qlgXjP/fsid.671356.tmp
[wz_node86][WARNIN] command: Running command: /usr/bin/chown -R ceph:ceph /var/lib/ceph/tmp/mnt.qlgXjP/fsid.671356.tmp
[wz_node86][WARNIN] command: Running command: /usr/sbin/restorecon -R /var/lib/ceph/tmp/mnt.qlgXjP/magic.671356.tmp
[wz_node86][WARNIN] command: Running command: /usr/bin/chown -R ceph:ceph /var/lib/ceph/tmp/mnt.qlgXjP/magic.671356.tmp
[wz_node86][WARNIN] command: Running command: /usr/sbin/restorecon -R /var/lib/ceph/tmp/mnt.qlgXjP/journal_uuid.671356.tmp
[wz_node86][WARNIN] command: Running command: /usr/bin/chown -R ceph:ceph /var/lib/ceph/tmp/mnt.qlgXjP/journal_uuid.671356.tmp
[wz_node86][WARNIN] adjust_symlink: Creating symlink /var/lib/ceph/tmp/mnt.qlgXjP/journal -> /dev/sdq1
[wz_node86][WARNIN] command: Running command: /usr/sbin/restorecon -R /var/lib/ceph/tmp/mnt.qlgXjP
[wz_node86][WARNIN] command: Running command: /usr/bin/chown -R ceph:ceph /var/lib/ceph/tmp/mnt.qlgXjP
[wz_node86][WARNIN] unmount: Unmounting /var/lib/ceph/tmp/mnt.qlgXjP
[wz_node86][WARNIN] command_check_call: Running command: /bin/umount -- /var/lib/ceph/tmp/mnt.qlgXjP
[wz_node86][WARNIN] get_dm_uuid: get_dm_uuid /dev/sdq2 uuid path is /sys/dev/block/65:2/dm/uuid
[wz_node86][INFO ] checking OSD status...
[wz_node86][DEBUG ] find the location of an executable
[wz_node86][INFO ] Running command: /bin/ceph --cluster=ceph osd stat --format=json
[wz_node86][WARNIN] there is 1 OSD down
[wz_node86][WARNIN] there is 1 OSD out
[ceph_deploy.osd][DEBUG ] Host wz_node86 is now ready for osd use.
activate
The activate step then fails as follows:
[root@wz_node86 ceph-deploy]# ceph-deploy osd activate wz_node86:/dev/sdq2:/dev/sdq1
[ceph_deploy.conf][DEBUG ] found configuration file at: /root/.cephdeploy.conf
[ceph_deploy.cli][INFO ] Invoked (1.5.33): /usr/bin/ceph-deploy osd activate wz_node86:/dev/sdq2:/dev/sdq1
[ceph_deploy.cli][INFO ] ceph-deploy options:
[ceph_deploy.cli][INFO ] username : None
[ceph_deploy.cli][INFO ] verbose : False
[ceph_deploy.cli][INFO ] pushy :
[ceph_deploy.cli][INFO ] overwrite_conf : False
[ceph_deploy.cli][INFO ] subcommand : activate
[ceph_deploy.cli][INFO ] quiet : False
[ceph_deploy.cli][INFO ] cd_conf :
[ceph_deploy.cli][INFO ] cluster : ceph
[ceph_deploy.cli][INFO ] func :
[ceph_deploy.cli][INFO ] ceph_conf : None
[ceph_deploy.cli][INFO ] default_release : False
[ceph_deploy.cli][INFO ] disk : [('wz_node86', '/dev/sdq2', '/dev/sdq1')]
[ceph_deploy.osd][DEBUG ] Activating cluster ceph disks wz_node86:/dev/sdq2:/dev/sdq1
[wz_node86][DEBUG ] connected to host: wz_node86
[wz_node86][DEBUG ] detect platform information from remote host
[wz_node86][DEBUG ] detect machine type
[wz_node86][DEBUG ] find the location of an executable
[ceph_deploy.osd][INFO ] Distro info: CentOS 7.2.1511 Core
[ceph_deploy.osd][DEBUG ] activating host wz_node86 disk /dev/sdq2
[ceph_deploy.osd][DEBUG ] will use init type: systemd
[wz_node86][DEBUG ] find the location of an executable
[wz_node86][INFO ] Running command: /usr/sbin/ceph-disk -v activate --mark-init systemd --mount /dev/sdq2
[wz_node86][WARNIN] main_activate: path = /dev/sdq2
[wz_node86][WARNIN] get_dm_uuid: get_dm_uuid /dev/sdq2 uuid path is /sys/dev/block/65:2/dm/uuid
[wz_node86][WARNIN] command: Running command: /usr/sbin/blkid -o udev -p /dev/sdq2
[wz_node86][WARNIN] command: Running command: /sbin/blkid -p -s TYPE -o value -- /dev/sdq2
[wz_node86][WARNIN] command: Running command: /usr/bin/ceph-conf --cluster=ceph --name=osd. --lookup osd_mount_options_xfs
[wz_node86][WARNIN] mount: Mounting /dev/sdq2 on /var/lib/ceph/tmp/mnt.XLOFBd with options rw,noatime,inode64
[wz_node86][WARNIN] command_check_call: Running command: /usr/bin/mount -t xfs -o rw,noatime,inode64 -- /dev/sdq2 /var/lib/ceph/tmp/mnt.XLOFBd
[wz_node86][WARNIN] command: Running command: /usr/sbin/restorecon /var/lib/ceph/tmp/mnt.XLOFBd
[wz_node86][WARNIN] activate: Cluster uuid is 03069551-6d93-4600-96ad-63da5a862809
[wz_node86][WARNIN] command: Running command: /usr/bin/ceph-osd --cluster=ceph --show-config-value=fsid
[wz_node86][WARNIN] activate: Cluster name is ceph
[wz_node86][WARNIN] activate: OSD uuid is dd0b0be6-aab1-471c-9039-a240498fb696
[wz_node86][WARNIN] activate: OSD id is 0
[wz_node86][WARNIN] activate: Initializing OSD...
[wz_node86][WARNIN] command_check_call: Running command: /usr/bin/ceph --cluster ceph --name client.bootstrap-osd --keyring /var/lib/ceph/bootstrap-osd/ceph.keyring mon getmap -o /var/lib/ceph/tmp/mnt.XLOFBd/activate.monmap
[wz_node86][WARNIN] got monmap epoch 3
[wz_node86][WARNIN] command_check_call: Running command: /usr/bin/ceph-osd --cluster ceph --mkfs --mkkey -i 0 --monmap /var/lib/ceph/tmp/mnt.XLOFBd/activate.monmap --osd-data /var/lib/ceph/tmp/mnt.XLOFBd --osd-journal /var/lib/ceph/tmp/mnt.XLOFBd/journal --osd-uuid dd0b0be6-aab1-471c-9039-a240498fb696 --keyring /var/lib/ceph/tmp/mnt.XLOFBd/keyring --setuser ceph --setgroup ceph
[wz_node86][WARNIN] 2019-05-14 18:10:02.783268 7f3ac70ae800 -1 journal check: ondisk fsid 00000000-0000-0000-0000-000000000000 doesn't match expected dd0b0be6-aab1-471c-9039-a240498fb696, invalid (someone else's?) journal
[wz_node86][WARNIN] 2019-05-14 18:10:02.783695 7f3ac70ae800 -1 journal FileJournal::_open: unable to setup io_context (0) Success
[wz_node86][WARNIN] 2019-05-14 18:10:02.783747 7f3ac70ae800 -1 journal FileJournal::create : create write header error (9) Bad file descriptor
[wz_node86][WARNIN] 2019-05-14 18:10:02.783761 7f3ac70ae800 -1 journal FileJournal::create: error closing fd: (9) Bad file descriptor
[wz_node86][WARNIN] 2019-05-14 18:10:02.783813 7f3ac70ae800 -1 filestore(/var/lib/ceph/tmp/mnt.XLOFBd) mkjournal error creating journal on /var/lib/ceph/tmp/mnt.XLOFBd/journal: (9) Bad file descriptor
[wz_node86][WARNIN] 2019-05-14 18:10:02.783864 7f3ac70ae800 -1 OSD::mkfs: ObjectStore::mkfs failed with error -9
[wz_node86][WARNIN] 2019-05-14 18:10:02.783938 7f3ac70ae800 -1 ** ERROR: error creating empty object store in /var/lib/ceph/tmp/mnt.XLOFBd: (9) Bad file descriptor
[wz_node86][WARNIN] mount_activate: Failed to activate
[wz_node86][WARNIN] unmount: Unmounting /var/lib/ceph/tmp/mnt.XLOFBd
[wz_node86][WARNIN] command_check_call: Running command: /bin/umount -- /var/lib/ceph/tmp/mnt.XLOFBd
[wz_node86][WARNIN] Traceback (most recent call last):
[wz_node86][WARNIN] File "/usr/sbin/ceph-disk", line 9, in
[wz_node86][WARNIN] load_entry_point('ceph-disk==1.0.0', 'console_scripts', 'ceph-disk')()
[wz_node86][WARNIN] File "/usr/lib/python2.7/site-packages/ceph_disk/main.py", line 4970, in run
[wz_node86][WARNIN] main(sys.argv[1:])
[wz_node86][WARNIN] File "/usr/lib/python2.7/site-packages/ceph_disk/main.py", line 4921, in main
[wz_node86][WARNIN] args.func(args)
[wz_node86][WARNIN] File "/usr/lib/python2.7/site-packages/ceph_disk/main.py", line 3275, in main_activate
[wz_node86][WARNIN] reactivate=args.reactivate,
[wz_node86][WARNIN] File "/usr/lib/python2.7/site-packages/ceph_disk/main.py", line 3032, in mount_activate
[wz_node86][WARNIN] (osd_id, cluster) = activate(path, activate_key_template, init)
[wz_node86][WARNIN] File "/usr/lib/python2.7/site-packages/ceph_disk/main.py", line 3208, in activate
[wz_node86][WARNIN] keyring=keyring,
[wz_node86][WARNIN] File "/usr/lib/python2.7/site-packages/ceph_disk/main.py", line 2701, in mkfs
[wz_node86][WARNIN] '--setgroup', get_ceph_group(),
[wz_node86][WARNIN] File "/usr/lib/python2.7/site-packages/ceph_disk/main.py", line 439, in command_check_call
[wz_node86][WARNIN] return subprocess.check_call(arguments)
[wz_node86][WARNIN] File "/usr/lib64/python2.7/subprocess.py", line 542, in check_call
[wz_node86][WARNIN] raise CalledProcessError(retcode, cmd)
[wz_node86][WARNIN] subprocess.CalledProcessError: Command '['/usr/bin/ceph-osd', '--cluster', 'ceph', '--mkfs', '--mkkey', '-i', '0', '--monmap', '/var/lib/ceph/tmp/mnt.XLOFBd/activate.monmap', '--osd-data', '/var/lib/ceph/tmp/mnt.XLOFBd', '--osd-journal', '/var/lib/ceph/tmp/mnt.XLOFBd/journal', '--osd-uuid', 'dd0b0be6-aab1-471c-9039-a240498fb696', '--keyring', '/var/lib/ceph/tmp/mnt.XLOFBd/keyring', '--setuser', 'ceph', '--setgroup', 'ceph']' returned non-zero exit status 1
[wz_node86][ERROR ] RuntimeError: command returned non-zero exit status: 1
[ceph_deploy][ERROR ] RuntimeError: Failed to execute command: /usr/sbin/ceph-disk -v activate --mark-init systemd --mount /dev/sdq2
Analysis and fix
The key error messages are:
[wz_node86][WARNIN] command_check_call: Running command: /usr/bin/ceph-osd --cluster ceph --mkfs --mkkey -i 0 --monmap /var/lib/ceph/tmp/mnt.XLOFBd/activate.monmap --osd-data /var/lib/ceph/tmp/mnt.XLOFBd --osd-journal /var/lib/ceph/tmp/mnt.XLOFBd/journal --osd-uuid dd0b0be6-aab1-471c-9039-a240498fb696 --keyring /var/lib/ceph/tmp/mnt.XLOFBd/keyring --setuser ceph --setgroup ceph
[wz_node86][WARNIN] 2019-05-14 18:10:02.783268 7f3ac70ae800 -1 journal check: ondisk fsid 00000000-0000-0000-0000-000000000000 doesn't match expected dd0b0be6-aab1-471c-9039-a240498fb696, invalid (someone else's?) journal
[wz_node86][WARNIN] 2019-05-14 18:10:02.783695 7f3ac70ae800 -1 journal FileJournal::_open: unable to setup io_context (0) Success
[wz_node86][WARNIN] 2019-05-14 18:10:02.783747 7f3ac70ae800 -1 journal FileJournal::create : create write header error (9) Bad file descriptor
[wz_node86][WARNIN] 2019-05-14 18:10:02.783761 7f3ac70ae800 -1 journal FileJournal::create: error closing fd: (9) Bad file descriptor
[wz_node86][WARNIN] 2019-05-14 18:10:02.783813 7f3ac70ae800 -1 filestore(/var/lib/ceph/tmp/mnt.XLOFBd) mkjournal error creating journal on /var/lib/ceph/tmp/mnt.XLOFBd/journal: (9) Bad file descriptor
[wz_node86][WARNIN] 2019-05-14 18:10:02.783864 7f3ac70ae800 -1 OSD::mkfs: ObjectStore::mkfs failed with error -9
[wz_node86][WARNIN] 2019-05-14 18:10:02.783938 7f3ac70ae800 -1 ** ERROR: error creating empty object store in /var/lib/ceph/tmp/mnt.XLOFBd: (9) Bad file descriptor
The error log first made me suspect that the journal partition had not been wiped cleanly, so that prepare had not set up the journal correctly. But wiping the disk with ceph-disk zap and sgdisk -z did not help. Following the error messages into the source code, I found that when the OSD opens the journal partition at startup it runs the following code in FileJournal::_open:
  if (aio) {
    aio_ctx = 0;
    ret = io_setup(128, &aio_ctx);
    if (ret < 0) {
      ret = errno;
      derr << "FileJournal::_open: unable to setup io_context " << cpp_strerror(ret) << dendl;
      ret = -ret;
      goto out_fd;
    }
  }
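As an aside, whether an OSD even attempts AIO for its journal is governed by the journal_aio and journal_dio options (to my recollection both default to true in this release; that is an assumption, not something taken from the log above). They can be read back with the same --show-config-value mechanism that appears in the prepare log:

ceph-osd --cluster=ceph --show-config-value=journal_aio
ceph-osd --cluster=ceph --show-config-value=journal_dio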
As the code shows, when the journal is a separate partition the OSD opens it with Linux AIO, and the call that fails here is io_setup(). Some research pointed to the kernel limit on outstanding AIO requests: according to io_setup(2), the call returns EAGAIN when the requested number of AIO events would exceed the system-wide limit fs.aio-max-nr. (The odd "(0) Success" in the log is presumably because libaio returns the negative error code directly without setting errno, while the code above reads errno.) So the fix is to raise that kernel parameter:
echo 131072 > /proc/sys/fs/aio-max-nr
On CentOS 7.2 this parameter defaults to 65536; on CentOS 7.6 the default was raised to 1048576. Presumably applications will rely on AIO more and more for performance, so raising this kernel parameter a bit is a sensible precaution.
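To see how close a node already is to the limit, and to make the change survive a reboot, something like the following can be used (a minimal sketch; the file name under /etc/sysctl.d/ is arbitrary):

# currently allocated AIO events vs. the limit
cat /proc/sys/fs/aio-nr /proc/sys/fs/aio-max-nr

# raise the limit for the running kernel
sysctl -w fs.aio-max-nr=1048576

# persist the setting across reboots
echo "fs.aio-max-nr = 1048576" > /etc/sysctl.d/99-aio.conf
sysctl --system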
With that change in place the problem was gone and the OSD could be deployed normally.