什么是死锁,简单地说死锁就是一种现象,比如说有A,B两个线程 ,mtx_a,mtx_b两个互斥量。如果线程A拥有mtx_a的同时,去竞争mtx_b,刚好线程B这时拥有mtx_b,去竞争mtx_a,这时两个线程想要竞争的资源刚好都被对方占用,且都等待对方释放资源,由此陷入了僵局,这就是线程死锁的一种现象。
平常编写代码的过程中比较容易出现的死锁主要有下面这几种,每种情况我们写一个简单的例子把问题点暴露出来,当然还有其他的情况这里就不一一举例说明了。
1. 两个或两个以上的互斥锁,上锁顺序问题
/*
两个线程,一个线程不停调用pushData,另一个线程不停调用popData。
很容易出现死锁:pushData拥有mutex1,请求mutex2,同时popData拥有mutex2,请求mutex1。
*/
// Demo 1: two mutexes acquired in OPPOSITE orders by two code paths.
// Running pushData and popData from different threads can deadlock:
// pushData holds mutex1 and waits for mutex2 while popData holds mutex2
// and waits for mutex1. The reversed order is intentional (it is the bug
// being demonstrated); the compile errors of the original are fixed.
class dataPool
{
public:
    dataPool() = default;
    // Appends data. Lock order: mutex1 -> mutex2.
    void pushData(const std::string& data)
    {
        std::unique_lock<std::mutex> lck1(mutex1);
        std::unique_lock<std::mutex> lck2(mutex2);
        dataPool_.push_back(data);
    }
    // Removes the oldest element. Lock order: mutex2 -> mutex1 (reversed!).
    void popData(const std::string& data)
    {
        std::unique_lock<std::mutex> lck2(mutex2);
        std::unique_lock<std::mutex> lck1(mutex1);
        if (!dataPool_.empty())      // popping an empty list is UB
        {
            dataPool_.pop_front();   // std::list::pop_front() takes no argument
        }
    }
private:
    std::mutex mutex1;
    std::mutex mutex2;
    std::list<std::string> dataPool_;  // template argument restored
};
2. 嵌套锁
/*
如果代码调用层次较深,不留意时很容易发生这种嵌套上锁的情况。
pushData中对mutex_上锁,下面调用findData时又请求mutex_,必现死锁。
*/
// Demo 2: nested locking of the same non-recursive mutex.
// pushData locks mutex_ and then calls findData, which tries to lock
// mutex_ again -- a guaranteed self-deadlock (std::mutex is not
// recursive). The deadlock is intentional; only the non-compiling
// template arguments and the missing class semicolon are fixed.
class dataPool
{
public:
    dataPool() = default;
    // Inserts data if the code is not already present.
    // WARNING (by design): deadlocks on the nested findData() call below.
    void pushData(int code, const std::string& data)
    {
        std::unique_lock<std::mutex> lck(mutex_);
        if (!findData(code))   // re-locks mutex_ while it is already held
        {
            dataMap_[code] = data;
        }
    }
    // Returns true when `code` already exists in the pool.
    bool findData(int code)
    {
        bool bRet = false;
        std::unique_lock<std::mutex> lck(mutex_);
        auto it = dataMap_.find(code);
        if (it != dataMap_.end())
        {
            bRet = true;
        }
        return bRet;
    }
private:
    std::mutex mutex_;
    std::map<int, std::string> dataMap_;  // template arguments restored
};
3. 忘记解锁
/*
忘记解锁也是一种情况,不过如果使用C++11中的std::lock_guard、std::unique_lock等RAII封装,锁在离开作用域时自动释放,则可避免这种情况,
原理与智能指针自动释放内存类似。
*/
// Demo 3: forgetting to unlock. popData never releases mutex_ (the
// unlock is deliberately commented out), so the next call on either
// method blocks forever. RAII wrappers (std::lock_guard /
// std::unique_lock) make this class of bug impossible; the bug is kept
// here on purpose, only the compile errors are fixed.
class dataPool
{
public:
    dataPool() = default;
    // Correctly paired lock()/unlock().
    void pushData(const std::string& data)
    {
        mutex_.lock();
        dataPool_.push_back(data);
        mutex_.unlock();
    }
    // BUG (by design): returns with mutex_ still locked.
    void popData(const std::string& data)
    {
        mutex_.lock();
        if (!dataPool_.empty())      // popping an empty list is UB
        {
            dataPool_.pop_front();   // std::list::pop_front() takes no argument
        }
        //mutex_.unlock();
    }
private:
    std::mutex mutex_;
    std::list<std::string> dataPool_;  // template argument restored
};
1. 两个互斥量总是以相同的顺序上锁:例如总在互斥量B之前锁住互斥量A,所有线程都遵守同一顺序
2. 避免嵌套锁:不要尝试在一个线程里获取两次锁
3. 有lock必须在后面unlock,或使用std::unique_lock进行包装
如果还是不小心出现了死锁情况,我们就要学会定位。一种方法是通过日志去定位大概的死锁点,不过这种方法有时并不适用。这里我们主要介绍一下使用gdb调试的方法进行定位死锁问题。
下面这段代码加上主线程有三个线程,同时对数据区进行push和pop操作,操作中构造了死锁产生的条件以便复现死锁问题。
(如果gdb查看堆栈信息时有不显示代码行号的问题,就在编译进程时加上 -g 选项)
#include <cstring>
#include <iostream>
#include <memory>
#include <mutex>
#include <thread>
#include <vector>
#define POOL_MAX_LEN (100)                // number of slots in the ring buffer
#define POOL_DATA_MAX_SIZE (1*1024*1024)  // bytes per slot (1 MB)
// Plain data record: a heap-allocated byte buffer plus its length.
// Members carry default initializers so a freshly new'ed sourceData is
// safe to inspect (and safe to hand to the deleter) before data is set;
// the original left both members uninitialized, and dataLen was read
// before ever being written by the producer path.
using sourceData = struct sourceData_
{
    char* data = nullptr;
    int dataLen = 0;
};
// Custom deleter for shared_ptr<sourceData>: releases the owned buffer,
// then the record itself.
// Fixes vs. original: (1) `data` is allocated with new char[...], so it
// must be freed with delete[] -- plain delete was undefined behavior;
// (2) the record pointer is null-checked BEFORE being dereferenced (the
// original dereferenced delData->data first and checked delData after).
struct delSourceData
{
    void operator()(sourceData* delData)
    {
        if (delData)
        {
            delete[] delData->data;   // delete[] matches new char[]
            delData->data = nullptr;
            delete delData;
        }
    }
};
class DataPool
{
public:
DataPool() :dataPool(100),
poolBegin_(0),
poolEnd_(0),
poolEmpty_(true),
poolFull_(false)
{
for (int idx = 0; idx < POOL_MAX_LEN;++idx)
{
std::shared_ptr tmpData(new sourceData, delSourceData());
tmpData->data = new char[POOL_DATA_MAX_SIZE];
dataPool[idx] = tmpData;
}
}
~DataPool() = default;
void pushToPool(std::shared_ptr inputData)
{
std::unique_lock lckOne(mutexOne_);
std::unique_lock lckTwo(mutexTwo_);
if (!inputData->data&&!poolFull_)
{
dataPool[poolEnd_]->dataLen = inputData->dataLen;
int copyLen = inputData->dataLen>POOL_DATA_MAX_SIZE ? POOL_DATA_MAX_SIZE :
inputData->dataLen;
memcpy(dataPool[poolEnd_]->data, inputData->data, copyLen);
poolEnd_ = (poolEnd_ + 1) % POOL_MAX_LEN;
poolEmpty_ = false;
poolFull_ = poolEmpty_ == poolFull_ ? true : false;
}
}
void popFromPool(std::shared_ptr outData)
{
std::unique_lock lckTwo(mutexTwo_);
std::unique_lock lckOne(mutexOne_);
if (poolEmpty_)
{
outData->dataLen = dataPool[poolBegin_]->dataLen;
memcpy(outData->data, dataPool[poolBegin_]->data, outData->dataLen);
poolBegin_ = (poolBegin_ + 1) % POOL_MAX_LEN;
poolFull_ = false;
poolEmpty_ = poolEmpty_ == poolFull_ ? true : false;
}
}
bool poolFull()
{
return poolFull_;
}
bool poolEmpty()
{
return poolEmpty_;
}
private:
std::vector > dataPool;
std::mutex mutexOne_;
std::mutex mutexTwo_;
int poolBegin_;
int poolEnd_;
bool poolEmpty_;
bool poolFull_;
};
class FetchData
{
public:
FetchData() :pool_(std::make_shared())
{
}
~FetchData() = default;
void pushData()
{
while (true)
{
std::shared_ptr inputData(new sourceData, delSourceData());
inputData->data = new char[POOL_DATA_MAX_SIZE];
pool_->pushToPool(inputData);
}
}
void popData()
{
while (true)
{
std::shared_ptr outData(new sourceData, delSourceData());
outData->data = new char[POOL_DATA_MAX_SIZE];
pool_->popFromPool(outData);
std::cout << "easy to occur deadlock" << std::endl;
}
}
private:
std::shared_ptr pool_;
};
int main()
{
std::shared_ptr fetchData = std::make_shared();
std::thread tOne(&FetchData::pushData, fetchData);
std::thread tTwo(&FetchData::popData, fetchData);
tOne.join();
tTwo.join();
std::cout << "over" << std::endl;
return 0;
}
1. 编译完成后,运行之前要打开core dump开关
ulimit -c unlimited
2. 运行程序
[root@ia10k gdb]# ./a.out
only for a test
only for a test
3. 查看进程id,使用kill命令产生core dump
[root@ia10k gdb]# pidof a.out
21458
[root@ia10k gdb]# kill -11 21458
[root@ia10k gdb]#
# 这时可以看到进程已产生core dump
[root@ia10k gdb]# ./a.out
only for a test
only for a test
Segmentation fault (core dumped)
[root@ia10k gdb]# ls
a.out core.18734 deadlock.cpp
[root@ia10k gdb]#
4. gdb打开core文件
[root@ia10k gdb]# gdb a.out core.18734
5. 打印所有线程的堆栈信息
(gdb) thread apply all bt
Thread 3 (Thread 0x7fdc50a40700 (LWP 18735)):
#0 0x00007fdc5799542d in __lll_lock_wait () from /usr/lib64/libpthread.so.0
#1 0x00007fdc57990dcb in _L_lock_812 () from /usr/lib64/libpthread.so.0
#2 0x00007fdc57990c98 in pthread_mutex_lock () from /usr/lib64/libpthread.so.0
#3 0x00000000004010dc in __gthread_mutex_lock (__mutex=0xeab098) at /usr/include/c++/4.8.2/x86_64-redhat-linux/bits/gthr-default.h:748
#4 0x000000000040141e in std::mutex::lock (this=0xeab098) at /usr/include/c++/4.8.2/mutex:134
#5 0x0000000000402575 in std::unique_lock::lock (this=0x7fdc50a3fd80) at /usr/include/c++/4.8.2/mutex:511
#6 0x0000000000401f99 in std::unique_lock::unique_lock (this=0x7fdc50a3fd80, __m=...) at /usr/include/c++/4.8.2/mutex:443
#7 0x000000000040183f in DataPool::pushToPool (this=0xeab058, inputData=std::shared_ptr (count 2, weak 0) 0x7fdc480008c0) at deadlock.cpp:53
#8 0x0000000000401c45 in FetchData::pushData (this=0xeab028) at deadlock.cpp:109
#9 0x0000000000405414 in std::_Mem_fn::_M_call>(std::shared_ptr&&, void const volatile*) const (this=0xead6b0,
__ptr=) at /usr/include/c++/4.8.2/functional:558
#10 0x00000000004053a8 in std::_Mem_fn::operator(), , void>(std::shared_ptr&&) const (this=0xead6b0,
__object=) at /usr/include/c++/4.8.2/functional:610
#11 0x0000000000405261 in std::_Bind_simple (std::shared_ptr)>::_M_invoke<0ul>(std::_Index_tuple<0ul>) (this=0xead6a0)
at /usr/include/c++/4.8.2/functional:1732
#12 0x00000000004050b1 in std::_Bind_simple (std::shared_ptr)>::operator()() (this=0xead6a0) at /usr/include/c++/4.8.2/functional:1720
#13 0x0000000000404f90 in std::thread::_Impl (std::shared_ptr)> >::_M_run() (this=0xead688)
at /usr/include/c++/4.8.2/thread:115
#14 0x00007fdc577342b0 in ?? () from /usr/lib64/libstdc++.so.6
#15 0x00007fdc5798ee25 in start_thread () from /usr/lib64/libpthread.so.0
#16 0x00007fdc56e9c34d in clone () from /usr/lib64/libc.so.6
Thread 2 (Thread 0x7fdc5023f700 (LWP 18736)):
#0 0x00007fdc5799542d in __lll_lock_wait () from /usr/lib64/libpthread.so.0
#1 0x00007fdc57990dcb in _L_lock_812 () from /usr/lib64/libpthread.so.0
#2 0x00007fdc57990c98 in pthread_mutex_lock () from /usr/lib64/libpthread.so.0
#3 0x00000000004010dc in __gthread_mutex_lock (__mutex=0xeab070) at /usr/include/c++/4.8.2/x86_64-redhat-linux/bits/gthr-default.h:748
#4 0x000000000040141e in std::mutex::lock (this=0xeab070) at /usr/include/c++/4.8.2/mutex:134
#5 0x0000000000402575 in std::unique_lock::lock (this=0x7fdc5023ed90) at /usr/include/c++/4.8.2/mutex:511
#6 0x0000000000401f99 in std::unique_lock::unique_lock (this=0x7fdc5023ed90, __m=...) at /usr/include/c++/4.8.2/mutex:443
#7 0x00000000004019f1 in DataPool::popFromPool (this=0xeab058, outData=std::shared_ptr (count 2, weak 0) 0x7fdc400008c0) at deadlock.cpp:68
#8 0x0000000000401d07 in FetchData::popData (this=0xeab028) at deadlock.cpp:119
#9 0x0000000000405414 in std::_Mem_fn::_M_call>(std::shared_ptr&&, void const volatile*) const (this=0xead960,
__ptr=) at /usr/include/c++/4.8.2/functional:558
#10 0x00000000004053a8 in std::_Mem_fn::operator(), , void>(std::shared_ptr&&) const (this=0xead960,
__object=) at /usr/include/c++/4.8.2/functional:610
#11 0x0000000000405261 in std::_Bind_simple (std::shared_ptr)>::_M_invoke<0ul>(std::_Index_tuple<0ul>) (this=0xead950)
---Type to continue, or q to quit---
at /usr/include/c++/4.8.2/functional:1732
#12 0x00000000004050b1 in std::_Bind_simple (std::shared_ptr)>::operator()() (this=0xead950) at /usr/include/c++/4.8.2/functional:1720
#13 0x0000000000404f90 in std::thread::_Impl (std::shared_ptr)> >::_M_run() (this=0xead938)
at /usr/include/c++/4.8.2/thread:115
#14 0x00007fdc577342b0 in ?? () from /usr/lib64/libstdc++.so.6
#15 0x00007fdc5798ee25 in start_thread () from /usr/lib64/libpthread.so.0
#16 0x00007fdc56e9c34d in clone () from /usr/lib64/libc.so.6
Thread 1 (Thread 0x7fdc57d9e740 (LWP 18734)):
#0 0x00007fdc5798ff57 in pthread_join () from /usr/lib64/libpthread.so.0
#1 0x00007fdc57734077 in std::thread::join() () from /usr/lib64/libstdc++.so.6
#2 0x0000000000401292 in main () at deadlock.cpp:131
(gdb)
(gdb)
6. 发现线程号为2和3的线程栈卡在lock_wait或者类似调用上,说明这几个线程极有可能产生死锁
7. info threads可以查看运行的线程,行首的数字表示gdb分配的线程号,切换线程时使用该号码,*表示的 是当前线程
(gdb) info threads
Id Target Id Frame
3 Thread 0x7fdc50a40700 (LWP 18735) 0x00007fdc5799542d in __lll_lock_wait () from /usr/lib64/libpthread.so.0
2 Thread 0x7fdc5023f700 (LWP 18736) 0x00007fdc5799542d in __lll_lock_wait () from /usr/lib64/libpthread.so.0
* 1 Thread 0x7fdc57d9e740 (LWP 18734) 0x00007fdc5798ff57 in pthread_join () from /usr/lib64/libpthread.so.0
(gdb)
8. 利用thread ID命令切换到怀疑的线程,打印锁的信息,Owner字段表示哪个线程持有这把锁,它是线程的lwp号,可以通过info threads查看,这里我们查看线程3,并打印堆栈信息
(gdb) thread 3
[Switching to thread 3 (Thread 0x7fdc50a40700 (LWP 18735))]
#0 0x00007fdc5799542d in __lll_lock_wait () from /usr/lib64/libpthread.so.0
(gdb) bt
#0 0x00007fdc5799542d in __lll_lock_wait () from /usr/lib64/libpthread.so.0
#1 0x00007fdc57990dcb in _L_lock_812 () from /usr/lib64/libpthread.so.0
#2 0x00007fdc57990c98 in pthread_mutex_lock () from /usr/lib64/libpthread.so.0
#3 0x00000000004010dc in __gthread_mutex_lock (__mutex=0xeab098) at /usr/include/c++/4.8.2/x86_64-redhat-linux/bits/gthr-default.h:748
#4 0x000000000040141e in std::mutex::lock (this=0xeab098) at /usr/include/c++/4.8.2/mutex:134
#5 0x0000000000402575 in std::unique_lock::lock (this=0x7fdc50a3fd80) at /usr/include/c++/4.8.2/mutex:511
#6 0x0000000000401f99 in std::unique_lock::unique_lock (this=0x7fdc50a3fd80, __m=...) at /usr/include/c++/4.8.2/mutex:443
#7 0x000000000040183f in DataPool::pushToPool (this=0xeab058, inputData=std::shared_ptr (count 2, weak 0) 0x7fdc480008c0) at deadlock.cpp:53
#8 0x0000000000401c45 in FetchData::pushData (this=0xeab028) at deadlock.cpp:109
#9 0x0000000000405414 in std::_Mem_fn::_M_call>(std::shared_ptr&&, void const volatile*) const (this=0xead6b0,
__ptr=) at /usr/include/c++/4.8.2/functional:558
#10 0x00000000004053a8 in std::_Mem_fn::operator(), , void>(std::shared_ptr&&) const (this=0xead6b0,
__object=) at /usr/include/c++/4.8.2/functional:610
#11 0x0000000000405261 in std::_Bind_simple (std::shared_ptr)>::_M_invoke<0ul>(std::_Index_tuple<0ul>) (this=0xead6a0)
at /usr/include/c++/4.8.2/functional:1732
#12 0x00000000004050b1 in std::_Bind_simple (std::shared_ptr)>::operator()() (this=0xead6a0) at /usr/include/c++/4.8.2/functional:1720
#13 0x0000000000404f90 in std::thread::_Impl (std::shared_ptr)> >::_M_run() (this=0xead688)
at /usr/include/c++/4.8.2/thread:115
#14 0x00007fdc577342b0 in ?? () from /usr/lib64/libstdc++.so.6
#15 0x00007fdc5798ee25 in start_thread () from /usr/lib64/libpthread.so.0
#16 0x00007fdc56e9c34d in clone () from /usr/lib64/libc.so.6
(gdb)
9. 可以看到堆栈最终卡在源码中的第53行,堆栈的第7层,我们使用frame命令查看第7层具体信息
(gdb) frame 7
#7 0x000000000040183f in DataPool::pushToPool (this=0xeab058, inputData=std::shared_ptr (count 2, weak 0) 0x7fdc480008c0) at deadlock.cpp:53
53 std::unique_lock lckTwo(mutexTwo_);
(gdb)
10. 查看一下lckTwo这把锁和mutexTwo_的信息
(gdb) p lckTwo
$1 = {_M_device = 0xeab098, _M_owns = false}
(gdb) p mutexTwo_
$2 = { = {_M_mutex = {__data = {__lock = 2, __count = 0, __owner = 18736, __nusers = 1, __kind = 0, __spins = 0, __elision = 0, __list = {__prev = 0x0, __next = 0x0}},
__size = "\002\000\000\000\000\000\000\000\060I\000\000\001", '\000' , __align = 2}}, }
(gdb)
# 这里我们可以看到线程3当前没有拥有lckTwo这把锁,互斥量mutexTwo_当前被lwp号为18736的线程占有,即线程2,那么这个时候我们就找到了死锁的具体点,通过审查线程2执行的代码,死锁很快就能定位出来。我们这段代码是有意地构造死锁条件,进行逆推。不过真实开发中出现的死锁问题,也可以通过gdb调试结合core dump分析来解决。