使用gdb定位线程死锁问题

死锁概念

什么是死锁,简单地说死锁就是一种现象,比如说有A,B两个线程 ,mtx_a,mtx_b两个互斥量。如果线程A拥有mtx_a的同时,去竞争mtx_b,刚好线程B这时拥有mtx_b,去竞争mtx_a,这时两个线程想要竞争的资源刚好都被对方占用,且都等待对方释放资源,由此陷入了僵局,这就是线程死锁的一种现象。

 

死锁的例子

平常编写代码的过程中比较容易出现的死锁主要有下面这几种,每种情况我们写一个简单的例子把问题点暴露出来,当然还有其他的情况这里就不一一举例说明了。

1. 两个或两个以上的互斥锁,上锁顺序问题

/*
    Example 1: two mutexes locked in inconsistent order.
    One thread keeps calling pushData while another keeps calling popData.
    A deadlock is easy to hit: pushData holds mutex1 and requests mutex2 while,
    at the same time, popData holds mutex2 and requests mutex1.
*/
class dataPool
{
public:
    dataPool()=default;

    // Appends data to the back of the pool.
    // Locks mutex1 first, then mutex2.
    void pushData(const std::string& data)
    {
        std::unique_lock<std::mutex> lck1(mutex1);
        std::unique_lock<std::mutex> lck2(mutex2);
        dataPool_.push_back(data);
    }

    // Removes the front element when there is one; the data argument is not
    // used and is kept only so the signature stays as it was.
    // INTENTIONAL BUG (the point of this example): the mutexes are taken in
    // the opposite order to pushData (mutex2 first, then mutex1).
    void popData(const std::string& data)
    {
        std::unique_lock<std::mutex> lck2(mutex2);
        std::unique_lock<std::mutex> lck1(mutex1);
        (void)data;
        // pop_front() on an empty std::list is undefined behaviour.
        if (!dataPool_.empty())
        {
            dataPool_.pop_front();
        }
    }
private:
    std::mutex mutex1;
    std::mutex mutex2;
    std::list<std::string> dataPool_;
};

2. 嵌套锁

/*
    Example 2: nested locking of the same mutex.
    When the call hierarchy is deep it is easy to lock the same mutex twice
    without noticing.
    pushData locks mutex_ and then calls findData, which requests mutex_
    again, so this deadlocks every time.
*/
class dataPool
{
public:
    dataPool()=default;

    // Stores data under code unless code is already present.
    // INTENTIONAL BUG (the point of this example): mutex_ is still held when
    // findData is called, and findData locks mutex_ again on the same thread.
    // std::mutex is not recursive, so the second lock never succeeds.
    void pushData(int code,const std::string& data)
    {
        std::unique_lock<std::mutex> lck(mutex_);
        if(!findData(code))
        {
            dataMap_[code]=data;
        }
    }

    // Returns true when an entry for code exists. Locks mutex_ itself.
    bool findData(int code)
    {
        std::unique_lock<std::mutex> lck(mutex_);
        return dataMap_.find(code)!=dataMap_.end();
    }
private:
    std::mutex mutex_;
    std::map<int,std::string> dataMap_;
};


3. 忘记解锁

/*
    Example 3: forgetting to unlock.
    A missing unlock is another way to deadlock. Using a C++11 scope-bound
    lock wrapper such as std::unique_lock, which unlocks automatically when it
    goes out of scope, avoids the problem, much like a smart pointer does for
    memory.
*/
class dataPool
{
public:
    dataPool()=default;

    // Appends data to the back of the pool, locking and unlocking by hand.
    void pushData(const std::string& data)
    {
        mutex_.lock();
        dataPool_.push_back(data);
        mutex_.unlock();
    }

    // Removes the front element when there is one; the data argument is not
    // used and is kept only so the signature stays as it was.
    void popData(const std::string& data)
    {
        mutex_.lock();
        (void)data;
        // pop_front() on an empty std::list is undefined behaviour.
        if (!dataPool_.empty())
        {
            dataPool_.pop_front();
        }
        // INTENTIONAL BUG (the point of this example): the unlock is missing,
        // so mutex_ stays locked and the next lock() on it blocks forever.
        //mutex_.unlock();
    }
private:
    std::mutex mutex_;
    std::list<std::string> dataPool_;
};

避免方法

1. 两个互斥量总是以相同的顺序上锁:总在互斥量B之前锁住互斥量A
2. 避免嵌套锁:不要尝试在一个线程里对同一把锁重复上锁
3. 有lock必须在后面unlock,或使用std::unique_lock进行包装

 

定位方法

如果还是不小心出现了死锁情况,我们就要学会定位。一种方法是通过日志去定位大概的死锁点,不过这种方法有时并不适用。这里我们主要介绍一下使用gdb调试的方法进行定位死锁问题。

下面这段代码加上主线程有三个线程,同时对数据区进行push和pop操作,操作中构造了死锁产生的条件以便复现死锁问题。

(如果gdb查看堆栈信息时有不显示代码行号的问题,就在编译程序时加上 -g 选项)

 

#include <cstring>
#include <iostream>
#include <memory>
#include <mutex>
#include <thread>
#include <vector>
// Number of slots in DataPool; used as the loop bound when the slots are
// created and as the modulus when the begin/end indices advance.
#define POOL_MAX_LEN (100)
// Size in bytes (1 MiB) of every data buffer allocated in this program, and
// the upper bound applied to the copy length in DataPool::pushToPool.
#define POOL_DATA_MAX_SIZE (1*1024*1024)
// A heap buffer together with the number of meaningful bytes in it.
// Both members have default initialisers so that an object created with
// `new sourceData` never carries an indeterminate pointer or length.
struct sourceData_
{
	char* data = nullptr;	// buffer obtained with new char[]; released by delSourceData
	int dataLen = 0;	// number of valid bytes in data
};
using sourceData = sourceData_;

// Deleter for std::shared_ptr<sourceData>: releases the buffer and then the
// object itself.
struct delSourceData
{
	// delData may be null, in which case nothing is done.
	void operator()(sourceData* delData)
	{
		// Check the object pointer before touching any of its members.
		if (!delData)
		{
			return;
		}
		// The buffer is allocated with new char[], so it needs delete[];
		// delete[] on a null pointer is a no-op.
		delete[] delData->data;
		delete delData;
	}
};
class DataPool
{
public:
	DataPool() :dataPool(100),
		poolBegin_(0),
		poolEnd_(0),
		poolEmpty_(true),
		poolFull_(false)
	{
		for (int idx = 0; idx < POOL_MAX_LEN;++idx)
		{
			std::shared_ptr tmpData(new sourceData, delSourceData());
			tmpData->data = new char[POOL_DATA_MAX_SIZE];
			dataPool[idx] = tmpData;
		}
	}
	~DataPool() = default;
	void pushToPool(std::shared_ptr inputData)
	{
		std::unique_lock lckOne(mutexOne_);
		std::unique_lock lckTwo(mutexTwo_);
		if (!inputData->data&&!poolFull_)
		{
			dataPool[poolEnd_]->dataLen = inputData->dataLen;
			int copyLen = inputData->dataLen>POOL_DATA_MAX_SIZE ? POOL_DATA_MAX_SIZE :
				inputData->dataLen;
			memcpy(dataPool[poolEnd_]->data, inputData->data, copyLen);
			poolEnd_ = (poolEnd_ + 1) % POOL_MAX_LEN;
			poolEmpty_ = false;
			poolFull_ = poolEmpty_ == poolFull_ ? true : false;
		}
	}
	void popFromPool(std::shared_ptr outData)
	{
		std::unique_lock lckTwo(mutexTwo_);
		std::unique_lock lckOne(mutexOne_);
		if (poolEmpty_)
		{
			outData->dataLen = dataPool[poolBegin_]->dataLen;
			memcpy(outData->data, dataPool[poolBegin_]->data, outData->dataLen);
			poolBegin_ = (poolBegin_ + 1) % POOL_MAX_LEN;
			poolFull_ = false;
			poolEmpty_ = poolEmpty_ == poolFull_ ? true : false;
		}
	}
	bool poolFull()
	{
		return poolFull_;
	}
	bool poolEmpty()
	{
		return poolEmpty_;
	}
private:
	std::vector > dataPool;
	std::mutex mutexOne_;
	std::mutex mutexTwo_;
	int poolBegin_;
	int poolEnd_;
	bool poolEmpty_;
	bool poolFull_;
};
class FetchData
{
public:
	FetchData() :pool_(std::make_shared())
	{

	}
	~FetchData() = default;
	void pushData()
	{
		while (true)
		{
			std::shared_ptr inputData(new sourceData, delSourceData());
			inputData->data = new char[POOL_DATA_MAX_SIZE];
			pool_->pushToPool(inputData);
		}
	}
	void popData()
	{
		while (true)
		{
			std::shared_ptr outData(new sourceData, delSourceData());
			outData->data = new char[POOL_DATA_MAX_SIZE];
			pool_->popFromPool(outData);
			std::cout << "easy to occur deadlock" << std::endl;
		}
	}
private:
	std::shared_ptr pool_;
};
int main()
{
	std::shared_ptr fetchData = std::make_shared();
	std::thread tOne(&FetchData::pushData, fetchData);
	std::thread tTwo(&FetchData::popData, fetchData);
	tOne.join();
	tTwo.join();
	std::cout << "over" << std::endl;

	return 0;
}
1. 编译完成后,运行之前要打开core dump开关
   ulimit -c unlimited

2. 运行程序
   [root@ia10k gdb]# ./a.out 
   only for a test
   only for a test

3. 查看进程id,使用kill命令产生core dump
   [root@ia10k gdb]# pidof a.out 
   21458
   [root@ia10k gdb]# kill -11 21458
   [root@ia10k gdb]# 
   # 这时可以看到进程已产生core dump
   [root@ia10k gdb]# ./a.out 
   only for a test
   only for a test
   Segmentation fault (core dumped)
   [root@ia10k gdb]# ls
   a.out  core.18734  deadlock.cpp
   [root@ia10k gdb]#  

4. gdb打开core文件
   [root@ia10k gdb]# gdb a.out core.18734

5. 打印所有线程的堆栈信息
(gdb) thread apply all bt

Thread 3 (Thread 0x7fdc50a40700 (LWP 18735)):
#0  0x00007fdc5799542d in __lll_lock_wait () from /usr/lib64/libpthread.so.0
#1  0x00007fdc57990dcb in _L_lock_812 () from /usr/lib64/libpthread.so.0
#2  0x00007fdc57990c98 in pthread_mutex_lock () from /usr/lib64/libpthread.so.0
#3  0x00000000004010dc in __gthread_mutex_lock (__mutex=0xeab098) at /usr/include/c++/4.8.2/x86_64-redhat-linux/bits/gthr-default.h:748
#4  0x000000000040141e in std::mutex::lock (this=0xeab098) at /usr/include/c++/4.8.2/mutex:134
#5  0x0000000000402575 in std::unique_lock::lock (this=0x7fdc50a3fd80) at /usr/include/c++/4.8.2/mutex:511
#6  0x0000000000401f99 in std::unique_lock::unique_lock (this=0x7fdc50a3fd80, __m=...) at /usr/include/c++/4.8.2/mutex:443
#7  0x000000000040183f in DataPool::pushToPool (this=0xeab058, inputData=std::shared_ptr (count 2, weak 0) 0x7fdc480008c0) at deadlock.cpp:53
#8  0x0000000000401c45 in FetchData::pushData (this=0xeab028) at deadlock.cpp:109
#9  0x0000000000405414 in std::_Mem_fn::_M_call>(std::shared_ptr&&, void const volatile*) const (this=0xead6b0, 
    __ptr=) at /usr/include/c++/4.8.2/functional:558
#10 0x00000000004053a8 in std::_Mem_fn::operator(), , void>(std::shared_ptr&&) const (this=0xead6b0, 
    __object=) at /usr/include/c++/4.8.2/functional:610
#11 0x0000000000405261 in std::_Bind_simple (std::shared_ptr)>::_M_invoke<0ul>(std::_Index_tuple<0ul>) (this=0xead6a0)
    at /usr/include/c++/4.8.2/functional:1732
#12 0x00000000004050b1 in std::_Bind_simple (std::shared_ptr)>::operator()() (this=0xead6a0) at /usr/include/c++/4.8.2/functional:1720
#13 0x0000000000404f90 in std::thread::_Impl (std::shared_ptr)> >::_M_run() (this=0xead688)
    at /usr/include/c++/4.8.2/thread:115
#14 0x00007fdc577342b0 in ?? () from /usr/lib64/libstdc++.so.6
#15 0x00007fdc5798ee25 in start_thread () from /usr/lib64/libpthread.so.0
#16 0x00007fdc56e9c34d in clone () from /usr/lib64/libc.so.6

Thread 2 (Thread 0x7fdc5023f700 (LWP 18736)):
#0  0x00007fdc5799542d in __lll_lock_wait () from /usr/lib64/libpthread.so.0
#1  0x00007fdc57990dcb in _L_lock_812 () from /usr/lib64/libpthread.so.0
#2  0x00007fdc57990c98 in pthread_mutex_lock () from /usr/lib64/libpthread.so.0
#3  0x00000000004010dc in __gthread_mutex_lock (__mutex=0xeab070) at /usr/include/c++/4.8.2/x86_64-redhat-linux/bits/gthr-default.h:748
#4  0x000000000040141e in std::mutex::lock (this=0xeab070) at /usr/include/c++/4.8.2/mutex:134
#5  0x0000000000402575 in std::unique_lock::lock (this=0x7fdc5023ed90) at /usr/include/c++/4.8.2/mutex:511
#6  0x0000000000401f99 in std::unique_lock::unique_lock (this=0x7fdc5023ed90, __m=...) at /usr/include/c++/4.8.2/mutex:443
#7  0x00000000004019f1 in DataPool::popFromPool (this=0xeab058, outData=std::shared_ptr (count 2, weak 0) 0x7fdc400008c0) at deadlock.cpp:68
#8  0x0000000000401d07 in FetchData::popData (this=0xeab028) at deadlock.cpp:119
#9  0x0000000000405414 in std::_Mem_fn::_M_call>(std::shared_ptr&&, void const volatile*) const (this=0xead960, 
    __ptr=) at /usr/include/c++/4.8.2/functional:558
#10 0x00000000004053a8 in std::_Mem_fn::operator(), , void>(std::shared_ptr&&) const (this=0xead960, 
    __object=) at /usr/include/c++/4.8.2/functional:610
#11 0x0000000000405261 in std::_Bind_simple (std::shared_ptr)>::_M_invoke<0ul>(std::_Index_tuple<0ul>) (this=0xead950)
---Type <return> to continue, or q <return> to quit---
    at /usr/include/c++/4.8.2/functional:1732
#12 0x00000000004050b1 in std::_Bind_simple (std::shared_ptr)>::operator()() (this=0xead950) at /usr/include/c++/4.8.2/functional:1720
#13 0x0000000000404f90 in std::thread::_Impl (std::shared_ptr)> >::_M_run() (this=0xead938)
    at /usr/include/c++/4.8.2/thread:115
#14 0x00007fdc577342b0 in ?? () from /usr/lib64/libstdc++.so.6
#15 0x00007fdc5798ee25 in start_thread () from /usr/lib64/libpthread.so.0
#16 0x00007fdc56e9c34d in clone () from /usr/lib64/libc.so.6

Thread 1 (Thread 0x7fdc57d9e740 (LWP 18734)):
#0  0x00007fdc5798ff57 in pthread_join () from /usr/lib64/libpthread.so.0
#1  0x00007fdc57734077 in std::thread::join() () from /usr/lib64/libstdc++.so.6
#2  0x0000000000401292 in main () at deadlock.cpp:131
(gdb) 
(gdb) 

6. 发现线程号为2和3的线程栈卡在lock_wait或者类似调用上,说明这几个线程极有可能产生死锁

7. info threads可以查看运行的线程,行首的数字表示gdb分配的线程号,切换线程时使用该号码,*表示的是当前线程
(gdb) info threads
  Id   Target Id         Frame 
  3    Thread 0x7fdc50a40700 (LWP 18735) 0x00007fdc5799542d in __lll_lock_wait () from         /usr/lib64/libpthread.so.0
  2    Thread 0x7fdc5023f700 (LWP 18736) 0x00007fdc5799542d in __lll_lock_wait () from /usr/lib64/libpthread.so.0
* 1    Thread 0x7fdc57d9e740 (LWP 18734) 0x00007fdc5798ff57 in pthread_join () from /usr/lib64/libpthread.so.0
(gdb) 

8. 利用thread ID命令切换到怀疑的线程,打印锁的信息,Owner字段表示哪个线程持有这把锁,它是线程的lwp号,可以通过info threads查看,这里我们查看线程3,并打印堆栈信息
(gdb) thread 3
[Switching to thread 3 (Thread 0x7fdc50a40700 (LWP 18735))]
#0  0x00007fdc5799542d in __lll_lock_wait () from /usr/lib64/libpthread.so.0
(gdb) bt
#0  0x00007fdc5799542d in __lll_lock_wait () from /usr/lib64/libpthread.so.0
#1  0x00007fdc57990dcb in _L_lock_812 () from /usr/lib64/libpthread.so.0
#2  0x00007fdc57990c98 in pthread_mutex_lock () from /usr/lib64/libpthread.so.0
#3  0x00000000004010dc in __gthread_mutex_lock (__mutex=0xeab098) at /usr/include/c++/4.8.2/x86_64-redhat-linux/bits/gthr-default.h:748
#4  0x000000000040141e in std::mutex::lock (this=0xeab098) at /usr/include/c++/4.8.2/mutex:134
#5  0x0000000000402575 in std::unique_lock::lock (this=0x7fdc50a3fd80) at /usr/include/c++/4.8.2/mutex:511
#6  0x0000000000401f99 in std::unique_lock::unique_lock (this=0x7fdc50a3fd80, __m=...) at /usr/include/c++/4.8.2/mutex:443
#7  0x000000000040183f in DataPool::pushToPool (this=0xeab058, inputData=std::shared_ptr (count 2, weak 0) 0x7fdc480008c0) at deadlock.cpp:53
#8  0x0000000000401c45 in FetchData::pushData (this=0xeab028) at deadlock.cpp:109
#9  0x0000000000405414 in std::_Mem_fn::_M_call>(std::shared_ptr&&, void const volatile*) const (this=0xead6b0, 
    __ptr=) at /usr/include/c++/4.8.2/functional:558
#10 0x00000000004053a8 in std::_Mem_fn::operator(), , void>(std::shared_ptr&&) const (this=0xead6b0, 
    __object=) at /usr/include/c++/4.8.2/functional:610
#11 0x0000000000405261 in std::_Bind_simple (std::shared_ptr)>::_M_invoke<0ul>(std::_Index_tuple<0ul>) (this=0xead6a0)
    at /usr/include/c++/4.8.2/functional:1732
#12 0x00000000004050b1 in std::_Bind_simple (std::shared_ptr)>::operator()() (this=0xead6a0) at /usr/include/c++/4.8.2/functional:1720
#13 0x0000000000404f90 in std::thread::_Impl (std::shared_ptr)> >::_M_run() (this=0xead688)
    at /usr/include/c++/4.8.2/thread:115
#14 0x00007fdc577342b0 in ?? () from /usr/lib64/libstdc++.so.6
#15 0x00007fdc5798ee25 in start_thread () from /usr/lib64/libpthread.so.0
#16 0x00007fdc56e9c34d in clone () from /usr/lib64/libc.so.6
(gdb) 

9. 可以看到堆栈最终卡在源码中的第53行,堆栈的第7层,我们使用frame命令查看第7层具体信息
(gdb) frame 7
#7  0x000000000040183f in DataPool::pushToPool (this=0xeab058, inputData=std::shared_ptr (count 2, weak 0) 0x7fdc480008c0) at deadlock.cpp:53
53			std::unique_lock lckTwo(mutexTwo_);
(gdb) 

10. 查看一下lckTwo这把锁和mutexTwo_的信息
(gdb) p lckTwo 
$1 = {_M_device = 0xeab098, _M_owns = false}
(gdb) p mutexTwo_ 
$2 = {<std::__mutex_base> = {_M_mutex = {__data = {__lock = 2, __count = 0, __owner = 18736, __nusers = 1, __kind = 0, __spins = 0, __elision = 0, __list = {__prev = 0x0, __next = 0x0}}, 
      __size = "\002\000\000\000\000\000\000\000\060I\000\000\001", '\000' <repeats 26 times>, __align = 2}}, <No data fields>}
(gdb) 
# 这里我们可以看到线程3当前没有拥有lckTwo这把锁,互斥量mutexTwo_当前被lwp号为18736的线程占有,即线程2,那么这个时候我们就找到了死锁的具体点,通过审查线程2执行的代码,死锁很快就能定位出来。我们这段代码是有意地构造死锁条件,进行逆推。不过真实开发中出现的死锁问题,也可以通过gdb调试结合core dump分析来解决。




 

你可能感兴趣的:(C++)