处理这个死锁问题,花了好几天,相信遇到的同学,一样头疼,但有个好辅助类的话(好在.net的API足够强大),就没这么头疼了
本篇文章的解决方案只适合使用lock(obj),或是:Monitor.Enter(obj); …. Monitor.Exit(obj)的方式
类似酱紫的死锁
如果你使用的是:AutoResetEvent.Set/Reset, Monitor.Wait/Pulse, Mutex的方式,请另寻他法。
//------------------------------------------------------------------------
// Created by Jave.Lin 4/21/2018 5:31:57 PM
//------------------------------------------------------------------------
using System;
using System.Collections.Generic;
using System.Threading;
//namespace Common.ComUtil
//{
/// <summary>
/// Locker Information: the object that callers lock on, together with the diagnostic
/// state that is recorded about the most recent acquisition of that lock.
/// author : Jave.Lin
/// date : 4/21/2018 5:31:57 PM
/// </summary>
public class Locker
{
    public object tag; // caller-supplied context data
    public int threadID; // id of the thread that acquired the lock
    public string name; // display name of the lock
    public int lockedTimes; // cumulative number of acquisitions, useful for judging deadlock likelihood
    public int lockingTs; // timestamp taken at the moment the lock was acquired
    public bool enter; // flag: the lock has been acquired
    public bool exit; // flag: the lock has been released
    public string lockingStackTrace; // call stack captured at the moment the lock was acquired
    public int lockingDLTs; // timestamp set when the acquisition was classified as a deadlock
    public List<Locker> pRecordList; // history of state snapshots, see PushRecord()

    // Guards pRecordList: records are pushed, cleared and formatted from different
    // threads (a dump is typically produced by a thread that does not hold this lock).
    private readonly object _recordGate = new object();

    /// <summary>Creates a locker with the given display name and an empty record history.</summary>
    /// <param name="name">Display name used in <see cref="ToString"/>.</param>
    public Locker(string name)
    {
        this.name = name;
        pRecordList = new List<Locker>();
    }

    /// <summary>Appends a snapshot of the current state to <see cref="pRecordList"/>.</summary>
    public void PushRecord()
    {
        var snapshot = RetrieveRecord();
        lock (_recordGate)
        {
            pRecordList.Add(snapshot);
        }
    }

    /// <summary>Drops every snapshot stored in <see cref="pRecordList"/>.</summary>
    public void ClearRecord()
    {
        lock (_recordGate)
        {
            pRecordList.Clear();
        }
    }

    /// <summary>
    /// Builds a detached copy of the current diagnostic fields.
    /// The copy starts with its own, empty record history.
    /// </summary>
    /// <returns>A new <see cref="Locker"/> carrying the same field values.</returns>
    public Locker RetrieveRecord()
    {
        // The constructor already assigns the name, so only the remaining fields are copied.
        return new Locker(this.name)
        {
            tag = this.tag,
            threadID = this.threadID,
            lockedTimes = this.lockedTimes,
            lockingTs = this.lockingTs,
            enter = this.enter,
            exit = this.exit,
            lockingStackTrace = this.lockingStackTrace,
            lockingDLTs = this.lockingDLTs
        };
    }

    /// <summary>
    /// Formats every diagnostic field on its own line, followed by the recorded
    /// snapshots (if any). "LockingEt" is the elapsed time between acquisition and
    /// the moment the lock was classified as deadlocked.
    /// </summary>
    /// <returns>The multi-line, human readable description.</returns>
    public override string ToString()
    {
        // Format a snapshot so a concurrent PushRecord()/ClearRecord() cannot
        // invalidate the enumeration.
        Locker[] records;
        lock (_recordGate)
        {
            records = pRecordList.ToArray();
        }

        string[] strs = new string[]
        {
            "ThreadId:" + threadID,
            "Name:" + name,
            "LockedTimes:" + lockedTimes,
            "LockingTs:" + lockingTs,
            "LockingEt:" + (lockingDLTs - lockingTs) + ("(ms"),
            "Enter:" + enter,
            "Exit:" + exit,
            "Tag:" + (tag != null ? tag.ToString() : "null"),
            "LockingStackTrace:\n" + lockingStackTrace,
            (records.Length > 0 ? "LockedRecord:\n\t" + string.Join<Locker>("\t-record--------------", records) : "")
        };
        return string.Join("\r\n", strs);
    }
}
/// <summary>
/// Parameterless callback signature of <c>CDL.OnDLEvent</c>, the event that is raised
/// when a lock acquisition times out and is reported as a deadlock.
/// </summary>
public delegate void OnDL();
/// <summary>
/// Check Dead Lock (CDL): replaces <c>lock (obj) { ... }</c> with a timed
/// <see cref="Monitor.TryEnter(object, int)"/> and keeps bookkeeping about every
/// <see cref="Locker"/> that is held or waited for, so that a suspected deadlock
/// can be reported through <see cref="Dumps"/>.
/// author : Jave.Lin
/// date : 4/21/2018 5:31:57 PM
/// </summary>
public static class CDL
{
    /// <summary>Lockers that are currently held; only the keys matter, the value is always true.</summary>
    public static readonly Dictionary<Locker, bool> _s_pLockedMap = new Dictionary<Locker, bool>();

    /// <summary>
    /// For each locker, the state snapshots taken just before later acquisition attempts.
    /// The entry is removed when the locker is released.
    /// </summary>
    public static readonly Dictionary<Locker, List<Locker>> _s_pLockingMap = new Dictionary<Locker, List<Locker>>();

    /// <summary>When true a timed-out acquisition throws; when false the action is run without the lock.</summary>
    public const bool THROW_ER = true;

    /// <summary>
    /// Acquisition timeout in milliseconds. Tune as needed: on a heavily loaded production
    /// server a legitimate task may hold a lock for a long time, so a timeout is not
    /// necessarily a deadlock.
    /// </summary>
    public const int DEAD_LOCK_TIME_OUT = 3000;

    /// <summary>Raised when an acquisition times out, before the deadlock exception is thrown.</summary>
    public static event OnDL OnDLEvent;

    // Both maps are touched by threads holding *different* lockers, so they need one
    // common guard. It is a leaf lock: no user code runs and no locker is acquired under it.
    private static readonly object _s_pMapGate = new object();

    // Serialises the "set colour / write / restore colour" sequence across threads.
    private static readonly object _s_pConsoleGate = new object();

    // Caller must hold _s_pMapGate.
    private static void _PushToWaitQueue(Locker locker)
    {
        List<Locker> list;
        if (!_s_pLockingMap.TryGetValue(locker, out list))
        {
            list = new List<Locker>();
            _s_pLockingMap[locker] = list;
        }
        list.Add(locker.RetrieveRecord());
    }

    // Caller must hold _s_pMapGate.
    private static void _ClearFromLocking(Locker locker)
    {
        List<Locker> list;
        if (_s_pLockingMap.TryGetValue(locker, out list))
        {
            list.Clear();
            _s_pLockingMap.Remove(locker);
        }
    }

    // Records the locker's current state before we start waiting for it.
    private static void _BeforeEnter(Locker locker)
    {
        lock (_s_pMapGate)
        {
            if (locker.enter)
            {
                locker.PushRecord();
                _PushToWaitQueue(locker);
            }
        }
    }

    // Stamps the locker with the acquiring thread's details. Must be called directly
    // from CheckDL so the captured stack trace keeps its shape.
    private static void _Enter(Locker locker)
    {
        locker.enter = true;
        locker.exit = false;
        locker.lockingTs = Environment.TickCount;
        locker.threadID = Thread.CurrentThread.ManagedThreadId;
        locker.lockingStackTrace = GetCurStackTrace("->\n");
        Interlocked.Increment(ref locker.lockedTimes);
        lock (_s_pMapGate)
        {
            _s_pLockedMap[locker] = true;
        }
    }

    // Releases a lock that the current thread holds. For a nested (reentrant) acquisition
    // only the monitor count is decremented: the outer scope still owns the lock, so the
    // locker has to stay visible in Dumps().
    private static void _Exit(Locker locker, bool reentrant)
    {
        try
        {
            if (!reentrant)
            {
                locker.exit = true;
                lock (_s_pMapGate)
                {
                    _s_pLockedMap.Remove(locker);
                    _ClearFromLocking(locker);
                    locker.ClearRecord();
                }
            }
        }
        finally
        {
            // Always release, even if the bookkeeping above failed.
            Monitor.Exit(locker);
        }
    }

    // Caller must hold _s_pMapGate.
    private static string _GetWaitQueue(Locker locker)
    {
        List<Locker> list;
        if (_s_pLockingMap.TryGetValue(locker, out list))
        {
            return string.Join<Locker>("\n@@@@@", list);
        }
        return "";
    }

    // Handles a timed-out acquisition: stamps the locker, warns, and (when THROW_ER)
    // reports the deadlock by throwing.
    private static void _OnTimeout(Locker locker)
    {
        locker.lockingDLTs = Environment.TickCount;
        _WarningWriteLine("TryEnter time out");
        if (THROW_ER)
        {
            _ShowGetLockTimeout();
        }
    }

    /// <summary>
    /// Describes every currently held locker, longest "LockingEt" first, each followed
    /// by the snapshots recorded by threads that tried to acquire it.
    /// </summary>
    /// <returns>The formatted report; empty when no locker is held.</returns>
    public static string Dumps()
    {
        var contentList = new List<string>();
        lock (_s_pMapGate)
        {
            // Snapshot the sort key first: the fields keep changing while other threads
            // run, and an inconsistent comparer makes List.Sort throw.
            var entries = new List<KeyValuePair<Locker, int>>(_s_pLockedMap.Count);
            foreach (var item in _s_pLockedMap)
            {
                Locker held = item.Key;
                entries.Add(new KeyValuePair<Locker, int>(held, unchecked(held.lockingDLTs - held.lockingTs)));
            }
            // CompareTo instead of subtraction: the difference of two elapsed values can overflow int.
            entries.Sort((a, b) => b.Value.CompareTo(a.Value));
            foreach (var entry in entries)
            {
                contentList.Add(entry.Key.ToString() + "\n$$$$$$$$$$Before Locking WaitQueue$$$$$$$$\n" + _GetWaitQueue(entry.Key));
            }
        }
        return string.Join("\r\n=line============\r\n", contentList);
    }

    /// <summary>
    /// Runs <paramref name="actoin"/> while holding <paramref name="locker"/>, giving up
    /// after <see cref="DEAD_LOCK_TIME_OUT"/> milliseconds.
    /// </summary>
    /// <param name="locker">The lock object.</param>
    /// <param name="actoin">The critical section. Exceptions it throws are logged to the console and not rethrown.</param>
    /// <exception cref="ArgumentNullException">An argument is null.</exception>
    /// <exception cref="TimeoutException">The lock could not be acquired in time and <see cref="THROW_ER"/> is true.</exception>
    public static void CheckDL(Locker locker, Action actoin)
    {
        if (locker == null)
        {
            throw new ArgumentNullException(nameof(locker));
        }
        if (actoin == null)
        {
            throw new ArgumentNullException(nameof(actoin));
        }

        bool reentrant = Monitor.IsEntered(locker);
        _BeforeEnter(locker);
        if (!Monitor.TryEnter(locker, DEAD_LOCK_TIME_OUT))
        {
            // The lock is NOT held here, so nothing may be released on this path.
            _OnTimeout(locker);
            try
            {
                actoin.Invoke();
            }
            catch (Exception e)
            {
                _ErrorWriteLine(e.ToString());
            }
            return;
        }

        try
        {
            _Enter(locker);
            actoin.Invoke();
        }
        catch (Exception e)
        {
            _ErrorWriteLine(e.ToString());
        }
        finally
        {
            _Exit(locker, reentrant);
        }
    }

    /// <summary>
    /// Runs <paramref name="actoin"/> while holding <paramref name="locker"/> and returns
    /// its result, giving up after <see cref="DEAD_LOCK_TIME_OUT"/> milliseconds.
    /// </summary>
    /// <typeparam name="T">Result type of the critical section.</typeparam>
    /// <param name="locker">The lock object.</param>
    /// <param name="actoin">The critical section. Exceptions it throws are logged to the console and not rethrown.</param>
    /// <returns>The action's result, or <c>default(T)</c> when the action threw.</returns>
    /// <exception cref="ArgumentNullException">An argument is null.</exception>
    /// <exception cref="TimeoutException">The lock could not be acquired in time and <see cref="THROW_ER"/> is true.</exception>
    public static T CheckDL<T>(Locker locker, Func<T> actoin)
    {
        if (locker == null)
        {
            throw new ArgumentNullException(nameof(locker));
        }
        if (actoin == null)
        {
            throw new ArgumentNullException(nameof(actoin));
        }

        T ret = default(T);
        bool reentrant = Monitor.IsEntered(locker);
        _BeforeEnter(locker);
        if (!Monitor.TryEnter(locker, DEAD_LOCK_TIME_OUT))
        {
            // The lock is NOT held here, so nothing may be released on this path.
            _OnTimeout(locker);
            try
            {
                ret = actoin.Invoke();
            }
            catch (Exception e)
            {
                _ErrorWriteLine(e.ToString());
            }
            return ret;
        }

        try
        {
            _Enter(locker);
            ret = actoin.Invoke();
        }
        catch (Exception e)
        {
            _ErrorWriteLine(e.ToString());
        }
        finally
        {
            _Exit(locker, reentrant);
        }
        return ret;
    }

    /// <summary>
    /// Renders the current call stack as "Type::Method()" entries, outermost caller first,
    /// each entry indented by its depth. The frame of this method itself is skipped and
    /// the walk stops at the first frame without a known IL offset.
    /// </summary>
    /// <param name="separactor">Text placed between two entries.</param>
    /// <returns>The joined entries; empty when no frame is available.</returns>
    public static string GetCurStackTrace(string separactor = "->")
    {
        System.Diagnostics.StackFrame[] sfs = new System.Diagnostics.StackTrace().GetFrames();
        List<string> methodNameList = new List<string>();
        if (sfs != null)
        {
            for (int i = 1; i < sfs.Length; ++i)
            {
                if (System.Diagnostics.StackFrame.OFFSET_UNKNOWN == sfs[i].GetILOffset())
                {
                    break;
                }
                var m = sfs[i].GetMethod();
                if (m == null)
                {
                    continue;
                }
                // Global and dynamic methods have no declaring type.
                var dn = m.DeclaringType != null ? m.DeclaringType.Name : "<global>";
                methodNameList.Add(new string(' ', sfs.Length - i) + dn + "::" + m.Name + "()");
            }
        }
        methodNameList.Reverse();
        return string.Join(separactor, methodNameList);
    }

    // Notifies subscribers, logs, and reports the deadlock to the caller by throwing.
    private static void _ShowGetLockTimeout()
    {
        try
        {
            OnDLEvent?.Invoke();
        }
        catch (Exception e)
        {
            // A faulty subscriber must not hide the deadlock report itself.
            _ErrorWriteLine(e.ToString());
        }
        var msg = "!!!!!!!!!!!!!!!!!!DeadLock!!!!!!!!!!!!!!!!!!!!!!!!!!";
        _ErrorWriteLine(msg);
        throw new TimeoutException(msg);
    }

    private static void _ErrorWriteLine(string msg, params object[] args)
    {
        _WriteColored(ConsoleColor.Red, msg, args);
    }

    private static void _WarningWriteLine(string msg, params object[] args)
    {
        _WriteColored(ConsoleColor.Yellow, msg, args);
    }

    // Writes one line in the given colour and restores the previous colour afterwards.
    private static void _WriteColored(ConsoleColor color, string msg, object[] args)
    {
        lock (_s_pConsoleGate)
        {
            var srcColor = Console.ForegroundColor;
            Console.ForegroundColor = color;
            try
            {
                if (args == null || args.Length == 0)
                {
                    // Plain text: exception dumps may contain '{' or '}' and must not be
                    // parsed as a composite format string.
                    Console.WriteLine(msg);
                }
                else
                {
                    Console.WriteLine(msg, args);
                }
            }
            finally
            {
                Console.ForegroundColor = srcColor;
            }
        }
    }
}
//}
// 我们平常用的:
lock(obj)
{
// code here
}
改写成:
CDL.CheckDL(obj, ()=>
{
// code here
});
// 怎么方便改,是个问题
// 用到的lock少的话,手动一个个改吧
// 如果巨量的话,建议CTRL + SHIFT + H来批量替换Lock的代码吧(写个正则)
// 将:CDL的namespace去掉,这样就不用导namespace了。
一旦有死锁出现,那么将会命中CDL::_ShowGetLockTimeout方法
然后将CDL.Dumps()的内容打印出来,就可以知道,当前哪些CDL.CheckDL的地方有死锁。
Dumps很详细,具体还可以根据自己的需要来对 Locker的信息做调整。
Dumps信息中,辨别哪些是死锁状态的,看:LockingEt的值是多少就知道了
LockingEt是:Locking Elapsed Time的意思,获取锁多长时间了
LockingEt值只要大于零,且接近于:CDL.DEAD_LOCK_TIME_OUT的值,都基本上是死锁
明显原来的代码逻辑会发生改变
多了一些方法调用
特别是将原来的代码的位置,改变了,放到了一个lambda(其实在IL中是个匿名函数)
先不说结构上变化了,效率也会有丢丢影响的。
还有一个方法可以去除副作用,就是再写个工具,将编译出来的DLL,批量处理CDL.CheckDL的IL代码,改为原来的lock(obj)方式,当然,前提是先需要大量测试后再用这个工具处理,不然如果中途还是出现了死锁的话,定位问题还是会很头疼的。(制作这个工具,理论上是可以的,但需要对IL熟悉)
CDL.DEAD_LOCK_TIME_OUT = 3000; // 这个阈值按需调整,实际的线上产品服务器程序如果负载过大时,可能也会有部分任务处理过久,导致‘取锁’等待过久,所以出现Monitor.TryEnter timeout时,不一定是死锁。
你可以按你的需求来调整该值,如:调整个:60000(60秒),意思是你确定了,取锁时间超时为60秒的,都算是有死锁任务导致
ThreadId:10
Name:CBCServerAliveLocker
LockedTimes:1
LockingTs:7496815
LockingEt:-7496815(ms // =========辨别死锁=======负数的都不用看
Enter:True
Exit:False
Tag:null
LockingStackTrace:
Program::Main()->
XXXServer::WaitForExit()->
CDL::CheckDL()
=line============
ThreadId:15
Name:CTcpNetworker
LockedTimes:486
LockingTs:7519248
LockingEt:2995(ms // =辨别死锁=此乃死锁也,接近CDL.DEAD_LOCK_TIME_OUT(3000 MS)的值,由于CBattleRoom的Locker线程ID为6的锁超时而导致的
Enter:True
Exit:False
Tag:null
LockingStackTrace:
_IOCompletionCallback::PerformIOCompletionCallback()->
BaseOverlappedAsyncResult::CompletionPortCallback()->
LazyAsyncResult::ProtectedInvokeCallback()->
ContextAwareResult::Complete()->
ExecutionContext::Run()->
ExecutionContext::Run()->
ExecutionContext::RunInternal()->
ContextAwareResult::CompleteCallback()->
LazyAsyncResult::Complete()->
XXXNetworker::_OnBeginReceiveCallback()->
CDL::CheckDL()
=line============
ThreadId:15
Name:CEventMgr
LockedTimes:247
LockingTs:7519248
LockingEt:-7519248(ms // =========辨别死锁=======负数的都不用看
Enter:True
Exit:False
Tag:null
LockingStackTrace:
_IOCompletionCallback::PerformIOCompletionCallback()->
BaseOverlappedAsyncResult::CompletionPortCallback()->
LazyAsyncResult::ProtectedInvokeCallback()->
ContextAwareResult::Complete()->
ExecutionContext::Run()->
ExecutionContext::Run()->
ExecutionContext::RunInternal()->
ContextAwareResult::CompleteCallback()->
LazyAsyncResult::Complete()->
XXXNetworker::_OnBeginReceiveCallback()->
CDL::CheckDL()->
<>c__DisplayClass78_0::<_OnBeginReceiveCallback>b__0()->
XXXConnection::XXXNetworker_OnPackageEvent()->
CEventMgr::Invoke()->
CDL::CheckDL()
=line============
ThreadId:6
Name:CBattleRoomMgr
LockedTimes:689
LockingTs:7519248
LockingEt:-7519248(ms // =========辨别死锁=======负数的都不用看
Enter:True
Exit:False
Tag:null
LockingStackTrace:
_ThreadPoolWaitCallback::PerformWaitCallback()->
ThreadPoolWorkQueue::Dispatch()->
Task::System.Threading.IThreadPoolWorkItem.ExecuteWorkItem()->
Task::ExecuteEntry()->
Task::ExecuteWithThreadLocal()->
ExecutionContext::Run()->
ExecutionContext::RunInternal()->
Task::ExecutionContextCallback()->
Task::Execute()->
Task::InnerInvoke()->
XXXServer::b__36_0()->
CDL::CheckDL()->
XXXServer::b__36_1()->
XXXServer::_DeadLockMethod()->
XXXServerInst::_DeadLockMethod1()->
CDL::CheckDL()
=line============
ThreadId:6
Name:CBattleRoom
LockedTimes:802
LockingTs:7519248
LockingEt:2995(ms // ==============辨别死锁======此乃死锁也,接近CDL.DEAD_LOCK_TIME_OUT(3000 MS)的值
Enter:True
Exit:False
Tag:BroadcastAll3 starting
LockingStackTrace:
_ThreadPoolWaitCallback::PerformWaitCallback()->
ThreadPoolWorkQueue::Dispatch()->
Task::System.Threading.IThreadPoolWorkItem.ExecuteWorkItem()->
Task::ExecuteEntry()->
Task::ExecuteWithThreadLocal()->
ExecutionContext::Run()->
ExecutionContext::RunInternal()->
Task::ExecutionContextCallback()->
Task::Execute()->
Task::InnerInvoke()->
XXXServer::b__36_0()->
CDL::CheckDL()->
XXXServer::b__36_1()->
XXXServer::_DeadLockMethod1()->
XXXServerInst::_DeadLockMethod2()->
CDL::CheckDL()->
XXXServerInst::<_DeadLockMethod2>b__19_0()->
XXXServerInst1::_DeadLockMethod1()->
CDL::CheckDL()->
XXXServerInst1::<_DeadLockMethod1>b__43_0()->
XXXServerInst1::_DeadLockMethod2()->
CDL::CheckDL()
=line============
ThreadId:6
Name:CBCServerNormalLocker
LockedTimes:668
LockingTs:7519248
LockingEt:-7519248(ms // =========辨别死锁=======负数的都不用看
Enter:True
Exit:False
Tag:null
LockingStackTrace:
_ThreadPoolWaitCallback::PerformWaitCallback()->
ThreadPoolWorkQueue::Dispatch()->
Task::System.Threading.IThreadPoolWorkItem.ExecuteWorkItem()->
Task::ExecuteEntry()->
Task::ExecuteWithThreadLocal()->
ExecutionContext::Run()->
ExecutionContext::RunInternal()->
Task::ExecutionContextCallback()->
Task::Execute()->
Task::InnerInvoke()->
XXXServer::b__36_0()->
CDL::CheckDL()
以上为旧版本的Dumps信息
在新版本中,可以根据WaitQueue和LockedRecord,分析是由于之前哪些地方出现的死锁任务导致了后续的死锁
定位到哪里死锁之后,我们只要调整代码即可解决: