C# 解决死锁

处理这个死锁问题,花了好几天,相信遇到的同学,一样头疼,但有个好辅助类的话(好在.net的API足够强大),就没这么头疼了

注意

本篇文章的解决方案只适合使用lock(obj),或是:Monitor.Enter(obj); …. Monitor.Exit(obj)的方式
类似酱紫的死锁

如果你使用的是:AutoResetEvent.Set/Reset, Monitor.Wait/Pulse, Mutex的方式,请另寻他法。

辅助类

//------------------------------------------------------------------------
// Created by Jave.Lin 4/21/2018 5:31:57 PM
//------------------------------------------------------------------------

using System;
using System.Collections.Generic;
using System.Threading;

//namespace Common.ComUtil
//{
    /// <summary>
    /// Locker Information: a named lock object that also carries diagnostic bookkeeping
    /// (who acquired it, when, from which call stack) and can print itself for a dump.
    /// author  :   Jave.Lin
    /// date    :   4/21/2018 5:31:57 PM
    /// </summary>
    public class Locker
    {
        public object tag; // attached context data, printed by ToString

        public int threadID; // ID of the thread that acquired the lock
        public string name; // name of the lock
        public int lockedTimes; // total number of acquisitions, useful to judge how likely a deadlock is
        public int lockingTs; // timestamp taken at the moment the lock was acquired
        public bool enter; // flag: the lock has been acquired
        public bool exit; // flag: the lock has been released
        public string lockingStackTrace; // call stack captured at the moment the lock was acquired

        public int lockingDLTs; // timestamp set when an acquisition was classified as a deadlock

        public List<Locker> pRecordList; // snapshots stored by PushRecord, printed as "LockedRecord"

        // Guards pRecordList: records are pushed, cleared and printed from different threads.
        private readonly object _pRecordGate = new object();

        /// <summary>Creates a locker with the given display name and an empty record list.</summary>
        /// <param name="name">Name printed in the "Name:" line of <see cref="ToString"/>.</param>
        public Locker(string name)
        {
            this.name = name;

            pRecordList = new List<Locker>();
        }

        /// <summary>Appends a snapshot of the current state (see <see cref="RetrieveRecord"/>) to the record list.</summary>
        public void PushRecord()
        {
            var record = RetrieveRecord();
            lock (_pRecordGate)
            {
                pRecordList.Add(record);
            }
        }

        /// <summary>Removes every stored snapshot.</summary>
        public void ClearRecord()
        {
            lock (_pRecordGate)
            {
                pRecordList.Clear();
            }
        }

        /// <summary>
        /// Builds a detached copy of this locker's scalar state.
        /// </summary>
        /// <returns>
        /// A new <see cref="Locker"/> with the same field values; its own record list starts empty
        /// (stored snapshots are not copied).
        /// </returns>
        public Locker RetrieveRecord()
        {
            // The constructor already copies the name.
            var ret = new Locker(this.name);
            ret.tag = tag;
            ret.threadID = threadID;
            ret.lockedTimes = lockedTimes;
            ret.lockingTs = lockingTs;
            ret.enter = enter;
            ret.exit = exit;
            ret.lockingStackTrace = lockingStackTrace;
            ret.lockingDLTs = lockingDLTs;
            return ret;
        }

        /// <summary>
        /// Formats the locker as one "Key:value" entry per line, followed by the stored snapshots (if any).
        /// </summary>
        /// <returns>The multi-line description, lines separated by "\r\n".</returns>
        public override string ToString()
        {
            // Print from a private copy so a concurrent PushRecord/ClearRecord cannot break the enumeration.
            Locker[] records;
            lock (_pRecordGate)
            {
                records = pRecordList.ToArray();
            }

            string[] strs = new string[]
            {
                "ThreadId:" + threadID,
                "Name:" + name,
                "LockedTimes:" + lockedTimes,
                "LockingTs:" + lockingTs,
                // Elapsed time between acquisition and deadlock classification; negative while
                // lockingDLTs has never been set.
                "LockingEt:" + (lockingDLTs - lockingTs) + "(ms",
                "Enter:" + enter,
                "Exit:" + exit,
                "Tag:" + (tag != null ? tag.ToString() : "null"),
                "LockingStackTrace:\n" + lockingStackTrace,
                (records.Length > 0 ? "LockedRecord:\n\t" + string.Join<Locker>("\t-record--------------", records) : "")
            };
            return string.Join("\r\n", strs);
        }
    }
    /// <summary>
    /// Parameterless callback type of the <c>CDL.OnDLEvent</c> event, which CDL raises from
    /// <c>_ShowGetLockTimeout</c> when acquiring a lock has timed out.
    /// </summary>
    public delegate void OnDL();
    /// <summary>
    /// Check Dead Lock (CDL): a drop-in replacement for <c>lock (obj) { ... }</c> that acquires the
    /// monitor with a timeout, records who holds which <see cref="Locker"/>, and reports a suspected
    /// deadlock (see <see cref="Dumps"/>) when the timeout expires.
    /// author  :   Jave.Lin
    /// date    :   4/21/2018 5:31:57 PM
    /// </summary>
    public static class CDL
    {
        // Lockers currently registered as held through CheckDL (the bool value is unused).
        public static readonly Dictionary<Locker, bool> _s_pLockedMap = new Dictionary<Locker, bool>();
        // Per locker: snapshots recorded by _BeforeEnter for callers that arrived after it had been entered.
        public static readonly Dictionary<Locker, List<Locker>> _s_pLockingMap = new Dictionary<Locker, List<Locker>>();

        public const bool THROW_ER = true;
        // Tune this threshold as needed: on a heavily loaded production server some tasks may simply
        // run long, so waiting this long for a lock is not necessarily a deadlock.
        public const int DEAD_LOCK_TIME_OUT = 3000;

        /// <summary>Raised when a lock acquisition timed out, before the timeout exception is thrown.</summary>
        public static event OnDL OnDLEvent;

        // Guards both dictionaries above; they are touched by every thread that goes through CheckDL.
        private static readonly object _s_pMapGate = new object();
        // Serializes colored console output so one thread cannot restore another thread's temporary color.
        private static readonly object _s_pConsoleGate = new object();

        /// <summary>Stores a snapshot of <paramref name="locker"/> in its wait queue.</summary>
        private static void _PushToWaitQueue(Locker locker)
        {
            var record = locker.RetrieveRecord();
            lock (_s_pMapGate)
            {
                List<Locker> list;
                if (!_s_pLockingMap.TryGetValue(locker, out list))
                {
                    list = new List<Locker>();
                    _s_pLockingMap[locker] = list;
                }
                list.Add(record);
            }
        }

        /// <summary>Drops the wait queue recorded for <paramref name="locker"/>, if any.</summary>
        private static void _ClearFromLocking(Locker locker)
        {
            lock (_s_pMapGate)
            {
                List<Locker> list;
                if (_s_pLockingMap.TryGetValue(locker, out list))
                {
                    list.Clear();
                    _s_pLockingMap.Remove(locker);
                }
            }
        }

        /// <summary>Records the locker's current state when it has already been entered before.</summary>
        private static void _BeforeEnter(Locker locker)
        {
            // NOTE(review): `enter` is never reset to false, so after the first acquisition this
            // records on every call, contended or not - confirm whether `enter && !exit` was intended.
            if (locker.enter)
            {
                locker.PushRecord();
                _PushToWaitQueue(locker);
            }
        }

        /// <summary>Stamps the locker with the acquiring thread's details and registers it as held.</summary>
        private static void _Enter(Locker locker)
        {
            locker.enter = true;
            locker.exit = false;
            locker.lockingTs = Environment.TickCount;
            locker.threadID = Thread.CurrentThread.ManagedThreadId;
            locker.lockingStackTrace = GetCurStackTrace("->\n");
            Interlocked.Increment(ref locker.lockedTimes);

            lock (_s_pMapGate)
            {
                _s_pLockedMap[locker] = true;
            }
        }

        /// <summary>
        /// Unregisters the locker and releases its monitor.
        /// </summary>
        /// <exception cref="SynchronizationLockException">
        /// The calling thread does not hold the monitor and <see cref="THROW_ER"/> is true.
        /// </exception>
        private static void _Exit(Locker locker)
        {
            if (!Monitor.IsEntered(locker))
            {
                // NOTE(review): kept from the original; it overwrites the acquisition timestamp
                // although this thread is not the owner - confirm the intent.
                locker.lockingTs = Environment.TickCount;
                var msg = "!Monitor.IsEntered(locker)";
                if (THROW_ER)
                {
                    throw new SynchronizationLockException(msg);
                }
                else
                {
                    _WarningWriteLine(msg);
                }
            }
            else
            {
                try
                {
                    // NOTE(review): Monitor is reentrant, so an inner CheckDL on the same locker
                    // unregisters it here while the outer call still holds the monitor.
                    locker.exit = true;
                    lock (_s_pMapGate)
                    {
                        _s_pLockedMap.Remove(locker);
                    }
                    _ClearFromLocking(locker);
                    locker.ClearRecord();
                }
                finally
                {
                    // The monitor must be released even if the bookkeeping above fails.
                    Monitor.Exit(locker);
                }
            }
        }

        /// <summary>Formats the wait queue of <paramref name="locker"/>, or "" when it has none.</summary>
        private static string _GetWaitQueue(Locker locker)
        {
            List<Locker> waiting;
            lock (_s_pMapGate)
            {
                List<Locker> list;
                if (!_s_pLockingMap.TryGetValue(locker, out list))
                {
                    return "";
                }
                // Copy under the gate, format outside of it.
                waiting = new List<Locker>(list);
            }
            return string.Join("\n@@@@@", waiting);
        }

        /// <summary>
        /// Describes every locker currently registered as held, longest "LockingEt"
        /// (lockingDLTs - lockingTs) first, each followed by its wait queue.
        /// </summary>
        /// <returns>The dump text; an empty string when no locker is registered.</returns>
        public static string Dumps()
        {
            Locker[] items;
            lock (_s_pMapGate)
            {
                items = new Locker[_s_pLockedMap.Count];
                _s_pLockedMap.Keys.CopyTo(items, 0);
            }

            // Capture each sort key once: the fields can change while we sort, and computing the
            // difference in long avoids the int overflow of a subtracting comparator.
            // The key is the negated elapsed time, so ascending order yields longest-first.
            var keys = new long[items.Length];
            for (int i = 0; i < items.Length; ++i)
            {
                keys[i] = (long)items[i].lockingTs - items[i].lockingDLTs;
            }
            Array.Sort(keys, items);

            var contentList = new List<string>(items.Length);
            foreach (var item in items)
            {
                contentList.Add(item.ToString() + "\n$$$$$$$$$$Before Locking WaitQueue$$$$$$$$\n" + _GetWaitQueue(item));
            }
            return string.Join("\r\n=line============\r\n", contentList);
        }

        /// <summary>
        /// Waits up to <see cref="DEAD_LOCK_TIME_OUT"/> ms for the locker's monitor.
        /// </summary>
        /// <returns>
        /// true when the monitor was acquired; false when it timed out and <see cref="THROW_ER"/> is false.
        /// </returns>
        /// <exception cref="ArgumentNullException"><paramref name="locker"/> is null.</exception>
        /// <exception cref="TimeoutException">The wait timed out and <see cref="THROW_ER"/> is true.</exception>
        private static bool _TryAcquire(Locker locker)
        {
            if (locker == null)
            {
                throw new ArgumentNullException(nameof(locker));
            }

            _BeforeEnter(locker);
            if (Monitor.TryEnter(locker, DEAD_LOCK_TIME_OUT))
            {
                return true;
            }

            locker.lockingDLTs = Environment.TickCount;
            _WarningWriteLine("TryEnter time out");
            if (THROW_ER)
            {
                _ShowGetLockTimeout();
            }
            return false;
        }

        /// <summary>
        /// Runs <paramref name="actoin"/> while holding <paramref name="locker"/>, like <c>lock (locker) { ... }</c>.
        /// </summary>
        /// <remarks>
        /// Exceptions thrown by the action are written to the console and swallowed. When
        /// <see cref="THROW_ER"/> is false and the wait times out, the action runs WITHOUT the lock.
        /// </remarks>
        /// <param name="locker">The lock object.</param>
        /// <param name="actoin">The protected code.</param>
        /// <exception cref="ArgumentNullException"><paramref name="locker"/> is null.</exception>
        /// <exception cref="TimeoutException">The lock was not obtained in time and <see cref="THROW_ER"/> is true.</exception>
        public static void CheckDL(Locker locker, Action actoin)
        {
            if (_TryAcquire(locker))
            {
                try
                {
                    _Enter(locker);

                    actoin.Invoke();
                }
                catch (Exception e)
                {
                    _ErrorWriteLine(e.ToString());
                }
                finally
                {
                    // Only release what was actually acquired; releasing after a timeout used to
                    // replace the deadlock report with "!Monitor.IsEntered(locker)".
                    _Exit(locker);
                }
            }
            else
            {
                try
                {
                    actoin.Invoke();
                }
                catch (Exception e)
                {
                    _ErrorWriteLine(e.ToString());
                }
            }
        }

        /// <summary>
        /// Runs <paramref name="actoin"/> while holding <paramref name="locker"/> and returns its result.
        /// </summary>
        /// <remarks>
        /// Exceptions thrown by the action are written to the console and swallowed, in which case
        /// default(T) is returned. When <see cref="THROW_ER"/> is false and the wait times out, the
        /// action runs WITHOUT the lock.
        /// </remarks>
        /// <typeparam name="T">Result type of the protected code.</typeparam>
        /// <param name="locker">The lock object.</param>
        /// <param name="actoin">The protected code.</param>
        /// <returns>The action's result, or default(T) when the action threw.</returns>
        /// <exception cref="ArgumentNullException"><paramref name="locker"/> is null.</exception>
        /// <exception cref="TimeoutException">The lock was not obtained in time and <see cref="THROW_ER"/> is true.</exception>
        public static T CheckDL<T>(Locker locker, Func<T> actoin)
        {
            T ret = default(T);
            if (_TryAcquire(locker))
            {
                try
                {
                    _Enter(locker);

                    ret = actoin.Invoke();
                }
                catch (Exception e)
                {
                    _ErrorWriteLine(e.ToString());
                }
                finally
                {
                    _Exit(locker);
                }
            }
            else
            {
                try
                {
                    // The result used to be discarded on this path.
                    ret = actoin.Invoke();
                }
                catch (Exception e)
                {
                    _ErrorWriteLine(e.ToString());
                }
            }
            return ret;
        }

        /// <summary>
        /// Renders the current call stack as "Type::Method()" entries, outermost caller first, each
        /// entry indented one space deeper than its caller.
        /// </summary>
        /// <param name="separactor">Text placed between two entries.</param>
        /// <returns>The formatted stack; empty when no frame is available.</returns>
        public static string GetCurStackTrace(string separactor = "->")
        {
            System.Diagnostics.StackFrame[] sfs = new System.Diagnostics.StackTrace().GetFrames();
            List<string> methodNameList = new List<string>();
            if (sfs != null)
            {
                // Frame 0 is this method itself.
                for (int i = 1; i < sfs.Length; ++i)
                {
                    if (System.Diagnostics.StackFrame.OFFSET_UNKNOWN == sfs[i].GetILOffset()) break;
                    var m = sfs[i].GetMethod();
                    // GetMethod() and DeclaringType can both be null (e.g. dynamic or global methods).
                    var dn = (m != null && m.DeclaringType != null) ? m.DeclaringType.Name : "<unknown>";
                    var mn = m != null ? m.Name : "<unknown>";
                    methodNameList.Add(new string(' ', sfs.Length - i) + dn + "::" + mn + "()");
                }
            }
            methodNameList.Reverse();

            return string.Join(separactor, methodNameList);
        }

        /// <summary>Notifies <see cref="OnDLEvent"/> subscribers, logs the deadlock banner and throws.</summary>
        /// <exception cref="TimeoutException">Always.</exception>
        private static void _ShowGetLockTimeout()
        {
            OnDLEvent?.Invoke();
            var msg = "!!!!!!!!!!!!!!!!!!DeadLock!!!!!!!!!!!!!!!!!!!!!!!!!!";
            _ErrorWriteLine(msg);
            throw new TimeoutException(msg);
        }

        /// <summary>Writes a line to the console in red.</summary>
        private static void _ErrorWriteLine(string msg, params object[] args)
        {
            _WriteColored(ConsoleColor.Red, msg, args);
        }

        /// <summary>Writes a line to the console in yellow.</summary>
        private static void _WarningWriteLine(string msg, params object[] args)
        {
            _WriteColored(ConsoleColor.Yellow, msg, args);
        }

        /// <summary>Writes one line in the given color and restores the previous color afterwards.</summary>
        private static void _WriteColored(ConsoleColor color, string msg, object[] args)
        {
            lock (_s_pConsoleGate)
            {
                var srcColor = Console.ForegroundColor;
                Console.ForegroundColor = color;
                try
                {
                    if (args == null || args.Length == 0)
                    {
                        // No arguments: print verbatim. Exception text and stack traces may contain
                        // '{' / '}' and must not be parsed as a composite format string.
                        Console.WriteLine(msg);
                    }
                    else
                    {
                        Console.WriteLine(msg, args);
                    }
                }
                finally
                {
                    Console.ForegroundColor = srcColor;
                }
            }
        }
    }
//}

使用方法要点、注意副作用、处理副作用

使用方法要点

// 我们平常用的:
lock(obj)
{
 // code here
}

改写成:
CDL.CheckDL(obj, ()=>
{
 // code here
});

// 怎么方便改,是个问题
// 用到的lock少的话,手动一个个改吧
// 如果巨量的话,建议CTRL + SHIFT + H来批量替换Lock的代码吧(写个正则)
// 将:CDL的namespace去掉,这样就不用导namespace了。

一旦有死锁出现,那么将会命中CDL::_ShowGetLockTimeout方法
然后将CDL.Dumps()的内容打印出来,就可以知道,当前哪些CDL.CheckDL的地方有死锁。
Dumps很详细,具体还可以根据自己的需要来对 Locker的信息做调整。

从dumps信息中辨别死锁

Dumps信息中,辨别哪些是死锁状态的,看:LockingEt的值是多少就知道了
LockingEt是:Locking Elapsed Time的意思,获取锁多长时间了
LockingEt值只要大于零,且接近于:CDL.DEAD_LOCK_TIME_OUT的值,都基本上是死锁

注意副作用

明显原来的代码逻辑会发生改变
多了一些方法调用
特别是将原来的代码的位置,改变了,放到了一个lambda(其实在IL中是个匿名函数)
先不说结构上变化了,效率也会有丢丢影响的。

处理副作用

还有一个方法可以去除副作用,就是再写个工具,将编译出来的DLL,批量处理CDL.CheckDL的IL代码,改为原来的lock(obj)方式,当然,前提是先需要大量测试后再用这个工具处理,不然如果中途还是出现了死锁的话,定位问题还是会很头疼的。(制作这个工具,理论上是可以的,但需要对IL熟悉)

注意CDL.DEAD_LOCK_TIME_OUT

CDL.DEAD_LOCK_TIME_OUT = 3000; // 这个阈值按需调整,实际的线上产品服务器程序如果负载过大时,可能也会有部分任务处理过久,导致‘取锁’等待过久,所以出现Monitor.TryEnter timeout时,不一定是死锁。
你可以按你的需求来调整该值,如:调整为:60000(60秒),意思是你确定了,取锁时间超时达到60秒的,都算是由死锁任务导致

CDL.Dumps内容实例

ThreadId:10
Name:CBCServerAliveLocker
LockedTimes:1
LockingTs:7496815
LockingEt:-7496815(ms // =========辨别死锁=======负数的都不用看
Enter:True
Exit:False
Tag:null
LockingStackTrace:
         Program::Main()->
          XXXServer::WaitForExit()->
           CDL::CheckDL()
=line============
ThreadId:15
Name:CTcpNetworker
LockedTimes:486
LockingTs:7519248
LockingEt:2995(ms // =辨别死锁=此乃死锁也,接近CDL.DEAD_LOCK_TIME_OUT(3000 MS)的值,由于CBattleRoom的Locker线程ID为6的锁超时而导致的
Enter:True
Exit:False
Tag:null
LockingStackTrace:
 _IOCompletionCallback::PerformIOCompletionCallback()->
  BaseOverlappedAsyncResult::CompletionPortCallback()->
   LazyAsyncResult::ProtectedInvokeCallback()->
    ContextAwareResult::Complete()->
     ExecutionContext::Run()->
      ExecutionContext::Run()->
       ExecutionContext::RunInternal()->
        ContextAwareResult::CompleteCallback()->
         LazyAsyncResult::Complete()->
          XXXNetworker::_OnBeginReceiveCallback()->
           CDL::CheckDL()
=line============
ThreadId:15
Name:CEventMgr
LockedTimes:247
LockingTs:7519248
LockingEt:-7519248(ms // =========辨别死锁=======负数的都不用看
Enter:True
Exit:False
Tag:null
LockingStackTrace:
 _IOCompletionCallback::PerformIOCompletionCallback()->
  BaseOverlappedAsyncResult::CompletionPortCallback()->
   LazyAsyncResult::ProtectedInvokeCallback()->
    ContextAwareResult::Complete()->
     ExecutionContext::Run()->
      ExecutionContext::Run()->
       ExecutionContext::RunInternal()->
        ContextAwareResult::CompleteCallback()->
         LazyAsyncResult::Complete()->
          XXXNetworker::_OnBeginReceiveCallback()->
           CDL::CheckDL()->
            <>c__DisplayClass78_0::<_OnBeginReceiveCallback>b__0()->
             XXXConnection::XXXNetworker_OnPackageEvent()->
              CEventMgr::Invoke()->
               CDL::CheckDL()
=line============
ThreadId:6
Name:CBattleRoomMgr
LockedTimes:689
LockingTs:7519248
LockingEt:-7519248(ms // =========辨别死锁=======负数的都不用看
Enter:True
Exit:False
Tag:null
LockingStackTrace:
 _ThreadPoolWaitCallback::PerformWaitCallback()->
  ThreadPoolWorkQueue::Dispatch()->
   Task::System.Threading.IThreadPoolWorkItem.ExecuteWorkItem()->
    Task::ExecuteEntry()->
     Task::ExecuteWithThreadLocal()->
      ExecutionContext::Run()->
       ExecutionContext::RunInternal()->
        Task::ExecutionContextCallback()->
         Task::Execute()->
          Task::InnerInvoke()->
           XXXServer::b__36_0()->
            CDL::CheckDL()->
             XXXServer::b__36_1()->
              XXXServer::_DeadLockMethod()->
               XXXServerInst::_DeadLockMethod1()->
                CDL::CheckDL()
=line============
ThreadId:6
Name:CBattleRoom
LockedTimes:802
LockingTs:7519248
LockingEt:2995(ms // ==============辨别死锁======此乃死锁也,接近CDL.DEAD_LOCK_TIME_OUT(3000 MS)的值
Enter:True
Exit:False
Tag:BroadcastAll3 starting
LockingStackTrace:
 _ThreadPoolWaitCallback::PerformWaitCallback()->
  ThreadPoolWorkQueue::Dispatch()->
   Task::System.Threading.IThreadPoolWorkItem.ExecuteWorkItem()->
    Task::ExecuteEntry()->
     Task::ExecuteWithThreadLocal()->
      ExecutionContext::Run()->
       ExecutionContext::RunInternal()->
        Task::ExecutionContextCallback()->
         Task::Execute()->
          Task::InnerInvoke()->
           XXXServer::b__36_0()->
            CDL::CheckDL()->
             XXXServer::b__36_1()->
              XXXServer::_DeadLockMethod1()->
               XXXServerInst::_DeadLockMethod2()->
                CDL::CheckDL()->
                 XXXServerInst::<_DeadLockMethod2>b__19_0()->
                  XXXServerInst1::_DeadLockMethod1()->
                   CDL::CheckDL()->
                    XXXServerInst1::<_DeadLockMethod1>b__43_0()->
                     XXXServerInst1::_DeadLockMethod2()->
                      CDL::CheckDL()
=line============
ThreadId:6
Name:CBCServerNormalLocker
LockedTimes:668
LockingTs:7519248
LockingEt:-7519248(ms // =========辨别死锁=======负数的都不用看
Enter:True
Exit:False
Tag:null
LockingStackTrace:
 _ThreadPoolWaitCallback::PerformWaitCallback()->
  ThreadPoolWorkQueue::Dispatch()->
   Task::System.Threading.IThreadPoolWorkItem.ExecuteWorkItem()->
    Task::ExecuteEntry()->
     Task::ExecuteWithThreadLocal()->
      ExecutionContext::Run()->
       ExecutionContext::RunInternal()->
        Task::ExecutionContextCallback()->
         Task::Execute()->
          Task::InnerInvoke()->
           XXXServer::b__36_0()->
            CDL::CheckDL()

以上为旧版本的Dumps信息
在新版本中,可以根据WaitQueue,还有LockedRecord,便于分析是由于之前哪些地方出现死锁任务而导致后续死锁

定位死锁后,调整代码思路

定位到哪里死锁之后,我们只要调整代码即可解决:

  • 解除死锁闭环嵌套问题:考虑锁对象的更换(另建一个Locker、实在不能换Locker,就延迟处理:如比较典型的是队列处理(任务的进、出、轮询遍历任务都是同一个锁))
  • 要不要加锁(有没必要加)
  • 检测到死锁可对Locker扩展(判断如果Locker被锁了,添加AutoResetEvent.WaitOne等待,每个Locker的Exit时,调用一下AutoResetEvent.Set去唤醒之前WaitOne的Locker)

你可能感兴趣的:(C#,服务端)