近期一直在处理客户服务器宕机问题。对于系统报出的MCE代码,或者客服收集的CPU底层寄存器,都要先人工排查哪些寄存器需要分析,然后把这些寄存器的值做16进制高低位转换,再转为二进制,最后对照Intel开发手册或者CPU DataSheet去解读信息,非常麻烦,费时费力,更不用说让客户的运维人员去查了。其实这一系列查询并不复杂,只是耗费时间。在这个AI时代,何不自己做一个工具,一键抓取有用的寄存器,并通过读取数据库里面的数据马上解读出来?说做就做,用EXCEL VBA就可以轻松实现,后期也可以方便地添加寄存器信息;而且客户使用也方便,不用安装任何APP,不需要记住Linux命令,点点鼠标就可以查看了。
下面是一种用带外工具收集到的CPU寄存器信息,各个厂商的格式差不多。CSR寄存器为32位,例如下面第一行末尾的 00 00 b8 00 即为寄存器的值:
(Bus:1 Dev:30 Fun:2 Reg:0xEC ) CPU0 MCA_ERR_SRC_LOG 57 01 00 40 00 00 b8 00
(Bus:0 Dev:5 Fun:2 Reg:0xA0 ) CPU0 viral 57 01 00 40 00 00 00 00
(Bus:0 Dev:5 Fun:2 Reg:0x1C4 ) CPU0 gferrst 57 01 00 40 00 00 00 00
(Bus:0 Dev:5 Fun:2 Reg:0x1CC ) CPU0 gsysst 57 01 00 40 00 00 00 00
(Bus:0 Dev:5 Fun:2 Reg:0x1DC ) CPU0 gfferrst 57 01 00 40 00 00 00 00
另外,MCE/MSR寄存器为64位,读取每行末尾8个字节的寄存器值,再对照Intel文档,就可以知道机器发生过什么问题。
CPU0_Proc14 IA32_MC2_STATUS 57 01 00 40 00 00 00 00 00 00 00 00
CPU0_Proc15 IA32_MC2_STATUS 57 01 00 40 00 00 00 00 00 00 00 00
CPU0_Proc16 IA32_MC2_STATUS 57 01 00 40 00 00 00 00 00 00 00 00
CPU0_Proc17 IA32_MC2_STATUS 57 01 00 40 00 00 00 00 00 00 00 00
系统本身也会报出MCE日志,格式如下,直接贴到EXCEL中就可以解读。
MC1_STATUS: 0xf200000000020151
一.双击底层寄存器解释
二.选择底层寄存器文件
解释CSR寄存器(范例定位PCIE错误)
解释MCE信息(范例定位内存Channel B出错)
维护CSR 表格数据,自己可以根据CPU换代,增加相应数据到CSR页面
上代码:
Function bin_d(content As String) As String
    ' Formatting hook for binary strings shown to the user.
    '
    ' Currently a pass-through: the bit string is returned unchanged.
    ' (An earlier attempt to insert a space every four bits was disabled
    ' by the author and is not active.)
    '
    ' content: bit string to present.
    ' Returns: the same string, unmodified.
    Dim shown As String

    shown = content
    bin_d = shown
End Function
Function mem_c(content As String) As String
    ' Memory channel decode: maps a 4-character bit pattern to a channel label.
    '
    ' content: bit pattern such as "0001".
    ' Returns: "[Channel A]" .. "[Channel H]" for "0000" .. "0111";
    '          an empty string for any other input.
    Dim patterns As Variant
    Dim labels As Variant
    Dim slot As Long

    patterns = Array("0000", "0001", "0010", "0011", "0100", "0101", "0110", "0111")
    labels = Array("[Channel A]", "[Channel B]", "[Channel C]", "[Channel D]", _
                   "[Channel E]", "[Channel F]", "[Channel G]", "[Channel H]")

    ' Both arrays share the same bounds, so one index walks them in parallel.
    For slot = LBound(patterns) To UBound(patterns)
        If StrComp(content, patterns(slot), vbTextCompare) = 0 Then
            mem_c = labels(slot)
            Exit Function
        End If
    Next slot
End Function
Function rrrr(content As String) As String
    ' RRRR field decode: maps a 4-character bit pattern to a request-type label.
    '
    ' content: bit pattern such as "0011".
    ' Returns: the bracketed label for "0000" .. "1000";
    '          an empty string for any other input.
    Select Case True
        Case StrComp(content, "0000", vbTextCompare) = 0
            rrrr = "[Generic error]"
        Case StrComp(content, "0001", vbTextCompare) = 0
            rrrr = "[Generic read]"
        Case StrComp(content, "0010", vbTextCompare) = 0
            rrrr = "[Generic write]"
        Case StrComp(content, "0011", vbTextCompare) = 0
            rrrr = "[Data read]"
        Case StrComp(content, "0100", vbTextCompare) = 0
            rrrr = "[Data write]"
        Case StrComp(content, "0101", vbTextCompare) = 0
            rrrr = "[Instruction fetch]"
        Case StrComp(content, "0110", vbTextCompare) = 0
            rrrr = "[Prefetch]"
        Case StrComp(content, "0111", vbTextCompare) = 0
            rrrr = "[Evict]"
        Case StrComp(content, "1000", vbTextCompare) = 0
            rrrr = "[Snoop (probe)]"
    End Select
End Function
Function ll(content As String) As String
    ' LL field decode: maps a 2-character bit pattern to a level label.
    '
    ' content: bit pattern "00", "01", "10" or "11".
    ' Returns: the bracketed label, or an empty string for any other input.
    Dim patterns As Variant
    Dim labels As Variant
    Dim slot As Long

    patterns = Array("00", "01", "10", "11")
    labels = Array("[Level 0]", "[level 1]", "[level 2]", "[Generic]")

    For slot = LBound(patterns) To UBound(patterns)
        If StrComp(content, patterns(slot), vbTextCompare) = 0 Then
            ll = labels(slot)
            Exit Function
        End If
    Next slot
End Function
Function tt(content As String) As String
    ' TT field decode: maps a 2-character bit pattern to a transaction-type label.
    '
    ' content: bit pattern "00", "01", "10" or "11".
    ' Returns: the bracketed label, or an empty string for any other input.
    Select Case True
        Case StrComp(content, "00", vbTextCompare) = 0
            tt = "[Instruction]"
        Case StrComp(content, "01", vbTextCompare) = 0
            tt = "[Data]"
        Case StrComp(content, "10", vbTextCompare) = 0
            tt = "[Generic]"
        Case StrComp(content, "11", vbTextCompare) = 0
            tt = "[Reserved]"
    End Select
End Function
Function pp(content As String) As String
    ' PP field decode: maps a 2-character bit pattern to a participation label.
    '
    ' content: bit pattern "00", "01", "10" or "11".
    ' Returns: the bracketed label, or an empty string for any other input.
    Dim patterns As Variant
    Dim labels As Variant
    Dim slot As Long

    patterns = Array("00", "01", "10", "11")
    labels = Array("[Local node originated the request.]", _
                   "[Local node responded to the request.]", _
                   "[Local node observed error as third-party.]", _
                   "[Generic]")

    For slot = LBound(patterns) To UBound(patterns)
        If StrComp(content, patterns(slot), vbTextCompare) = 0 Then
            pp = labels(slot)
            Exit Function
        End If
    Next slot
End Function
Function ii(content As String) As String
    ' II field decode: maps a 2-character bit pattern to an access-kind label.
    '
    ' content: bit pattern "00", "01", "10" or "11".
    ' Returns: the bracketed label, or an empty string for any other input.
    Select Case True
        Case StrComp(content, "00", vbTextCompare) = 0
            ii = "[Memory access]"
        Case StrComp(content, "01", vbTextCompare) = 0
            ii = "[Reserved]"
        Case StrComp(content, "10", vbTextCompare) = 0
            ii = "[I/O]"
        Case StrComp(content, "11", vbTextCompare) = 0
            ii = "[Other]"
    End Select
End Function
Function t(content As String) As String
    ' T field decode: maps a single bit character to a timeout label.
    '
    ' content: "0" or "1".
    ' Returns: the bracketed label, or an empty string for any other input.
    Select Case True
        Case StrComp(content, "0", vbTextCompare) = 0
            t = "[Request did not timeout.]"
        Case StrComp(content, "1", vbTextCompare) = 0
            t = "[Request did timeout.]"
    End Select
End Function
Function decode_c(title As String, content As String) As String
    ' Decode a CSR register value using the lookup table on the "CSR" worksheet.
    '
    ' Sheet layout as read by this function:
    '   - row 1 of a column holds the register name (matched as a substring);
    '   - rows 3..21 of that column hold bit positions, counted from the
    '     least-significant bit (0 = last character of the bit string);
    '   - the column immediately to the right holds the description of each bit.
    '
    ' title:   register name to look up (surrounding blanks are ignored).
    ' content: register value as a hex string; it is converted with Hex2Bin.
    ' Returns: the binary value followed by the descriptions of every listed
    '          bit that is set, or a "register not added yet" message when no
    '          header on the CSR sheet contains the title.
    '
    ' Fixes over the previous version:
    '   - the "not added" message written for earlier, non-matching columns no
    '     longer leaks into the result when the register IS found but none of
    '     its listed bits are set; the binary value is reported instead;
    '   - blank bit-position cells are skipped (they used to be read as bit 0),
    '     and non-numeric / out-of-range positions no longer raise an error;
    '   - the column scan honours a UsedRange that does not start at column A;
    '   - an empty title no longer matches the first non-empty header.
    Const NOT_FOUND_MSG As String = "寄存器数据暂未添加,请在CSR页面添加"
    Const FIRST_BIT_ROW As Long = 3
    Const LAST_BIT_ROW As Long = 21

    Dim csrSheet As Worksheet
    Dim wanted As String
    Dim header As Variant
    Dim col As Long
    Dim lastCol As Long
    Dim rowIdx As Long
    Dim bits As String
    Dim bitCell As Variant
    Dim bitValue As Double
    Dim bitPos As Long
    Dim notes As String

    decode_c = NOT_FOUND_MSG

    wanted = Trim(title)
    If Len(wanted) = 0 Then Exit Function

    Set csrSheet = Sheets("CSR")
    With csrSheet.UsedRange
        lastCol = .Column + .Columns.Count - 1
    End With

    For col = 1 To lastCol
        header = csrSheet.Cells(1, col).Value
        If Not IsError(header) Then
            If InStr(1, CStr(header), wanted) > 0 Then
                bits = Hex2Bin(content)

                For rowIdx = FIRST_BIT_ROW To LAST_BIT_ROW
                    bitCell = csrSheet.Cells(rowIdx, col).Value
                    If Not IsError(bitCell) Then
                        ' IsNumeric(Empty) is True, so blank cells need their own test.
                        If Not IsEmpty(bitCell) And IsNumeric(bitCell) Then
                            bitValue = CDbl(bitCell)
                            If bitValue >= 0 And bitValue < Len(bits) Then
                                bitPos = CLng(bitValue)
                                ' Bit N (LSB = 0) is the character N places from the right end.
                                If Mid(bits, Len(bits) - bitPos, 1) = "1" Then
                                    notes = notes & vbCrLf & csrSheet.Cells(rowIdx, col + 1).Value
                                End If
                            End If
                        End If
                    End If
                Next rowIdx

                decode_c = "二进制值为" & bin_d(bits) & vbCrLf & "解释为:" & notes
                Exit Function
            End If
        End If
    Next col
End Function
Function decode_m(title As String, content As String) As String
    ' Decode a machine-check STATUS register value.
    '
    ' title:   register name; decoding only happens when it contains "STATUS"
    '          (case-insensitive) somewhere after the first character.
    ' content: register value as a hex string, most-significant digit first.
    '          The positions used below assume 16 hex digits (64 bits)
    '          -- TODO confirm the caller always passes exactly 16 digits.
    ' Returns: a multi-line text: the binary value plus one description per set
    '          flag among the first seven bits (texts come from column A,
    '          rows 21..27 of the "MCE" worksheet), followed by the decoded
    '          error code; or "暂无数据" when the title is not a STATUS register.
    '
    ' Error-code layout: characters 53..64 of the bit string are taken as a
    ' 12-character code, and the position of its first "1" selects the class:
    '   0  -> no bit set            5 -> memory controller (MMM, CCCC)
    '   12 -> unclassified          4 -> cache hierarchy   (RRRR, TT, LL)
    '   11 -> microcode ROM / BINT# 2 -> internal timer / internal unclassified
    '   10 -> FRC / internal parity 1 -> bus & interconnect (PP, T, RRRR, II, LL)
    '   9  -> generic cache (LL)    8 -> TLB (TT, LL)
    '
    ' Fixes over the previous version:
    '   - bus/interconnect: the PP loop indexed with an undefined variable,
    '     which made Mid() raise "Invalid procedure call" and aborted decoding;
    '   - TLB: the TT field was appended to the LL buffer, so tt() received four
    '     characters and never produced a label;
    '   - internal timer vs. internal unclassified is now decided on all ten
    '     trailing bits instead of only the last one;
    '   - the unused reversed copy of the bit string is no longer built.
    Dim bits As String
    Dim flagIdx As Long
    Dim flagNotes As String
    Dim errCode As String
    Dim firstSet As Long
    Dim mmm As String

    If InStr(1, UCase(title), "STATUS", vbBinaryCompare) <= 1 Then
        decode_m = "暂无数据"
        Exit Function
    End If

    bits = Hex2Bin(content)

    ' The seven leading bits are individual flags described on the MCE sheet.
    For flagIdx = 1 To 7
        If Mid(bits, flagIdx, 1) = "1" Then
            flagNotes = flagNotes & vbCrLf & Sheets("MCE").Cells(flagIdx + 20, 1).Value
            decode_m = "二进制值为" & bin_d(bits) & vbCrLf & "解释为:" & flagNotes
        End If
    Next flagIdx

    ' Trailing 12 bits = error code; the first set bit selects the error class.
    errCode = Mid(bits, 53, 12)
    firstSet = InStr(1, errCode, "1", vbTextCompare)

    Select Case firstSet
        Case 0
            decode_m = decode_m & vbCrLf & "No error has been reported to this bank."

        Case 12
            decode_m = decode_m & vbCrLf & "Unclassified. This error has not been classified into the MCA error classes. The additional information section may have meaning."

        Case 11
            If Mid(errCode, 12, 1) = "0" Then
                decode_m = decode_m & vbCrLf & "Parity error in internal microcode ROM"
            Else
                decode_m = decode_m & vbCrLf & " The BINT# from another processor caused this processor to enter machine-check."
            End If

        Case 10
            If Mid(errCode, 12, 1) = "0" Then
                decode_m = decode_m & vbCrLf & "Functional redundancy check (FRC) master/slave error."
            Else
                decode_m = decode_m & vbCrLf & "Internal parity error."
            End If

        Case 9
            If Mid(errCode, 10, 1) = "1" Then
                decode_m = decode_m & vbCrLf & " Generic cache hierarchy errors."
            End If
            decode_m = decode_m & vbCrLf & ll(Mid(errCode, 11, 2))

        Case 8
            decode_m = decode_m & vbCrLf & " TLB errors."
            decode_m = decode_m & vbCrLf & ll(Mid(errCode, 11, 2))
            ' TT is read on its own (it used to be glued onto the LL characters).
            decode_m = decode_m & vbCrLf & tt(Mid(errCode, 9, 2))

        Case 5
            decode_m = decode_m & vbCrLf & "Memory controller errors (Intel-only)."
            mmm = Mid(errCode, 6, 3)
            If StrComp(mmm, "000", vbTextCompare) = 0 Then
                decode_m = decode_m & vbCrLf & "[ Generic undefined request]"
            ElseIf StrComp(mmm, "001", vbTextCompare) = 0 Then
                decode_m = decode_m & vbCrLf & " [ memory read error]"
            ElseIf StrComp(mmm, "010", vbTextCompare) = 0 Then
                decode_m = decode_m & vbCrLf & " [Memory write error.]"
            ElseIf StrComp(mmm, "011", vbTextCompare) = 0 Then
                decode_m = decode_m & vbCrLf & " [Address or command error.]"
            ElseIf StrComp(mmm, "100", vbTextCompare) = 0 Then
                decode_m = decode_m & vbCrLf & "[ Memory scrubbing error.]"
            End If
            decode_m = decode_m & vbCrLf & mem_c(Mid(errCode, 9, 4))

        Case 4
            decode_m = decode_m & vbCrLf & "Memory errors in the cache hierarchy."
            decode_m = decode_m & vbCrLf & rrrr(Mid(errCode, 5, 4))
            decode_m = decode_m & vbCrLf & ll(Mid(errCode, 11, 2))
            decode_m = decode_m & vbCrLf & tt(Mid(errCode, 9, 2))

        Case 2
            ' Timer error only when every bit after the class bit is zero;
            ' any other set bit makes it "internal unclassified".
            If InStr(3, errCode, "1", vbBinaryCompare) = 0 Then
                decode_m = decode_m & vbCrLf & "Internal timer error."
            Else
                decode_m = decode_m & vbCrLf & " Internal unclassified error. At least one x equals 1"
            End If

        Case 1
            decode_m = decode_m & vbCrLf & "Bus and interconnect errors."
            decode_m = decode_m & vbCrLf & pp(Mid(errCode, 2, 2))      ' PP
            decode_m = decode_m & vbCrLf & t(Mid(errCode, 4, 1))       ' T
            decode_m = decode_m & vbCrLf & rrrr(Mid(errCode, 5, 4))    ' RRRR
            decode_m = decode_m & vbCrLf & ii(Mid(errCode, 9, 2))      ' II
            decode_m = decode_m & vbCrLf & ll(Mid(errCode, 11, 2))     ' LL
    End Select
End Function
Function Hex2Bin(TP As String) As String
    ' Convert a string of hexadecimal digits to its binary representation.
    '
    ' Every character is translated on its own by Hex2BinSingleByte and the
    ' pieces are joined in input order.
    '
    ' TP:      hex text, e.g. "b8".
    ' Returns: the concatenated bit string, e.g. "10111000".
    '          An empty input now yields an empty string; the previous version
    '          raised "Subscript out of range" from ReDim (1 To 0).
    Dim idx As Long
    Dim digit As String
    Dim bits As String

    For idx = 1 To Len(TP)
        digit = Mid$(TP, idx, 1)
        bits = bits & Hex2BinSingleByte(digit)
    Next idx

    Hex2Bin = bits
End Function
Function Hex2BinSingleByte(H As String) As String
    ' Convert one hexadecimal digit to its 4-character binary form.
    '
    ' H:       a single hex digit, upper or lower case.
    ' Returns: "0000" .. "1111"; an empty string when H is not exactly one
    '          hex digit.
    '
    ' The argument is no longer modified: the previous version assigned
    ' UCase(H) back to H, which silently upper-cased the caller's variable
    ' because the parameter is passed by reference.
    Dim B As String

    Select Case UCase(H)
        Case "0": B = "0000"
        Case "1": B = "0001"
        Case "2": B = "0010"
        Case "3": B = "0011"
        Case "4": B = "0100"
        Case "5": B = "0101"
        Case "6": B = "0110"
        Case "7": B = "0111"
        Case "8": B = "1000"
        Case "9": B = "1001"
        Case "A": B = "1010"
        Case "B": B = "1011"
        Case "C": B = "1100"
        Case "D": B = "1101"
        Case "E": B = "1110"
        Case "F": B = "1111"
    End Select

    Hex2BinSingleByte = B
End Function
Private Sub Worksheet_SelectionChange(ByVal Target As Range)
    ' Decode the register on the selected row when a cell in column A is picked.
    '
    ' Behaviour for a single text cell in column A below the header row:
    '   - text ending in "P": the trailing marker is removed and A1 is selected
    '     (this un-marks a row that was decoded earlier);
    '   - otherwise the user is asked for confirmation; on OK the last
    '     blank-separated word of the cell is taken as the register name, the
    '     value is read from the cell to the right, the decoded text is written
    '     two cells to the right, and a "P" marker in the Wingdings 2 font is
    '     appended to the cell.
    ' Names containing "IA32" go to decode_m, everything else to decode_c.
    '
    ' NOTE(review): the marker scheme cannot tell a real trailing "P" in the
    ' text from the marker itself -- confirm no register name ends in "P".
    '
    ' Changes over the previous version:
    '   - multi-cell and non-text selections are rejected explicitly (they
    '     used to fall through only because an error was being swallowed);
    '   - the blanket "On Error Resume Next" is replaced by a handler that
    '     reports the failure in the result cell and leaves the row unmarked;
    '   - the last word is taken with UBound instead of an empty counting loop.
    Dim cellText As String
    Dim words() As String
    Dim regName As String
    Dim rawValue As String
    Dim decoded As String

    ' Only a single cell in column A, below the header row, is of interest.
    If Target.Areas.Count > 1 Then Exit Sub
    If Target.Rows.Count > 1 Or Target.Columns.Count > 1 Then Exit Sub
    If Target.Column <> 1 Or Target.Row <= 1 Then Exit Sub

    ' Empty cells, numbers, dates and error values are ignored.
    If VarType(Target.Value) <> vbString Then Exit Sub
    cellText = Target.Value
    If Len(Trim(cellText)) = 0 Then Exit Sub

    On Error GoTo DecodeFailed

    If Right(cellText, 1) = "P" Then
        ' Already decoded: strip the marker and move the selection away.
        Target.Value = Left(cellText, Len(cellText) - 1)
        ActiveSheet.Range("A1").Select
        Exit Sub
    End If

    If MsgBox("是否查询文档", vbOKCancel, "提示") <> vbOK Then Exit Sub

    ' The register name is the last blank-separated word of the cell.
    words = Split(Trim(cellText), " ")
    regName = Trim(words(UBound(words)))
    rawValue = CStr(Target.Offset(0, 1).Value)

    If InStr(1, regName, "IA32", vbTextCompare) > 0 Then
        decoded = decode_m(regName, rawValue)
    Else
        decoded = decode_c(regName, rawValue)
    End If
    Target.Offset(0, 2).Value = decoded

    ' Append the "decoded" marker and style just that one character.
    Target.Value = cellText & "P"
    With Target.Characters(Start:=Len(Target.Value), Length:=1).Font
        .Name = "Wingdings 2"                ' marker glyph font
        .Size = 20                           ' font size
        .Strikethrough = False
        .Superscript = False
        .Subscript = False
        .OutlineFont = False
        .Shadow = False
        .Underline = xlUnderlineStyleNone    ' no underline
        .ColorIndex = 23
        .Bold = True                         ' bold
    End With
    Exit Sub

DecodeFailed:
    ' Surface the problem instead of silently marking the row as decoded.
    Target.Offset(0, 2).Value = "解码失败: " & Err.Description
End Sub
MCE和CSR数值参考文档:
项目 |
参考文档 |
出处 |
MCE |
64-ia-32-architectures-software-developer-system-programming-manual-325384 |
https://software.intel.com/en-us/articles/intel-sdm |
CSR |
xeon-e5-v4-datasheet-vol-2 |
https://www.intel.cn/content/dam/www/public/us/en/documents/datasheets/xeon-e5-v4-datasheet-vol-2.pdf |