今天需要将多个Excel文档转换为XML格式,本打算用MS Office自带的另存为XML文件的功能,结果转换成MS Office2003 XML之后的文件就是一坨屎!Office 2007自带的XML文档转换的功能也TMD超级繁琐,根据帮助手册自己建了.xsd文件导入到Excel之后也无法导出XML数据,白白浪费了时间。
后来Google到了这篇文章。文中提供了现成的VBA源代码,稍微修改一下即可拿来使用(中文注释为本人所加):
'Attribute VB_Name = "XL_to_XML"
Sub MakeXML()
' create an XML file from an Excel table
Dim MyRow As Integer, MyCol As Integer, Temp As String, YesNo As Variant, DefFolder As String
Dim XMLFileName As String, XMLRecSetName As String, MyLF As String, RTC1 As Integer
Dim RangeOne As String, RangeTwo As String, Tt As String, FldName(99) As String
MyLF = Chr(10) & Chr(13) ' a line feed command
DefFolder = "C:\MyXML\" 'change this to the location of saved XML files
YesNo = MsgBox("This procedure requires the following data:" & MyLF _
& "1 A filename for the XML file" & MyLF _
& "2 A groupname for an XML record" & MyLF _
& "3 A cellrange containing fieldnames (col titles)" & MyLF _
& "4 A cellrange containing the data table" & MyLF _
& "Are you ready to proceed?", vbQuestion + vbYesNo, "MakeXML CiM")
If YesNo = vbNo Then
Debug.Print "User aborted with 'No'"
Exit Sub
End If
XMLFileName = FillSpaces(InputBox("1. Enter the name of the XML file:", "MakeXML CiM", "GRE_Core_Words"))
If Right(XMLFileName, 4) <> ".xml" Then
XMLFileName = XMLFileName & ".xml"
End If
XMLRecSetName = FillSpaces(InputBox("2. Enter an identifying name of a record:", "MakeXML CiM", "item"))
RangeOne = InputBox("3. Enter the range of cells containing the field names (or column titles):", "MakeXML CiM", "A2:D2")
If MyRng(RangeOne, 1) <> MyRng(RangeOne, 2) Then
MsgBox "Error: names must be on a single row" & MyLF & "Procedure STOPPED", vbOKOnly + vbCritical, "MakeXML CiM"
Exit Sub
End If
MyRow = MyRng(RangeOne, 1)
For MyCol = MyRng(RangeOne, 3) To MyRng(RangeOne, 4)
If Len(Cells(MyRow, MyCol).Value) = 0 Then
MsgBox "Error: names range contains blank cell" & MyLF & "Procedure STOPPED", vbOKOnly + vbCritical, "MakeXML CiM"
Exit Sub
End If
FldName(MyCol - MyRng(RangeOne, 3)) = FillSpaces(Cells(MyRow, MyCol).Value)
Next MyCol
RangeTwo = InputBox("4. Enter the range of cells containing the data table:", "MakeXML CiM", "A3:D6257")
If MyRng(RangeOne, 4) - MyRng(RangeOne, 3) <> MyRng(RangeTwo, 4) - MyRng(RangeTwo, 3) Then
MsgBox "Error: number of field names <> data columns" & MyLF & "Procedure STOPPED", vbOKOnly + vbCritical, "MakeXML CiM"
Exit Sub
End If
RTC1 = MyRng(RangeTwo, 3)
If InStr(1, XMLFileName, ":\") = 0 Then
XMLFileName = DefFolder & XMLFileName
End If
Open XMLFileName For Output As #1
Print #1, ""
Print #1, ""
For MyRow = MyRng(RangeTwo, 1) To MyRng(RangeTwo, 2)
Print #1, "<" & XMLRecSetName & ">"
For MyCol = RTC1 To MyRng(RangeTwo, 4)
' the next line uses the FormChk function to format dates and numbers
Print #1, "<" & FldName(MyCol - RTC1) & ">" & Cells(MyRow, MyCol).Value & "" & FldName(MyCol - RTC1) & ">"
' the next line does not apply any formatting
' Print #1, "<" & FldName(MyCol - RTC1) & ">" & RemoveAmpersands(Cells(MyRow, MyCol).Value) & "" & FldName(MyCol - RTC1) & ">"
Next MyCol
Print #1, "" & XMLRecSetName & ">"
Next MyRow
Print #1, " "
Close #1
MsgBox XMLFileName & " created." & MyLF & "Process finished", vbOKOnly + vbInformation, "MakeXML CiM"
Debug.Print XMLFileName & " saved"
End Sub
Function MyRng(MyRangeAsText As String, MyItem As Integer) As Integer
' analyse a range, where MyItem represents 1=TR, 2=BR, 3=LHC, 4=RHC
Dim UserRange As Range
Set UserRange = Range(MyRangeAsText)
Select Case MyItem
Case 1
MyRng = UserRange.Row
Case 2
MyRng = UserRange.Row + UserRange.Rows.Count - 1
Case 3
MyRng = UserRange.Column
Case 4
MyRng = UserRange.Columns(UserRange.Columns.Count).Column
End Select
Exit Function
End Function
Function FillSpaces(AnyStr As String) As String
' remove any spaces and replace with underscore character
Dim MyPos As Integer
MyPos = InStr(1, AnyStr, " ")
Do While MyPos > 0
Mid(AnyStr, MyPos, 1) = "_"
MyPos = InStr(1, AnyStr, " ")
Loop
FillSpaces = LCase(AnyStr)
End Function
Function FormChk(RowNum As Integer, ColNum As Integer) As String
' formats numeric and date cell values to comma 000's and DD MMM YY
FormChk = Cells(RowNum, ColNum).Value
If IsNumeric(Cells(RowNum, ColNum).Value) Then
FormChk = Format(Cells(RowNum, ColNum).Value, "#,##0 ;(#,##0)")
End If
If IsDate(Cells(RowNum, ColNum).Value) Then
FormChk = Format(Cells(RowNum, ColNum).Value, "dd mmm yy")
End If
End Function
Function RemoveAmpersands(AnyStr As String) As String
Dim MyPos As Integer
' replace Ampersands (&) with plus symbols (+)
MyPos = InStr(1, AnyStr, "&")
Do While MyPos > 0
Mid(AnyStr, MyPos, 1) = "+"
MyPos = InStr(1, AnyStr, "&")
Loop
RemoveAmpersands = AnyStr
End Function
如果不想将文件中的&替换为+,可以将第55行的RemoveAmpersands(FormChk(MyRow, MyCol))替换为Cells(MyRow, MyCol).Value。
需要注意的是在Excel文件中必须有一行是标题行——即改行存放各列的标题,否则生成的XML文件格式不一定是你所需要的。
由于我对XML文件的编码格式有要求,必须是UTF-8,而Windows默认的编码格式是GBXXXX,因此需要对生成的XML文件进行编码格式转换。搜索到了linux 查看文件编码以及修改编码一文,摘录重点如下:
查看文件编码
在Linux中查看文件编码可以通过以下几种方式:
1.在Vim中可以直接查看文件编码
:set fileencoding
即可显示文件编码格式。
如果你只是想查看其它编码格式的文件或者想解决用Vim查看文件乱码的问题,那么你可以在~/.vimrc 文件中添加以下内容:
set encoding=utf-8 fileencodings=ucs-bom,utf-8,cp936
这样,就可以让vim自动识别文件编码(可以自动识别UTF-8或者GBK编码的文件),其实就是依照fileencodings提供的编码列表尝试,如果没有找到合适的编码,就用latin-1(ASCII)编码打开。
2. enca (如果你的系统中没有安装这个命令,可以用sudo yum install -y enca 安装 )查看文件编码
$ enca filename
filename: Universal transformation format 8 bits; UTF-8
CRLF line terminators
需要说明一点的是,enca对某些GBK编码的文件识别的不是很好,识别时会出现:
Unrecognized encoding
文件编码转换
1.在Vim中直接进行转换文件编码,比如将一个文件转换成utf-8格式
:set fileencoding=utf-8
2. enconv 转换文件编码,比如要将一个GBK编码的文件转换成UTF-8编码,操作如下
enconv -L zh_CN -x UTF-8 filename
3. iconv 转换,iconv的命令格式如下:
iconv -f encoding -t encoding inputfile
比如将一个UTF-8 编码的文件转换成GBK编码
iconv -f GBK -t UTF-8 file1 -o file2