最近项目中有一个需求:将多个Word文件中的表格内容导入到Excel中,以方便下一步的处理(各文件中表格的格式相同)。在网上查了很多资料,最终使用OpenXML SDK实现了,在此把源代码分享给大家。
主要参考文章 http://blog.darkthread.net/blogs/darkthreadtw/archive/2010/06/01/6454.aspx
关键代码:
一、将DOC格式文件转为DOCX:
因为OpenXML SDK只支持DOCX格式文件,因此首先要把DOC格式文件转为DOCX。
/// <summary>
/// Converts a legacy Word document (DOC) to DOCX through Word automation.
/// </summary>
/// <remarks>
/// The target file is written only when the opened document reports the
/// <c>wdFormatDocument</c> save format; for any other format nothing is saved.
/// The document and the Word application are closed even when opening or
/// saving throws, so a failed conversion does not leave a hidden Word
/// instance running.
/// </remarks>
/// <param name="pathSource">Path of the document to open.</param>
/// <param name="pathTarget">Path the converted document is saved to.</param>
public static void DocToDocx(string pathSource, string pathTarget)
{
    object missing = System.Reflection.Missing.Value;
    Word.Application wordApp = new Word.Application();
    Word.Document doc = null;
    try
    {
        wordApp.Visible = false;

        object path1 = pathSource;
        doc = wordApp.Documents.Open(ref path1,
            ref missing, ref missing, ref missing, ref missing, ref missing,
            ref missing, ref missing, ref missing, ref missing, ref missing,
            ref missing, ref missing, ref missing, ref missing, ref missing);

        // Only documents in the legacy binary format are re-saved.
        if (doc.SaveFormat == (int)Word.WdSaveFormat.wdFormatDocument)
        {
            object path2 = pathTarget;
            object fileType = Word.WdSaveFormat.wdFormatDocumentDefault;
            object compatibilityMode = Word.WdCompatibilityMode.wdWord2010;
            doc.SaveAs2(ref path2, ref fileType,
                ref missing, ref missing, ref missing, ref missing, ref missing,
                ref missing, ref missing, ref missing, ref missing, ref missing,
                ref missing, ref missing, ref missing, ref missing, ref compatibilityMode);
        }
    }
    finally
    {
        // Nested try/finally: Quit must still run if Close itself throws.
        try
        {
            if (doc != null)
            {
                doc.Close(ref missing, ref missing, ref missing);
            }
        }
        finally
        {
            wordApp.Quit(ref missing, ref missing, ref missing);
        }
    }
}
二、从DOCX文件中提取表格、行、单元格及内容
/// <summary>
/// Extension helpers for reading tables, rows, cells and cell text
/// from a WordprocessingML document body.
/// </summary>
public static class DocxTableExt
{
    /// <summary>Returns the <see cref="Table"/> child elements of the body as an array.</summary>
    /// <param name="body">Document body to read from.</param>
    /// <returns>The tables, in document order.</returns>
    public static Table[] GetTables(this Body body)
    {
        var tablesInBody = body.Elements<Table>();
        return tablesInBody.ToArray();
    }

    /// <summary>Returns the <see cref="TableRow"/> child elements of a table as an array.</summary>
    /// <param name="tbl">Table to read from.</param>
    /// <returns>The rows, in document order.</returns>
    public static TableRow[] GetTableRows(this Table tbl)
    {
        var rowsInTable = tbl.Elements<TableRow>();
        return rowsInTable.ToArray();
    }

    /// <summary>Returns the <see cref="TableCell"/> child elements of a row as an array.</summary>
    /// <param name="tr">Row to read from.</param>
    /// <returns>The cells, in document order.</returns>
    public static TableCell[] GetTableCells(this TableRow tr)
    {
        var cellsInRow = tr.Elements<TableCell>();
        return cellsInRow.ToArray();
    }

    /// <summary>
    /// Builds the text of a cell by joining the inner text of each of its
    /// <see cref="Paragraph"/> child elements with a line feed.
    /// </summary>
    /// <param name="td">Cell to read from.</param>
    /// <returns>The paragraph texts separated by "\n".</returns>
    public static string GetTableCellContent(this TableCell td)
    {
        var paragraphTexts =
            from paragraph in td.Elements<Paragraph>()
            select paragraph.InnerText;
        return string.Join("\n", paragraphTexts.ToArray());
    }
}
三、根据配置文件提取对应单元格数据放到DataRow中
/// <summary>
/// Reads the configured cells from the tables of a DOCX file into a new data row.
/// </summary>
/// <remarks>
/// Each entry of <c>xmlConfig.Import</c> addresses one cell by 1-based table,
/// row and column index; the text of a matching cell is stored in the row
/// column named by the entry's <c>Title</c>. Entries pointing at a position
/// that does not exist in the document are skipped. The row is created with
/// <c>dt.NewRow()</c> and is not added to <paramref name="dt"/> here.
/// </remarks>
/// <param name="dt">Data table the new row is created from.</param>
/// <param name="pathSource">Path of the DOCX file, opened read-only.</param>
/// <param name="xmlConfig">Configuration listing the cells to import.</param>
/// <returns>The new row holding the extracted cell texts.</returns>
public DataRow CreatRow(DataTable dt, string pathSource, XmlConfig xmlConfig)
{
    DataRow dr = dt.NewRow();
    using (WordprocessingDocument doc = WordprocessingDocument.Open(pathSource, false))
    {
        // Enumerate the body's tables once and reuse the array for every iteration.
        var tables = doc.MainDocumentPart.Document.Body.GetTables();
        for (int tableIndex = 0; tableIndex < tables.Length; tableIndex++)
        {
            var rows = tables[tableIndex].GetTableRows();
            for (int rowIndex = 0; rowIndex < rows.Length; rowIndex++)
            {
                var cells = rows[rowIndex].GetTableCells();
                for (int columnIndex = 0; columnIndex < cells.Length; columnIndex++)
                {
                    foreach (CellClass cell in xmlConfig.Import)
                    {
                        // Configured indices are 1-based; loop counters are 0-based.
                        if ((tableIndex == cell.TableIndex - 1) && (rowIndex == cell.RowIndex - 1) && (columnIndex == cell.ColumnIndex - 1))
                        {
                            dr[cell.Title] = cells[columnIndex].GetTableCellContent();
                        }
                    }
                }
            }
        }
    }
    return dr;
}
}
四、配置文件示例
<?xml version="1.0" encoding="utf-8"?>
<XmlConfig xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xmlns:xsd="http://www.w3.org/2001/XMLSchema">
  <Import>
    <Item Title="姓名" Table="1" Row="1" Column="2" />
    <Item Title="性别" Table="1" Row="2" Column="2" />
    <Item Title="单位" Table="1" Row="8" Column="4" />
    <Item Title="工作简历" Table="1" Row="9" Column="2" />
  </Import>
  <Export RowStart="1" ColumnStart="1" />
</XmlConfig>
说明:
<Item Title="姓名" Table="1" Row="1" Column="2" /> 表示将Word中第1个表格的第1行第2列处的数据提取到Excel中的“姓名”列(表格、行、列的序号均从1开始)
<Export RowStart="1" ColumnStart="1" /> 表示导出的Excel数据从第1行第1列开始
五、下载