最近有个需求:使用C#操作Word文件,并获取其中的标题及其他所需引导词后面的内容。如下图,需要获取文件中的标题,以及引导词后面的内容
采用的是Open XML将Word文件转换成XML(Open XML只支持docx文件格式,如果不是docx格式的,可以先通过Word转换过来),可以获取每个段落中的所有文字和标签样式。已知标题的字体最大,根据标签样式属性,获取字体最大的那个段落即是标题,其他内容可以通过IndexOf查询关键词进行获取
先看看完成效果
代码如下(初学C#一周,如果有写的不好的地方,望大佬指正):
using System;
using System.Collections.Generic;
using System.Data;
using System.Linq;
using System.Text;
using System.Windows.Forms;
using System.Collections;
using System.Xml;
using DocumentFormat.OpenXml.Packaging;
定义
/// <summary>
/// Values extracted from a Word document: the detected title and the text found
/// after each guide word. A member stays null when nothing was found for it.
/// </summary>
public class objList
{
    /// <summary>Text of the paragraph selected as the document title.</summary>
    public string title { get; set; }

    /// <summary>Text found between the "标记1" and "号码" guide words.</summary>
    public string biaoji1 { get; set; }

    /// <summary>Text found after the "号码" guide word.</summary>
    public string number { get; set; }

    /// <summary>Text found after the "标记3" guide word.</summary>
    public string biaoji3 { get; set; }
}
/// <summary>
/// Record describing one paragraph of the document: where it is, its text and
/// XML, and an integer style value stored alongside it.
/// </summary>
public class textObjectAll
{
    /// <summary>Position of the paragraph in the list of paragraph nodes.</summary>
    public int index { get; set; }

    /// <summary>Inner XML markup of the paragraph node.</summary>
    public string innerXml { get; set; }

    /// <summary>Plain text of the paragraph node.</summary>
    public string innerText { get; set; }

    /// <summary>Integer style value recorded for the paragraph by the caller.</summary>
    public int indexof { get; set; }
}
/// <summary>
/// Title candidate: a paragraph together with the number parsed from its style
/// (a heading number or a w:sz font-size value), which is used to rank it.
/// </summary>
public class textObjectCh
{
    /// <summary>Position of the paragraph in the list of paragraph nodes.</summary>
    public int index { get; set; }

    /// <summary>Inner XML markup of the paragraph node.</summary>
    public string innerXml { get; set; }

    /// <summary>Plain text of the paragraph node.</summary>
    public string innerText { get; set; }

    /// <summary>Number parsed from the paragraph's style; the sort key for title selection.</summary>
    public int indexof { get; set; }
}
/// <summary>
/// Kinds of content that can be requested from a document. The member names are
/// passed on as strings (via ToString()), so their spelling is significant.
/// </summary>
public enum WordKind
{
    /// <summary>The document title.</summary>
    Title = 0,

    /// <summary>The text after the "标记1" guide word.</summary>
    biaoji1 = 1,

    /// <summary>The text after the "号码" guide word.</summary>
    number = 2,

    /// <summary>The text after the "标记3" guide word.</summary>
    biaoji3 = 3,
}
方法:(这是测试文件,所以定义的少,实际文件中会有很多引导词)
/// <summary>
/// Opens a .docx file read-only and extracts the document title together with
/// the text that follows the requested guide words.
/// </summary>
/// <remarks>
/// Only paragraphs with non-empty text are examined. A paragraph becomes a title
/// candidate in one of two ways:
/// it has a w:pStyle whose id is a two-character prefix followed by a number
/// (for example "标题1"), in which case that number is its rank; or it has no
/// w:pStyle but contains a w:sz element, in which case the w:sz value is its rank.
/// Heading numbers are ranked ascending (smallest wins) and font sizes descending
/// (largest wins).
/// </remarks>
/// <param name="docPath">Path of the .docx file to read.</param>
/// <param name="Kind">
/// Names of the values to extract: "biaoji1", "number" and/or "biaoji3". Any other
/// entry (including null) is ignored, and a null array is treated as empty. The
/// title is always looked up.
/// </param>
/// <returns>
/// An <see cref="objList"/> instance (typed as object). Members for which nothing
/// was found are left null.
/// </returns>
public static object GetContentFromWord(string docPath, string[] Kind)
{
    const string wordmlNamespace = "http://schemas.openxmlformats.org/wordprocessingml/2006/main";

    string[] requestedKinds = Kind ?? new string[0];
    string biaoji1 = "";
    string number = "";
    string biaoji3 = "";
    bool rankByHeadingNumber = false;
    List<textObjectCh> titleCandidates = new List<textObjectCh>();

    using (WordprocessingDocument wdDoc = WordprocessingDocument.Open(docPath, false))
    {
        NameTable nt = new NameTable();
        XmlNamespaceManager nsManager = new XmlNamespaceManager(nt);
        nsManager.AddNamespace("w", wordmlNamespace);
        XmlDocument xdoc = new XmlDocument(nt);

        // The part stream is disposable; close it as soon as the XML is loaded.
        using (var partStream = wdDoc.MainDocumentPart.GetStream())
        {
            xdoc.Load(partStream);
        }

        XmlNodeList paragraphNodes = xdoc.SelectNodes("//w:p", nsManager);
        for (int i = 0; i < paragraphNodes.Count; i++)
        {
            XmlNode paragraph = paragraphNodes[i];
            string paragraphText = paragraph.InnerText;
            if (string.IsNullOrEmpty(paragraphText))
            {
                continue;
            }

            // Guide-word extraction. The skip counts cover the guide word plus the
            // single separator character that follows it.
            foreach (string kind in requestedKinds)
            {
                string found;
                switch (kind)
                {
                    case "biaoji1":
                        found = TextBetweenGuideWords(paragraphText, "标记1", 4, "号码");
                        if (found != null)
                        {
                            biaoji1 = found;
                        }
                        break;
                    case "number":
                        found = TextAfterGuideWord(paragraphText, "号码", 3);
                        if (found != null)
                        {
                            number = found;
                        }
                        break;
                    case "biaoji3":
                        // Slice at the same guide word that is searched for.
                        found = TextAfterGuideWord(paragraphText, "标记3", 4);
                        if (found != null)
                        {
                            biaoji3 = found;
                        }
                        break;
                }
            }

            // Title detection. The rank is computed per paragraph, so a paragraph
            // without usable style information never inherits an earlier value.
            int rank;
            XmlNode styleNode = paragraph.SelectSingleNode(".//w:pStyle", nsManager);
            if (styleNode != null)
            {
                string styleId = ReadWordValAttribute(styleNode, wordmlNamespace);

                // Style ids of 3 to 6 characters are considered; the number is what
                // follows the two-character prefix.
                if (styleId.Length >= 3 && styleId.Length <= 6
                    && TryParseStyleNumber(styleId.Substring(2), out rank))
                {
                    rankByHeadingNumber = true;
                    titleCandidates.Add(CreateTitleCandidate(i, paragraph, rank));
                }
            }
            else
            {
                XmlNode sizeNode = paragraph.SelectSingleNode(".//w:sz", nsManager);
                if (sizeNode != null
                    && TryParseStyleNumber(ReadWordValAttribute(sizeNode, wordmlNamespace), out rank))
                {
                    rankByHeadingNumber = false;
                    titleCandidates.Add(CreateTitleCandidate(i, paragraph, rank));
                }
            }
        }
    }

    objList result = new objList();

    // NOTE(review): heading numbers and font sizes share one candidate list, and
    // the sort direction follows whichever kind matched last in the document.
    // Confirm the intended rule for documents that mix both conventions.
    if (titleCandidates.Count > 0)
    {
        textObjectCh best = rankByHeadingNumber
            ? titleCandidates.OrderBy(item => item.indexof).First()
            : titleCandidates.OrderByDescending(item => item.indexof).First();
        result.title = best.innerText;
    }

    if (biaoji1.Length > 0)
    {
        result.biaoji1 = biaoji1;
    }
    if (number.Length > 0)
    {
        result.number = number;
    }
    if (biaoji3.Length > 0)
    {
        result.biaoji3 = biaoji3;
    }
    return result;
}

/// <summary>Builds a title candidate for the given paragraph and rank.</summary>
private static textObjectCh CreateTitleCandidate(int paragraphIndex, XmlNode paragraph, int rank)
{
    textObjectCh candidate = new textObjectCh();
    candidate.index = paragraphIndex;
    candidate.innerText = paragraph.InnerText;
    candidate.innerXml = paragraph.InnerXml;
    candidate.indexof = rank;
    return candidate;
}

/// <summary>Returns the w:val attribute of a node, or an empty string when absent.</summary>
private static string ReadWordValAttribute(XmlNode node, string wordmlNamespace)
{
    XmlElement element = node as XmlElement;
    return element == null ? "" : element.GetAttribute("val", wordmlNamespace);
}

/// <summary>Parses an integer style value independently of the current culture.</summary>
private static bool TryParseStyleNumber(string text, out int value)
{
    return int.TryParse(
        text,
        System.Globalization.NumberStyles.Integer,
        System.Globalization.CultureInfo.InvariantCulture,
        out value);
}

/// <summary>
/// Returns the text that starts <paramref name="skip"/> characters after the start
/// of <paramref name="guideWord"/>, or null when the guide word is not present.
/// </summary>
private static string TextAfterGuideWord(string text, string guideWord, int skip)
{
    int guideIndex = text.IndexOf(guideWord, StringComparison.Ordinal);
    if (guideIndex < 0)
    {
        return null;
    }

    // Clamp so a guide word at the very end yields "" instead of throwing.
    int contentStart = Math.Min(guideIndex + skip, text.Length);
    return text.Substring(contentStart);
}

/// <summary>
/// Returns the text between <paramref name="startWord"/> (plus its separator) and
/// the next <paramref name="endWord"/>, or null when either is missing or the end
/// word does not come after the start word.
/// </summary>
private static string TextBetweenGuideWords(string text, string startWord, int skip, string endWord)
{
    int startIndex = text.IndexOf(startWord, StringComparison.Ordinal);
    if (startIndex < 0)
    {
        return null;
    }

    int contentStart = startIndex + skip;
    if (contentStart > text.Length)
    {
        return null;
    }

    int endIndex = text.IndexOf(endWord, contentStart, StringComparison.Ordinal);
    if (endIndex < 0)
    {
        return null;
    }
    return text.Substring(contentStart, endIndex - contentStart);
}
调用方法:
/// <summary>
/// Click handler: lets the user choose a .docx file, extracts its title and
/// guide-word values with GetContentFromWord, and shows them in the form's
/// FilePath, Title, biaoji1, number and biaoji3 controls.
/// </summary>
/// <param name="sender">The control that raised the event.</param>
/// <param name="e">Event data (unused).</param>
private void button1_Click(object sender, EventArgs e)
{
    string[] requestedKinds =
    {
        WordKind.Title.ToString(),
        WordKind.biaoji1.ToString(),
        WordKind.number.ToString(),
        WordKind.biaoji3.ToString(),
    };

    // OpenFileDialog is IDisposable; dispose it once the selection is handled.
    using (OpenFileDialog openFileDialog1 = new OpenFileDialog())
    {
        openFileDialog1.InitialDirectory = "c:\\";
        // Note: Open XML only supports the .docx format, so it is the only filter.
        openFileDialog1.Filter = "Word documents (*.docx)|*.docx";
        openFileDialog1.FilterIndex = 1;
        openFileDialog1.RestoreDirectory = true;

        if (openFileDialog1.ShowDialog() != DialogResult.OK)
        {
            return;
        }

        // Show the selected file path.
        this.FilePath.Text = openFileDialog1.FileName;

        objList result;
        try
        {
            result = (objList)GetContentFromWord(openFileDialog1.FileName, requestedKinds);
        }
        catch (System.IO.IOException ex)
        {
            // E.g. the file is locked by another program; report it, don't crash.
            MessageBox.Show(ex.Message);
            return;
        }
        catch (OpenXmlPackageException ex)
        {
            // The file is not a valid Open XML word-processing package.
            MessageBox.Show(ex.Message);
            return;
        }

        this.Title.Text = result.title;
        this.biaoji1.Text = result.biaoji1;
        this.number.Text = result.number;
        this.biaoji3.Text = result.biaoji3;
    }
}