使用 Open XML 获取 Word 文件标题及所需的关键字

最近有个需求:使用 C# 操作 Word 文件,获取其中的标题以及指定引导词后面的内容。如下图所示,需要获取文件中的标题和各引导词后面的内容。
open XML获取world文件标题及所需的关键字_第1张图片
采用的是 Open XML,把 Word 文件读取为 XML(Open XML 只支持 docx 文件格式;如果不是 docx 格式,可以先用 Word 转换成 docx)。这样可以获取每个段落中的所有文字和标签样式。已知标题的字体最大,因此根据标签样式属性,样式值最大的那个段落即是标题;其他内容可以通过 IndexOf 查找关键词来获取。
先看看完成效果
open XML获取world文件标题及所需的关键字_第2张图片
代码如下(初学C#一周,如果有写的不好的地方,望大佬指正):

所需插件:
open XML获取world文件标题及所需的关键字_第3张图片
引入

using System;
using System.Collections.Generic;
using System.Data;
using System.Linq;
using System.Text;
using System.Windows.Forms;
using System.Collections;
using System.Xml;
using DocumentFormat.OpenXml.Packaging;

定义

 // Result holder produced by GetContentFromWord and read by button1_Click.
 // All four members are plain public fields and stay null until assigned.
 public class objList
 {
     // title   : text of the paragraph selected as the document title
     // biaoji1 : value stored for the "biaoji1" kind
     // number  : value stored for the "number" kind
     // biaoji3 : value stored for the "biaoji3" kind
     public string title, biaoji1, number, biaoji3;
 }
        // Snapshot of one non-empty paragraph as recorded by GetContentFromWord.
        // Field layout is the same as textObjectCh.
        public class textObjectAll
        {
            // Position of the paragraph in the selected paragraph node list.
            public int index;

            // Raw inner XML and plain inner text of the paragraph node.
            public string innerXml, innerText;

            // Integer most recently parsed from a style value when the snapshot was taken.
            public int indexof;
        }
        // Title candidate: a paragraph for which GetContentFromWord parsed an integer
        // out of its style value. Candidates are later ordered by indexof.
        public class textObjectCh
        {
            // Position of the paragraph in the selected paragraph node list.
            public int index;

            // Raw inner XML and plain inner text of the paragraph node.
            public string innerXml, innerText;

            // The integer parsed from the paragraph's style value (sort key).
            public int indexof;
        }
        // Kinds of content that can be requested from GetContentFromWord.
        // button1_Click passes each member's ToString() in the Kind array, and
        // GetContentFromWord switches on the literal names "biaoji1", "number" and
        // "biaoji3", so the member names must stay spelled exactly as they are.
        public enum WordKind
        {
            Title = 0,
            biaoji1 = 1,
            number = 2,
            biaoji3 = 3
        }

方法:(这是测试文件,所以定义的少,实际文件中会有很多引导词)

/// <summary>
/// Reads a .docx file and extracts the title paragraph plus the text that follows
/// the guide words requested in <paramref name="Kind"/>.
/// </summary>
/// <remarks>
/// Title selection looks at every non-empty paragraph:
/// paragraphs carrying a w:pStyle whose id yields a heading level compete on the
/// lowest level; paragraphs without a w:pStyle compete on the largest w:sz value.
/// A heading-style match wins over a font-size match, and on ties the earliest
/// paragraph wins. Heading levels and font sizes are tracked separately so the two
/// unrelated scales are never compared with each other.
/// Guide-word values are overwritten by later paragraphs that also match.
/// </remarks>
/// <param name="docPath">Path of the .docx file; the document is opened read-only.</param>
/// <param name="Kind">
/// Names of the values to extract. Recognised entries are "biaoji1", "number" and
/// "biaoji3"; any other entry (including null) is ignored. The title is always extracted.
/// </param>
/// <returns>
/// An <c>objList</c> (returned as object). A field is left null when nothing
/// non-empty was found for it; <c>title</c> is null when no paragraph qualified.
/// </returns>
/// <exception cref="ArgumentNullException">
/// <paramref name="docPath"/> or <paramref name="Kind"/> is null.
/// </exception>
public static object GetContentFromWord(string docPath, string[] Kind)
{
    const string wordmlNamespace = "http://schemas.openxmlformats.org/wordprocessingml/2006/main";
    const string labelBiaoji1 = "标记1";
    const string labelNumber = "号码";
    const string labelBiaoji3 = "标记3";

    if (docPath == null)
    {
        throw new ArgumentNullException("docPath");
    }
    if (Kind == null)
    {
        throw new ArgumentNullException("Kind");
    }

    string biaoji1 = "";
    string number = "";
    string biaoji3 = "";

    // Best title candidates seen so far, one per detection strategy.
    string headingTitle = null;
    int lowestHeadingLevel = 0;
    string largestFontTitle = null;
    int largestFontSize = 0;

    using (WordprocessingDocument wdDoc = WordprocessingDocument.Open(docPath, false))
    {
        NameTable nt = new NameTable();
        XmlNamespaceManager nsManager = new XmlNamespaceManager(nt);
        nsManager.AddNamespace("w", wordmlNamespace);

        XmlDocument xdoc = new XmlDocument(nt);
        // Dispose the part stream as soon as the XML has been loaded.
        using (var partStream = wdDoc.MainDocumentPart.GetStream())
        {
            xdoc.Load(partStream);
        }

        XmlNodeList paragraphNodes = xdoc.SelectNodes("//w:p ", nsManager);
        foreach (XmlNode paragraph in paragraphNodes)
        {
            string paragraphText = paragraph.InnerText;
            if (string.IsNullOrEmpty(paragraphText))
            {
                continue;
            }

            for (int k = 0; k < Kind.Length; k++)
            {
                string extracted;
                switch (Kind[k])
                {
                    case "biaoji1":
                        // Value sits between the "标记1" label and the following "号码" label.
                        extracted = ExtractAfterGuideWord(paragraphText, labelBiaoji1, labelNumber);
                        if (extracted != null)
                        {
                            biaoji1 = extracted;
                        }
                        break;

                    case "number":
                        extracted = ExtractAfterGuideWord(paragraphText, labelNumber, null);
                        if (extracted != null)
                        {
                            number = extracted;
                        }
                        break;

                    case "biaoji3":
                        // Cut after the same label that was searched for
                        // (the previous code searched "标记3" but cut at "DOI").
                        extracted = ExtractAfterGuideWord(paragraphText, labelBiaoji3, null);
                        if (extracted != null)
                        {
                            biaoji3 = extracted;
                        }
                        break;
                }
            }

            XmlNode styleNode = paragraph.SelectSingleNode(".//w:pStyle", nsManager);
            if (styleNode != null)
            {
                // A paragraph with a named style is judged by that style only;
                // its w:sz (if any) is deliberately not consulted.
                string styleId = ((XmlElement)styleNode).GetAttribute("val", wordmlNamespace);
                int level;
                if (TryParseHeadingLevel(styleId, out level)
                    && (headingTitle == null || level < lowestHeadingLevel))
                {
                    lowestHeadingLevel = level;
                    headingTitle = paragraphText;
                }
            }
            else
            {
                XmlNode sizeNode = paragraph.SelectSingleNode(".//w:sz", nsManager);
                if (sizeNode != null)
                {
                    // Only the first w:sz of the paragraph is considered.
                    string sizeValue = ((XmlElement)sizeNode).GetAttribute("val", wordmlNamespace);
                    int fontSize;
                    if (int.TryParse(sizeValue, out fontSize)
                        && (largestFontTitle == null || fontSize > largestFontSize))
                    {
                        largestFontSize = fontSize;
                        largestFontTitle = paragraphText;
                    }
                }
            }
        }
    }

    objList result = new objList();
    result.title = headingTitle ?? largestFontTitle;
    if (biaoji1.Length > 0)
    {
        result.biaoji1 = biaoji1;
    }
    if (number.Length > 0)
    {
        result.number = number;
    }
    if (biaoji3.Length > 0)
    {
        result.biaoji3 = biaoji3;
    }
    return result;
}

// Returns the text following `label` plus one separator character, optionally ending
// right before the first `endLabel` found after that point. Returns null when the label
// (or the required end label) is missing or the separator would fall outside the text.
private static string ExtractAfterGuideWord(string text, string label, string endLabel)
{
    int labelIndex = text.IndexOf(label, StringComparison.Ordinal);
    if (labelIndex < 0)
    {
        return null;
    }

    // Skip the label itself and the single separator character that follows it.
    int start = labelIndex + label.Length + 1;
    if (start > text.Length)
    {
        return null;
    }

    if (endLabel == null)
    {
        return text.Substring(start);
    }

    int end = text.IndexOf(endLabel, start, StringComparison.Ordinal);
    if (end < 0)
    {
        return null;
    }
    return text.Substring(start, end - start);
}

// Derives a heading level from a paragraph style id: the id must be 3 to 6 characters
// long and everything after its first two characters must parse as an integer.
// NOTE(review): this reproduces the rule the earlier offset arithmetic implemented;
// ids such as "Heading1" (8 characters) or "1" are not recognised - confirm against
// the documents this is meant to handle.
private static bool TryParseHeadingLevel(string styleId, out int level)
{
    level = 0;
    if (styleId.Length < 3 || styleId.Length > 6)
    {
        return false;
    }
    return int.TryParse(styleId.Substring(2), out level);
}

调用方法:

 /// <summary>
 /// Lets the user pick a .docx file, runs GetContentFromWord on it and shows the
 /// file path, title and extracted guide-word values in the form's controls.
 /// </summary>
 /// <param name="sender">Event source (unused).</param>
 /// <param name="e">Event data (unused).</param>
 private void button1_Click(object sender, EventArgs e)
 {
     string[] strText = {
         WordKind.Title.ToString(),
         WordKind.biaoji1.ToString(),
         WordKind.number.ToString(),
         WordKind.biaoji3.ToString(),
     };

     // The dialog is a disposable component; release it when the handler is done.
     using (OpenFileDialog openFileDialog1 = new OpenFileDialog())
     {
         openFileDialog1.InitialDirectory = "c:\\";
         // Open XML only handles .docx, so that is the only filter offered.
         // (The previous labels "txt files" / "All files" both pointed at *.docx.)
         openFileDialog1.Filter = "Word documents (*.docx)|*.docx";
         openFileDialog1.FilterIndex = 1;
         openFileDialog1.RestoreDirectory = true;

         if (openFileDialog1.ShowDialog() != DialogResult.OK)
         {
             return;
         }

         this.FilePath.Text = openFileDialog1.FileName;   // show the chosen file path

         objList objList = (objList)GetContentFromWord(openFileDialog1.FileName, strText);
         this.Title.Text = objList.title;
         this.biaoji1.Text = objList.biaoji1;
         this.number.Text = objList.number;
         this.biaoji3.Text = objList.biaoji3;
     }
 }

你可能感兴趣的:(c#从入门到放弃-入门篇)