C#编程学习08:Spire Pdf组件的引用,以国际知名期刊OR为例

需求说明:给定OR期刊文章的PDF文档,提取我们感兴趣的文章标题、作者、作者机构、关键字、接收日期、审稿日期、录用日期、出版日期、审稿分区等信息

C#编程学习08:Spire Pdf组件的引用,以国际知名期刊OR为例_第1张图片


提取效果:

C#编程学习08:Spire Pdf组件的引用,以国际知名期刊OR为例_第2张图片


开发工具:VS2010+Spire Pdf插件

需要组件:按钮(btn_or2018)、文本框(textBox1)


设计思想:

(1)读入pdf文档

           PdfDocument pdf = new PdfDocument();
            pdf.LoadFromFile(fileName);
            PdfPageBase page = pdf.Pages[1];// in the OR articles downloaded for this post, the first page of the article body is page number 2

(2)首先使用 page.FindText("OPERATIONS RESEARCH").Finds 确定文档是否为OR期刊

(3)使用page.ExtractText函数进行关键标识词定位

(4)根据定位结果,进行相关位置的字符串提取(主要涉及函数Substring)与显示(textBox控件)


源代码:

1. 打开文档

        // Click handler: lets the user pick a PDF, extracts the paper information
        // from it and shows the formatted result in textBox1.
        private void btn_or2018_Click(object sender, EventArgs e)
        {
            string txtPath;

            // The dialog wraps native resources, so dispose it deterministically.
            using (OpenFileDialog ofd = new OpenFileDialog())
            {
                ofd.Filter = "PDF文档(*.pdf)| *.pdf";

                // If the user cancels, FileName is empty and loading it would throw;
                // leave the text box untouched instead.
                if (ofd.ShowDialog() != DialogResult.OK || string.IsNullOrEmpty(ofd.FileName))
                {
                    return;
                }
                txtPath = ofd.FileName;
            }

            PaperInformation_OR paperOR = new PaperInformation_OR().getPaperInfo(txtPath);
            textBox1.Text = paperOR.ToString();
        }

2. 定义PaperInformation_OR类

using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;
using Spire.Pdf;
using Spire.License;
using System.Drawing;
using System.Windows.Forms;
using System.Reflection;
using System.Data;



namespace OperationsResearch
{
    class PaperInformation_OR
    {
        #region 类的成员变量
        // All fields are plain strings filled by the 15-argument constructor.
        public string m_title;                  // paper title
        public string m_author;                 // author line
        public string m_organization;           // author affiliation text
        public string m_contact;                // author contact text
        public string m_receivedDate;           // "received" history entry
        public string m_revisedDate;            // "revised" history entry
        public string m_acceptedDate;           // "accepted" history entry
        public string m_publishedDate;          // "published" history entry
        public string m_areaOfRiview;           // area of review (identifier spelling kept as-is: public field)
        public string m_subjectClassfication;   // subject classification (identifier spelling kept as-is: public field)
        public string m_doi;                    // DOI / article URL text
        public string m_year;                   // publication year
        public string m_volume;                 // journal volume
        public string m_issue;                  // journal issue
        public string m_page;                   // page information
        #endregion

        #region 构造函数
        /// <summary>
        /// Creates a record in which every field is supplied by the caller.
        /// Each argument is stored unchanged in the field of the matching name.
        /// </summary>
        public PaperInformation_OR(string Title, string Author, string Organization, string Contact,
            string Received, string Revised, string Accepted, string Published,
            string year, string Volume, string Issue, string Page, 
            string SubjectClassification, string AreaOfReview, string Doi)
        {
            // Who wrote what.
            this.m_title = Title;
            this.m_author = Author;
            this.m_organization = Organization;
            this.m_contact = Contact;

            // Editorial history.
            this.m_receivedDate = Received;
            this.m_revisedDate = Revised;
            this.m_acceptedDate = Accepted;
            this.m_publishedDate = Published;

            // Where it was published.
            this.m_year = year;
            this.m_volume = Volume;
            this.m_issue = Issue;
            this.m_page = Page;

            // Classification and identifier.
            this.m_subjectClassfication = SubjectClassification;
            this.m_areaOfRiview = AreaOfReview;
            this.m_doi = Doi;
        }

        /// <summary>
        /// Creates an empty record; no field is assigned.
        /// </summary>
        public PaperInformation_OR()
        {
        }
        #endregion

        /// <summary>
        /// Formats every field of the record as CRLF-separated lines.
        /// </summary>
        /// <remarks>
        /// The date fields, the subject classification and the area of review are
        /// written without a label prefix. The subject classification and area of
        /// review were previously missing from the output; they are appended at the
        /// end so the existing lines keep their order.
        /// </remarks>
        /// <returns>The multi-line text representation of this record.</returns>
        public override string ToString()
        {
            string tmp = "题目:" + m_title + "\r\n作者:" + m_author
                 + "\r\n作者单位:" + m_organization + "\r\n作者联系方式:" + m_contact
                 + "\r\n" + m_receivedDate + "\r\n" + m_revisedDate
                 + "\r\n" + m_acceptedDate + "\r\n" + m_publishedDate
                 + "\r\n年:" + m_year + "\r\n卷:" + m_volume + "\r\n期:" + m_issue
                 + "\r\n页码:" + m_page + "\r\ndoi:" + m_doi
                 + "\r\n" + m_subjectClassfication + "\r\n" + m_areaOfRiview;
            return tmp;
        }

        /// <summary>
        /// Maps the publication month found in <paramref name="PublishTime"/> to an
        /// issue number, two months per issue (January/February = 1 ... November/December = 6).
        /// </summary>
        /// <param name="PublishTime">
        /// Text containing a colon followed by an English month name, e.g.
        /// "Advance:January 5, 2018". Whitespace after the colon is tolerated, and
        /// the month may be followed directly by a digit or other non-letter.
        /// </param>
        /// <returns>
        /// The issue number 1-6, or 0 when the text is null/empty, has no colon, or
        /// the word after the colon is not a recognised month name.
        /// </returns>
        public int getIssue(string PublishTime)
        {
            if (string.IsNullOrEmpty(PublishTime))
            {
                return 0;
            }

            int colonIndex = PublishTime.IndexOf(':');
            if (colonIndex < 0)
            {
                return 0;
            }

            // Take the run of letters that follows the first colon. This keeps working
            // whether or not the extracted text has spaces around the month name.
            string afterColon = PublishTime.Substring(colonIndex + 1).TrimStart();
            int monthLength = 0;
            while (monthLength < afterColon.Length && Char.IsLetter(afterColon[monthLength]))
            {
                monthLength++;
            }
            string PublishMonth = afterColon.Substring(0, monthLength);

            switch (PublishMonth)
            {
                case "January":
                case "February":
                    return 1;
                case "March":
                case "April":
                    return 2;
                case "May":
                case "June":
                    return 3;
                case "July":
                case "August":
                    return 4;
                case "September":
                case "October":
                    return 5;
                case "November":
                case "December":
                    return 6;
                default:
                    return 0;
            }
        }
        /// <summary>
        /// Re-inserts spaces into affiliation text whose spaces were lost: a space is
        /// added before every "and" and "of", before every upper-case letter, and
        /// before a digit that directly follows a letter.
        /// </summary>
        /// <remarks>
        /// The "and"/"of" replacement is a plain substring replacement, so it also
        /// fires inside longer words (e.g. "Finland" becomes "Finl and"), and every
        /// upper-case letter gets its own leading space (e.g. "AB" becomes " A B").
        /// </remarks>
        /// <param name="str">Text to process.</param>
        /// <returns>The text with the extra spaces inserted.</returns>
        public string dealString(string str)
        {
            string spaced = str.Replace("and", " and");
            spaced = spaced.Replace("of", " of");

            StringBuilder result = new StringBuilder();
            int pos = 0;
            foreach (char current in spaced)
            {
                bool isCapital = Char.IsUpper(current);
                bool isDigitAfterLetter = pos > 0
                    && Char.IsDigit(current)
                    && Char.IsLetter(spaced[pos - 1]);

                // Separator before a capital letter.
                if (isCapital)
                {
                    result.Append(' ');
                }
                // Separator between a letter and the digit that follows it.
                if (isDigitAfterLetter)
                {
                    result.Append(' ');
                }

                result.Append(current);
                pos++;
            }
            return result.ToString();
        }
        #region 字符串中多个连续空格转为一个空格
        /// <summary>
        /// Collapses every run of consecutive whitespace characters into a single space.
        /// </summary>
        /// <param name="str">The string to process; may be null or empty.</param>
        /// <returns>
        /// The string with whitespace runs merged; null or empty input is returned unchanged.
        /// </returns>
        public string MergeSpace(string str)
        {
            // Nothing to merge in a null or empty string.
            if (string.IsNullOrEmpty(str))
            {
                return str;
            }

            return System.Text.RegularExpressions.Regex.Replace(str, "[\\s]+", " ");
        }
        #endregion
        /// <summary>
        /// Extracts the paper information from the PDF file at <paramref name="fileName"/>.
        /// </summary>
        /// <remarks>
        /// Works on the page at index 1 of the document. The keywords "Received",
        /// "Abstract", "Copyright" and "pp." are located on that page and used as
        /// anchors for the rectangles that text is extracted from.
        /// If the document has fewer than two pages, or any anchor is missing, a
        /// message box is shown and an empty record is returned.
        /// </remarks>
        /// <param name="fileName">Path of the PDF file to read.</param>
        /// <returns>The extracted record, or an empty record when the layout is not recognised.</returns>
        public PaperInformation_OR getPaperInfo(string fileName)
        {
            PdfDocument pdf = new PdfDocument();
            pdf.LoadFromFile(fileName);
            try
            {
                if (pdf.Pages.Count < 2)
                {
                    MessageBox.Show("该文档非标准格式,定位关键标识词失败");
                    return new PaperInformation_OR();
                }
                PdfPageBase page = pdf.Pages[1];

                // Locate all anchors first so that a missing one is reported cleanly
                // instead of failing with an index-out-of-range exception.
                var receivedFinds = page.FindText("Received").Finds;
                var abstractFinds = page.FindText("Abstract").Finds;
                var copyrightFinds = page.FindText("Copyright").Finds;
                var pageNoFinds = page.FindText("pp.").Finds;
                if (receivedFinds == null || receivedFinds.Length == 0 ||
                    abstractFinds == null || abstractFinds.Length == 0 ||
                    copyrightFinds == null || copyrightFinds.Length == 0 ||
                    pageNoFinds == null || pageNoFinds.Length == 0)
                {
                    MessageBox.Show("该文档非标准格式,定位关键标识词失败");
                    return new PaperInformation_OR();
                }

                // Top-left corner of the first "Received" occurrence.
                var pt = receivedFinds[0];
                System.Drawing.Point acceptedPos = new System.Drawing.Point((int)pt.Position.X, (int)pt.Position.Y);
                // Top-left corner of the first "Abstract" occurrence.
                var pt1 = abstractFinds[0];
                System.Drawing.Point abstractPos = new System.Drawing.Point((int)pt1.Position.X, (int)pt1.Position.Y);
                // Top-left corner of the first "Copyright" occurrence.
                var pt2 = copyrightFinds[0];
                System.Drawing.Point CopyrightPos = new System.Drawing.Point((int)pt2.Position.X, (int)pt2.Position.Y);

                //---------------- text between the "Received" anchor and the "Copyright" anchor ----------------
                // The rectangle starts at "Received", is as wide as the gap to the X of
                // "Abstract" and as tall as the gap to the Y of "Copyright"; line breaks are removed.
                string paperInfo = page.ExtractText(new RectangleF(acceptedPos.X, acceptedPos.Y,
                    abstractPos.X - acceptedPos.X, CopyrightPos.Y - acceptedPos.Y)).Replace("\r\n", "");

                // Received entry: from "Received" up to "Revised".
                string ReceivedTime = paperInfo.Substring(paperInfo.IndexOf("Received"),
                    paperInfo.IndexOf("Revised") - paperInfo.IndexOf("Received"));
                // Revised entry: from "Revised" up to "Accepted".
                string RevisedTime = paperInfo.Substring(paperInfo.IndexOf("Revised"),
                    paperInfo.IndexOf("Accepted") - paperInfo.IndexOf("Revised"));
                // Accepted entry: from "Accepted" up to "Published".
                string AcceptedTime = paperInfo.Substring(paperInfo.IndexOf("Accepted"),
                    paperInfo.IndexOf("Published") - paperInfo.IndexOf("Accepted"));
                // Subject classification: from "Subject" up to "Area".
                string SubjectClassification = paperInfo.Substring(paperInfo.IndexOf("Subject"),
                    paperInfo.IndexOf("Area") - paperInfo.IndexOf("Subject"));
                // Area of review: from "Area" up to "https".
                string AreaOfReview = paperInfo.Substring(paperInfo.IndexOf("Area"),
                    paperInfo.IndexOf("https") - paperInfo.IndexOf("Area"));
                // DOI link: from "https" up to "Copyright".
                string doi = paperInfo.Substring(paperInfo.IndexOf("https"),
                    paperInfo.IndexOf("Copyright") - paperInfo.IndexOf("https"));
                // Year: the four characters starting nine characters before the end of the DOI text.
                string year = doi.Substring(doi.Length - 9, 4);
                // Volume: derived from the year, counting from 1952.
                string volume = (int.Parse(year) - 1952).ToString();
                // Publication entry: from "Advance:" up to "Subject"; its month gives the issue.
                string PublishTime = paperInfo.Substring(paperInfo.IndexOf("Advance:"),
                    paperInfo.IndexOf("Subject") - paperInfo.IndexOf("Advance:"));
                int issue = getIssue(PublishTime);

                //---------------- page information, title, authors, affiliation, contact ----------------
                // Page information: a 50 x 10 rectangle starting at the first "pp." occurrence
                // (the original author assumed no single paper needs more room than that).
                var pp = pageNoFinds[0];
                System.Drawing.Point pageCntPos = new System.Drawing.Point((int)pp.Position.X, (int)pp.Position.Y);
                string paperPageCnt = page.ExtractText(new RectangleF(pageCntPos.X, pageCntPos.Y, 50, 10)).Replace("\r\n", "");

                // Title/author/affiliation area: starts 20 below "pp." and ends above "Abstract".
                string paperTitleAndAuthor = page.ExtractText(new RectangleF(acceptedPos.X,
                    pageCntPos.Y + 20, 550, abstractPos.Y - pageCntPos.Y - 30)).Trim();

                // Everything before the first "\r\na" is title + authors; the last line
                // break inside that part separates the title from the author line.
                string titleAndAuthor = paperTitleAndAuthor.Substring(0,
                    paperTitleAndAuthor.IndexOf("\r\na") - 2);
                string paperTitle = titleAndAuthor.Substring(0,
                    titleAndAuthor.LastIndexOf("\r\n") - 2).Replace("\r\n", " ");
                string paperAuthor = titleAndAuthor.Substring(titleAndAuthor.LastIndexOf("\r\n"),
                    titleAndAuthor.Length - titleAndAuthor.LastIndexOf("\r\n")).Replace("\r\n", " ");

                // Affiliation: from the first "\r\na" up to "Contact:", with spaces re-inserted.
                string authorOrganization = paperTitleAndAuthor.Substring(paperTitleAndAuthor.IndexOf("\r\na") + 2,
                    paperTitleAndAuthor.IndexOf("Contact:") - paperTitleAndAuthor.IndexOf("\r\na") - 3).Replace("\r\n", " ");
                authorOrganization = dealString(authorOrganization).Trim();
                // Contact: from "Contact:" to the end of the extracted area.
                string contactAuthor = paperTitleAndAuthor.Substring(paperTitleAndAuthor.IndexOf("Contact:"),
                    paperTitleAndAuthor.Length - paperTitleAndAuthor.IndexOf("Contact:")).Replace("\r\n", " ");

                //---------------- build the result record ----------------
                // The revised date must come from RevisedTime; the received date was
                // previously passed twice by mistake.
                return new PaperInformation_OR(paperTitle, paperAuthor, authorOrganization,
                    contactAuthor, ReceivedTime, RevisedTime, AcceptedTime, PublishTime,
                    year, volume, issue.ToString(), paperPageCnt, SubjectClassification, AreaOfReview, doi);
            }
            finally
            {
                // Release the loaded document on every exit path.
                pdf.Close();
            }
        }

        /// <summary>
        /// Extracts the paper information from a PDF laid out in the 2015 format.
        /// </summary>
        /// <remarks>
        /// Works on the page at index 1 of the document. Whenever an expected marker
        /// ("OPERATIONS RESEARCH", "Vol.", "http:", "History:", the classification
        /// labels) cannot be located, a message box is shown and an empty record is
        /// returned. Author, affiliation and contact are not extracted by this
        /// method and are returned as a single space; the published date is
        /// returned as a fixed placeholder string.
        /// </remarks>
        /// <param name="fileName">Path of the PDF file to read.</param>
        /// <returns>The extracted record, or an empty record when the layout is not recognised.</returns>
        public PaperInformation_OR getPaperInfo2015(string fileName)
        {
            PdfDocument pdf = new PdfDocument();
            pdf.LoadFromFile(fileName);
            try
            {
                PdfPageBase page = pdf.Pages[1];
                var orLogo = page.FindText("OPERATIONS RESEARCH").Finds;
                if (orLogo == null || orLogo.Length == 0)
                {
                    MessageBox.Show("该文档非OR期刊");
                    return new PaperInformation_OR();
                }

                // Volume / issue / year / page line. The find list must be checked before
                // indexing; a null check on Finds[0] is never reached when nothing was found.
                var volFinds = page.FindText("Vol.").Finds;
                if (volFinds == null || volFinds.Length == 0)
                {
                    MessageBox.Show("该文档非标准2015格式");
                    return new PaperInformation_OR();
                }
                var pp = volFinds[0];
                System.Drawing.Point pageCntPos = new System.Drawing.Point((int)pp.Position.X, (int)pp.Position.Y);
                string paperYearVolIssuePage = page.ExtractText(new RectangleF(pageCntPos.X, pageCntPos.Y, 200, 10));
                string[] tmp1 = paperYearVolIssuePage.Split(',');
                // Expected comma-separated order: volume, issue, (text ending in) year, pages.
                if (tmp1.Length < 4 || tmp1[2].Length < 5)
                {
                    MessageBox.Show("该文档非标准2015格式");
                    return new PaperInformation_OR();
                }
                string paperVolume = tmp1[0];
                string paperIssue = tmp1[1];
                string paperYear = tmp1[2].Substring(tmp1[2].Length - 5, 5);
                string paperPage = tmp1[3].Replace("\r\n", "");

                // Article URL: a 150 x 7 rectangle starting at the first "http:" occurrence.
                var doiFinds = page.FindText("http:").Finds;
                if (doiFinds == null || doiFinds.Length == 0)
                {
                    MessageBox.Show("该文档非标准2015格式");
                    return new PaperInformation_OR();
                }
                var doi = doiFinds[0];
                System.Drawing.Point doiPos = new System.Drawing.Point((int)doi.Position.X, (int)doi.Position.Y);
                string paperDoi = page.ExtractText(new RectangleF(doiPos.X, doiPos.Y, 150, 7)).Trim();

                // Remaining information: the area from 20 below the URL down to "History:".
                var hisInfo = page.FindText("History:").Finds;
                if (hisInfo == null || hisInfo.Length == 0)
                {
                    MessageBox.Show("该文档非标准2015格式,定位日期信息错误");
                    return new PaperInformation_OR();
                }
                var history = hisInfo[0];
                System.Drawing.Point historyPos = new System.Drawing.Point((int)history.Position.X, (int)history.Position.Y);
                string paperOthers = page.ExtractText(new RectangleF(historyPos.X, doiPos.Y + 20,
                    520, historyPos.Y - doiPos.Y)).Trim();

                //-----------------------------------------------------------------------------------------------
                // Title: everything before the first run of three line breaks. When no run
                // of four line breaks occurs after the start of the text, whitespace in the
                // title is additionally merged.
                string paperAuthor = " ", paperAuthorCompany = " ", paperContact = " ", paperPublish = "出版日期";
                int titleIndex = paperOthers.IndexOf("\r\n\r\n\r\n\r\n");
                int titleEnd = paperOthers.IndexOf("\r\n\r\n\r\n");
                if (titleEnd < 0)
                {
                    MessageBox.Show("该文档非标准2015格式");
                    return new PaperInformation_OR();
                }
                string paperTitle = paperOthers.Substring(0, titleEnd).Replace("\r\n", "");
                if (titleIndex <= 0)
                {
                    paperTitle = MergeSpace(paperTitle);
                }

                //-----------------------------------------------------------------------------------------------
                // Classification, area of review and history are consecutive labelled sections.
                // NOTE(review): the label is spelled "classiflcations" in the search string;
                // presumably this matches the text as extracted from the PDF - confirm before changing.
                int scIndex = paperOthers.IndexOf("Subject classiflcations:");
                int aorIndex = paperOthers.IndexOf("Area of review");
                int hisIndex = paperOthers.IndexOf("History:");
                if (scIndex < 0 || aorIndex < scIndex || hisIndex < aorIndex)
                {
                    MessageBox.Show("该文档非标准2015格式");
                    return new PaperInformation_OR();
                }
                string paperSubClass = paperOthers.Substring(scIndex, aorIndex - scIndex).Replace("\r\n", "").Trim();
                string paperAreaOfReview = paperOthers.Substring(aorIndex, hisIndex - aorIndex).Replace("\r\n", "").Trim();

                // History text: the part after the first colon holds the received; revised;
                // accepted entries separated by semicolons.
                string paperDateInfo = paperOthers.Substring(hisIndex, paperOthers.Length - hisIndex).Replace("\r\n", "").Trim();
                string[] tmp3 = paperDateInfo.Split(':')[1].Split(';');
                if (tmp3.Length < 3)
                {
                    MessageBox.Show("该文档非标准2015格式,定位日期信息错误");
                    return new PaperInformation_OR();
                }
                string paperReceivedDate = tmp3[0];
                string paperRevisedDate = tmp3[1];
                string paperAcceptedDate = tmp3[2];

                return new PaperInformation_OR(paperTitle, paperAuthor, paperAuthorCompany, paperContact,
                    paperReceivedDate, paperRevisedDate, paperAcceptedDate, paperPublish,
                    paperYear, paperVolume, paperIssue, paperPage,
                    paperSubClass, paperAreaOfReview, paperDoi);
            }
            finally
            {
                // Release the loaded document on every exit path.
                pdf.Close();
            }
        }

        
    }
}

 

你可能感兴趣的:(C#编程学习)