需求说明:给定OR期刊文章的PDF文档,提取我们感兴趣的文章标题、作者、作者机构、关键字、收稿日期(Received)、修回日期(Revised)、录用日期(Accepted)、出版日期(Published)、审稿分区(Area of review)等信息
提取效果:
开发工具:VS2010+Spire Pdf插件
需要组件:按钮(btn_or2018)、文本框(textbox1)
设计思想:
(1)读入pdf文档
PdfDocument pdf = new PdfDocument();
pdf.LoadFromFile(fileName);
PdfPageBase page = pdf.Pages[1];//本次下载的OR文章正文第一页是PDF的第2页(页索引从0开始,故取Pages[1])
(2)首先使用page.FindText("OPERATIONS RESEARCH").Finds确定该文档是否为OR期刊
(3)使用page.ExtractText函数进行关键标识词定位
(4)根据定位结果,进行相关位置的字符串提取(主要涉及函数Substring)与显示(textbox控件)
源代码:
1. 打开文档
/// <summary>
/// Lets the user pick a PDF file, extracts the article information from it
/// and shows the result in textBox1.
/// </summary>
/// <param name="sender">The button that raised the event.</param>
/// <param name="e">Event data (unused).</param>
private void btn_or2018_Click(object sender, EventArgs e)
{
    // The dialog is disposable; release its native resources when done.
    using (OpenFileDialog ofd = new OpenFileDialog())
    {
        ofd.Filter = "PDF文档(*.pdf)| *.pdf";

        // If the user cancels, FileName is empty and loading it would throw,
        // so leave the text box untouched instead.
        if (ofd.ShowDialog() != DialogResult.OK || string.IsNullOrEmpty(ofd.FileName))
        {
            return;
        }

        string txtPath = ofd.FileName;
        PaperInformation_OR paperOR = new PaperInformation_OR().getPaperInfo(txtPath);
        textBox1.Text = paperOR.ToString();
    }
}
2. 定义PaperInformation_OR类
using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;
using Spire.Pdf;
using Spire.License;
using System.Drawing;
using System.Windows.Forms;
using System.Reflection;
using System.Data;
namespace OperationsResearch
{
class PaperInformation_OR
{
#region 类的成员变量
// Article title.
public string m_title;
// Author line of the article.
public string m_author;
// Author affiliation(s).
public string m_organization;
// Author contact information.
public string m_contact;
// Received-date text.
public string m_receivedDate;
// Revised-date text.
public string m_revisedDate;
// Accepted-date text.
public string m_acceptedDate;
// Published-date text.
public string m_publishedDate;
// Area-of-review text (the identifier spelling is kept because the field is public).
public string m_areaOfRiview;
// Subject-classification text (the identifier spelling is kept because the field is public).
public string m_subjectClassfication;
// DOI / article link.
public string m_doi;
// Publication year.
public string m_year;
// Journal volume.
public string m_volume;
// Journal issue.
public string m_issue;
// Page information of the article.
public string m_page;
#endregion
#region 构造函数
/// <summary>
/// Creates a record whose fields are all supplied by the caller.
/// Each argument is stored unchanged in the member of the same meaning.
/// </summary>
public PaperInformation_OR(string Title, string Author, string Organization, string Contact,
    string Received, string Revised, string Accepted, string Published,
    string year, string Volume, string Issue, string Page,
    string SubjectClassification, string AreaOfReview, string Doi)
{
    // Who wrote what.
    this.m_title = Title;
    this.m_author = Author;
    this.m_organization = Organization;
    this.m_contact = Contact;

    // Editorial history.
    this.m_receivedDate = Received;
    this.m_revisedDate = Revised;
    this.m_acceptedDate = Accepted;
    this.m_publishedDate = Published;

    // Where it appeared.
    this.m_year = year;
    this.m_volume = Volume;
    this.m_issue = Issue;
    this.m_page = Page;

    // Classification and identifier.
    this.m_subjectClassfication = SubjectClassification;
    this.m_areaOfRiview = AreaOfReview;
    this.m_doi = Doi;
}

/// <summary>
/// Creates an empty record; no field is assigned.
/// </summary>
public PaperInformation_OR()
{
}
#endregion
/// <summary>
/// Renders every member of the record as CRLF-separated text.
/// </summary>
/// <remarks>
/// The four date fields, the subject classification and the area of review are
/// written without a label prefix. The existing lines keep their original
/// order; the two previously omitted fields are appended at the end.
/// </remarks>
/// <returns>The multi-line textual form of this record.</returns>
public override string ToString()
{
    string tmp = "题目:" + m_title + "\r\n作者:" + m_author
        + "\r\n作者单位:" + m_organization + "\r\n作者联系方式:" + m_contact
        + "\r\n" + m_receivedDate + "\r\n" + m_revisedDate
        + "\r\n" + m_acceptedDate + "\r\n" + m_publishedDate
        + "\r\n年:" + m_year + "\r\n卷:" + m_volume + "\r\n期:" + m_issue
        + "\r\n页码:" + m_page + "\r\ndoi:" + m_doi
        // Previously missing although both fields are filled by the extractors.
        + "\r\n" + m_subjectClassfication + "\r\n" + m_areaOfRiview;
    return tmp;
}
/// <summary>
/// Derives the issue number (1-6, two months per issue) from a publication-date string.
/// </summary>
/// <param name="PublishTime">
/// Text such as "Advance:January 5, 2018". The month name is read from the run of
/// letters that follows the first ':' (or from the start when there is no ':').
/// Whitespace after the colon and a missing space before the day are tolerated.
/// </param>
/// <returns>
/// 1 for January/February up to 6 for November/December; 0 when the input is
/// null, empty, or does not contain a recognised English month name.
/// </returns>
public int getIssue(string PublishTime)
{
    if (string.IsNullOrEmpty(PublishTime))
    {
        return 0;
    }

    // IndexOf returns -1 when there is no label, which makes the scan start at 0.
    int start = PublishTime.IndexOf(':') + 1;
    while (start < PublishTime.Length && Char.IsWhiteSpace(PublishTime[start]))
    {
        start++;
    }

    // Take letters only, so "January5,2018" still yields "January".
    int end = start;
    while (end < PublishTime.Length && Char.IsLetter(PublishTime[end]))
    {
        end++;
    }

    string PublishMonth = PublishTime.Substring(start, end - start);
    switch (PublishMonth)
    {
        case "January":
        case "February":
            return 1;
        case "March":
        case "April":
            return 2;
        case "May":
        case "June":
            return 3;
        case "July":
        case "August":
            return 4;
        case "September":
        case "October":
            return 5;
        case "November":
        case "December":
            return 6;
        default:
            return 0;
    }
}
/// <summary>
/// Restores word spacing in affiliation text whose spaces were lost during
/// PDF text extraction, e.g. "DepartmentofPhysics" becomes "Department of Physics".
/// </summary>
/// <remarks>
/// A space is inserted before every upper-case letter, before a digit that
/// directly follows a letter, and before the words "and"/"of" when they are
/// glued between a lower-case letter and an upper-case letter. Restricting
/// "and"/"of" to that position keeps words such as "Maryland," intact.
/// No space is added at the start of the result or after existing whitespace.
/// NOTE(review): consecutive capitals (acronyms) are still split letter by
/// letter, as in the original implementation.
/// </remarks>
/// <param name="str">Text to repair; null or empty is returned unchanged.</param>
/// <returns>The text with the missing spaces inserted.</returns>
public string dealString(string str)
{
    if (string.IsNullOrEmpty(str))
    {
        return str;
    }

    // Only split "and"/"of" where they sit between two glued words.
    str = System.Text.RegularExpressions.Regex.Replace(
        str, "(?<=[a-z])(and|of)(?=[A-Z])", " $1");

    StringBuilder strBuff = new StringBuilder(str.Length + 16);
    for (int i = 0; i < str.Length; i++)
    {
        char current = str[i];

        // A new word starts at a capital letter or at a digit following a letter.
        bool startsWord = Char.IsUpper(current)
            || (i > 0 && Char.IsLetter(str[i - 1]) && Char.IsDigit(current));

        // Avoid a leading space and doubled spaces.
        if (startsWord && strBuff.Length > 0
            && !Char.IsWhiteSpace(strBuff[strBuff.Length - 1]))
        {
            strBuff.Append(' ');
        }
        strBuff.Append(current);
    }
    return strBuff.ToString();
}
#region 字符串中多个连续空格转为一个空格
/// <summary>
/// Collapses every run of consecutive whitespace characters into a single space.
/// </summary>
/// <param name="str">The text to normalise.</param>
/// <returns>
/// The text with whitespace runs merged; null or empty input is returned unchanged.
/// </returns>
public string MergeSpace(string str)
{
    // Nothing to merge in a null or empty string.
    if (string.IsNullOrEmpty(str))
    {
        return str;
    }

    return System.Text.RegularExpressions.Regex.Replace(str, "[\\s]+", " ");
}
#endregion
/// <summary>
/// Extracts the bibliographic information of an article from the PDF file
/// <paramref name="fileName"/>, using the keyword anchors "Received",
/// "Abstract", "Copyright" and "pp." found on the page at index 1.
/// </summary>
/// <remarks>
/// The method does not validate the layout: when an anchor or marker is missing
/// the underlying index/substring call throws, as before.
/// The document is now always closed, and the revised date is stored in the
/// revised-date field (previously the received date was stored twice).
/// </remarks>
/// <param name="fileName">Path of the PDF file to read.</param>
/// <returns>A new record filled with the extracted values.</returns>
public PaperInformation_OR getPaperInfo(string fileName)
{
    PdfDocument pdf = new PdfDocument();
    pdf.LoadFromFile(fileName);
    try
    {
        // NOTE(review): index 1 is hard-coded — presumably the page after a cover page; confirm.
        PdfPageBase page = pdf.Pages[1];

        // Top-left corner of the "Received" keyword.
        var pt = page.FindText("Received").Finds[0];
        System.Drawing.Point acceptedPos = new System.Drawing.Point((int)pt.Position.X, (int)pt.Position.Y);
        // Top-left corner of the "Abstract" keyword.
        var pt1 = page.FindText("Abstract").Finds[0];
        System.Drawing.Point abstarctPos = new System.Drawing.Point((int)pt1.Position.X, (int)pt1.Position.Y);
        // Top-left corner of the "Copyright" keyword.
        var pt2 = page.FindText("Copyright").Finds[0];
        System.Drawing.Point CopyrightPos = new System.Drawing.Point((int)pt2.Position.X, (int)pt2.Position.Y);

        //------------- text between the "Received" anchor and the copyright line -------------
        // The rectangle spans from "Received" to the left edge of "Abstract"
        // horizontally and down to "Copyright" vertically; line breaks are removed.
        string paperInfo = page.ExtractText(new RectangleF(acceptedPos.X, acceptedPos.Y,
            abstarctPos.X - acceptedPos.X, CopyrightPos.Y - acceptedPos.Y)).Replace("\r\n", "");

        // Each field runs from its own marker up to the marker of the next field.
        string ReceivedTime = textBetween(paperInfo, "Received", "Revised");
        string RevisedTime = textBetween(paperInfo, "Revised", "Accepted");
        string AcceptedTime = textBetween(paperInfo, "Accepted", "Published");
        string SubjectClassification = textBetween(paperInfo, "Subject", "Area");
        string AreaOfReview = textBetween(paperInfo, "Area", "https");
        string doi = textBetween(paperInfo, "https", "Copyright");

        // Year: the four characters starting nine characters before the end of the DOI link.
        string year = doi.Substring(doi.Length - 9, 4);
        // Volume: per the original author's note, counted from 1952.
        string volume = (int.Parse(year) - 1952).ToString();
        // Issue: derived from the month in the online-publication text.
        string PublishTime = textBetween(paperInfo, "Advance:", "Subject");
        int issue = getIssue(PublishTime);

        //------------- page information, title, authors, affiliation, contact -------------
        // Page information: a fixed 50x10 box at "pp." (original author's note:
        // assumes no single paper has a three-digit page count).
        var pp = page.FindText("pp.").Finds[0];
        System.Drawing.Point pageCntPos = new System.Drawing.Point((int)pp.Position.X, (int)pp.Position.Y);
        string paperPageCnt = page.ExtractText(new RectangleF(pageCntPos.X, pageCntPos.Y, 50, 10)).Replace("\r\n", "");

        // Region between the "pp." line and the abstract: title, authors, affiliation, contact.
        string paperTitleAndAuthor = page.ExtractText(new RectangleF(acceptedPos.X,
            pageCntPos.Y + 20, 550, abstarctPos.Y - pageCntPos.Y - 30)).Trim();

        // Everything before the first "\r\na" is title + author line;
        // the last line break inside that part separates the author line from the title.
        int affiliationStart = paperTitleAndAuthor.IndexOf("\r\na");
        string titleAndAuthor = paperTitleAndAuthor.Substring(0, affiliationStart - 2);
        int authorLineStart = titleAndAuthor.LastIndexOf("\r\n");
        string paperTitle = titleAndAuthor.Substring(0, authorLineStart - 2).Replace("\r\n", " ");
        string paperAuthor = titleAndAuthor.Substring(authorLineStart,
            titleAndAuthor.Length - authorLineStart).Replace("\r\n", " ");

        // Affiliation: from the "a" of "\r\na" up to "Contact:", with spacing repaired.
        int contactStart = paperTitleAndAuthor.IndexOf("Contact:");
        string authorOrganization = paperTitleAndAuthor.Substring(affiliationStart + 2,
            contactStart - affiliationStart - 3).Replace("\r\n", " ");
        authorOrganization = dealString(authorOrganization).Trim();

        // Contact information: from "Contact:" to the end of the region.
        string contactAuthor = paperTitleAndAuthor.Substring(contactStart,
            paperTitleAndAuthor.Length - contactStart).Replace("\r\n", " ");

        //------------- assemble the result -------------
        return new PaperInformation_OR(paperTitle, paperAuthor, authorOrganization,
            contactAuthor, ReceivedTime, RevisedTime, AcceptedTime, PublishTime,
            year, volume, issue.ToString(), paperPageCnt, SubjectClassification, AreaOfReview, doi);
    }
    finally
    {
        // Release the file even when extraction fails part-way.
        pdf.Close();
    }
}

// Returns the part of text that starts at startMarker and ends just before endMarker.
private static string textBetween(string text, string startMarker, string endMarker)
{
    int start = text.IndexOf(startMarker);
    return text.Substring(start, text.IndexOf(endMarker) - start);
}
/// <summary>
/// Extracts the bibliographic information of an article in the 2015 layout from
/// the PDF file <paramref name="fileName"/>, using the page at index 1.
/// </summary>
/// <remarks>
/// Whenever an expected anchor or marker is missing, a message box is shown and
/// an empty record is returned. This now also covers the "Vol." and "http:"
/// anchors, the citation-line parts, the title/classification markers and the
/// date list, which previously raised an exception.
/// Author, affiliation and contact are not extracted; they are filled with " ",
/// and the published date with a fixed placeholder.
/// </remarks>
/// <param name="fileName">Path of the PDF file to read.</param>
/// <returns>The extracted record, or an empty record when the layout does not match.</returns>
public PaperInformation_OR getPaperInfo2015(string fileName)
{
    PdfDocument pdf = new PdfDocument();
    pdf.LoadFromFile(fileName);
    try
    {
        PdfPageBase page = pdf.Pages[1];

        // The journal name must appear on the page.
        var orLogo = page.FindText("OPERATIONS RESEARCH").Finds;
        if (orLogo == null || orLogo.Length == 0)
        {
            MessageBox.Show("该文档非OR期刊");
            return new PaperInformation_OR();
        }

        // Citation line with volume, issue, year and pages.
        // The result list must be checked before indexing; a null test on the
        // first element comes too late because indexing an empty list throws.
        var volFinds = page.FindText("Vol.").Finds;
        if (volFinds == null || volFinds.Length == 0)
        {
            MessageBox.Show("该文档非标准2015格式");
            return new PaperInformation_OR();
        }
        var pp = volFinds[0];
        System.Drawing.Point pageCntPos = new System.Drawing.Point((int)pp.Position.X, (int)pp.Position.Y);
        string paperYearVolIssuePage = page.ExtractText(new RectangleF(pageCntPos.X, pageCntPos.Y, 200, 10));
        string[] tmp1 = paperYearVolIssuePage.Split(',');
        if (tmp1.Length < 4 || tmp1[2].Length < 5)
        {
            MessageBox.Show("该文档非标准2015格式");
            return new PaperInformation_OR();
        }
        // Comma-separated parts: volume, issue, (text ending in the) year, pages.
        string paperVolume = tmp1[0];
        string paperIssue = tmp1[1];
        string paperYear = tmp1[2].Substring(tmp1[2].Length - 5, 5);
        string paperPage = tmp1[3].Replace("\r\n", "");

        // Article link.
        var doiFinds = page.FindText("http:").Finds;
        if (doiFinds == null || doiFinds.Length == 0)
        {
            MessageBox.Show("该文档非标准2015格式");
            return new PaperInformation_OR();
        }
        var doi = doiFinds[0];
        System.Drawing.Point doiPos = new System.Drawing.Point((int)doi.Position.X, (int)doi.Position.Y);
        string paperDoi = page.ExtractText(new RectangleF(doiPos.X, doiPos.Y, 150, 7)).Trim();

        // Remaining information sits between the link and the "History:" line.
        var hisInfo = page.FindText("History:").Finds;
        if (hisInfo == null || hisInfo.Length == 0)
        {
            MessageBox.Show("该文档非标准2015格式,定位日期信息错误");
            return new PaperInformation_OR();
        }
        var history = hisInfo[0];
        System.Drawing.Point historyPos = new System.Drawing.Point((int)history.Position.X, (int)history.Position.Y);
        string paperOthers = page.ExtractText(new RectangleF(historyPos.X, doiPos.Y + 20,
            520, historyPos.Y - doiPos.Y)).Trim();

        //--------------------------------------------------------------------------------
        // Locate all markers first so that a missing one is reported instead of throwing.
        int titleEnd = paperOthers.IndexOf("\r\n\r\n\r\n");
        // NOTE(review): "classiflcations" is the spelling the original code searched
        // for (possibly how the extractor renders the text); the regular spelling is
        // tried as a fallback.
        int scIndex = paperOthers.IndexOf("Subject classiflcations:");
        if (scIndex < 0)
        {
            scIndex = paperOthers.IndexOf("Subject classifications:");
        }
        int aorIndex = paperOthers.IndexOf("Area of review");
        int hisIndex = paperOthers.IndexOf("History:");
        if (titleEnd < 0 || scIndex < 0 || aorIndex < scIndex || hisIndex < aorIndex)
        {
            MessageBox.Show("该文档非标准2015格式");
            return new PaperInformation_OR();
        }

        // Title: the text before the first run of three line breaks. Spaces are
        // merged only when no run of four line breaks occurs after position 0.
        string paperAuthor = " ", paperAuthorCompany = " ", paperContact = " ", paperPublish = "出版日期";
        int titleIndex = paperOthers.IndexOf("\r\n\r\n\r\n\r\n");
        string paperTitle = paperOthers.Substring(0, titleEnd).Replace("\r\n", "");
        if (titleIndex <= 0)
        {
            paperTitle = MergeSpace(paperTitle);
        }

        //--------------------------------------------------------------------------------
        // Subject classification and area of review.
        string paperSubClass = paperOthers.Substring(scIndex, aorIndex - scIndex).Replace("\r\n", "").Trim();
        string paperAreaOfReview = paperOthers.Substring(aorIndex, hisIndex - aorIndex).Replace("\r\n", "").Trim();

        // History text after the first ':' is a ';'-separated list: received; revised; accepted.
        string paperDateInfo = paperOthers.Substring(hisIndex, paperOthers.Length - hisIndex).Replace("\r\n", "").Trim();
        string[] tmp3 = paperDateInfo.Split(':')[1].Split(';');
        if (tmp3.Length < 3)
        {
            MessageBox.Show("该文档非标准2015格式,定位日期信息错误");
            return new PaperInformation_OR();
        }
        string paperReceivedDate = tmp3[0];
        string paperRevisedDate = tmp3[1];
        string paperAcceptedDate = tmp3[2];

        return new PaperInformation_OR(paperTitle, paperAuthor, paperAuthorCompany, paperContact,
            paperReceivedDate, paperRevisedDate, paperAcceptedDate, paperPublish,
            paperYear, paperVolume, paperIssue, paperPage,
            paperSubClass, paperAreaOfReview, paperDoi);
    }
    finally
    {
        // Release the file on every exit path.
        pdf.Close();
    }
}
}
}