C#解析PDF的方式有很多,比较好用的有iTextSharp和PDFBox。
PDF内容页如果是图片类型,例如扫描件,则需要进行OCR(光学字符识别)。
对于文本类型的PDF文档,我目前只发现能以字符串的形式读取其内容,无法直接读取其中的表格。据说PDF文档结构中没有表格的概念,因此表格自然读不到;如果确实如此,PDF中表格内容的解析就只能对获取到的字符串按照一定的逻辑自行处理了。
iTextSharp是一个C#开源项目;PDFBox为Java开源项目,借助IKVM在.NET平台下也有实现。
Pdf转换Image,使用的是GhostScript,可以以API的方式调用,也可以以Windows命令行的方式调用。
OCR使用的是Asprise(商业软件),识别效果较好;另外还可以使用微软的Microsoft Office Document Imaging(MODI,Office 2007)或OneNote(2010)(需要依赖Office组件),以及Tesseract(HP->Google)(效果很差)。
附上iTextSharp、PDFBox对PDF的解析代码。
iTextSharp辅助类
using System;
using System.Collections.Generic;
using System.Text;

using iTextSharp.text.pdf;
using iTextSharp.text.pdf.parser;
using System.IO;

namespace eyuan
{
    /// <summary>
    /// Helper methods that read text and page counts from PDF files using iTextSharp.
    /// Failures are reported through <c>LogHandler.LogWrite</c> and signalled to the
    /// caller by a sentinel return value rather than an exception.
    /// </summary>
    public static class ITextSharpHandler
    {
        /// <summary>
        /// Reads the text content of every page of a PDF file.
        /// </summary>
        /// <param name="fileName">Path of the PDF file to read.</param>
        /// <returns>
        /// The extracted text, one page after another, each followed by a line break.
        /// Returns <see cref="string.Empty"/> when the file does not exist or cannot be
        /// opened. If extraction fails part-way, the text gathered so far is returned.
        /// </returns>
        public static string ReadPdf(string fileName)
        {
            if (!File.Exists(fileName))
            {
                LogHandler.LogWrite(@"指定的PDF文件不存在:" + fileName);
                return string.Empty;
            }

            // Open the document. If the constructor throws there is no reader to close.
            PdfReader reader;
            try
            {
                reader = new PdfReader(fileName);
            }
            catch (Exception ex)
            {
                LogHandler.LogWrite(string.Format(@"加载PDF文件{0}失败,错误:{1}", fileName, ex.ToString()));
                return string.Empty;
            }

            StringBuilder sbFileContent = new StringBuilder();
            try
            {
                // Page numbers are 1-based.
                for (int i = 1; i <= reader.NumberOfPages; i++)
                {
                    sbFileContent.AppendLine(PdfTextExtractor.GetTextFromPage(reader, i));
                }
            }
            catch (Exception ex)
            {
                // Best effort: log and fall through with whatever was extracted so far.
                LogHandler.LogWrite(string.Format(@"解析PDF文件{0}失败,错误:{1}", fileName, ex.ToString()));
            }
            finally
            {
                reader.Close();
            }

            return sbFileContent.ToString();
        }

        /// <summary>
        /// Gets the number of pages in a PDF file.
        /// </summary>
        /// <param name="fileName">Path of the PDF file to inspect.</param>
        /// <returns>
        /// The page count, or -1 when the file does not exist or cannot be opened.
        /// </returns>
        public static int GetPdfPageCount(string fileName)
        {
            if (!File.Exists(fileName))
            {
                LogHandler.LogWrite(@"指定的PDF文件不存在:" + fileName);
                return -1;
            }

            // Open the document. If the constructor throws there is no reader to close.
            PdfReader reader;
            try
            {
                reader = new PdfReader(fileName);
            }
            catch (Exception ex)
            {
                LogHandler.LogWrite(string.Format(@"加载PDF文件{0}失败,错误:{1}", fileName, ex.ToString()));
                return -1;
            }

            // Always release the reader; previously it was left open after a successful
            // load, which kept the underlying file handle alive.
            try
            {
                return reader.NumberOfPages;
            }
            finally
            {
                reader.Close();
            }
        }
    }
}
PDFBox辅助类
using org.pdfbox.pdmodel;
using org.pdfbox.util;
using System;
using System.Collections.Generic;
using System.IO;
using System.Text;

namespace eyuan
{
    /// <summary>
    /// Helper that extracts text from PDF files using the PDFBox component.
    /// </summary>
    public static class PdfBoxHandler
    {
        /// <summary>
        /// Extracts the text content of a PDF file with PDFBox.
        /// </summary>
        /// <param name="input">Path of the PDF file.</param>
        /// <returns>
        /// The text of the document, or <c>null</c> when the file does not exist,
        /// cannot be loaded, or text extraction fails.
        /// </returns>
        public static string ReadPdf(string input)
        {
            // Guard: nothing to do for a missing file.
            if (!File.Exists(input))
            {
                LogHandler.LogWrite(@"指定的PDF文件不存在:" + input);
                return null;
            }

            // Step 1: load the document.
            PDDocument document = null;
            try
            {
                document = PDDocument.load(input);
            }
            catch (Exception loadError)
            {
                LogHandler.LogWrite(string.Format(@"加载PDF文件{0}失败,错误:{1}", new string[] { input, loadError.ToString() }));

                if (document != null)
                {
                    document.close();
                }

                return null;
            }

            // Step 2: strip the text, closing the document whatever happens.
            string extractedText = null;
            try
            {
                PDFTextStripper textStripper = new PDFTextStripper();
                extractedText = textStripper.getText(document);
            }
            catch (Exception parseError)
            {
                LogHandler.LogWrite(string.Format(@"解析PDF文件{0}失败,错误:{1}", new string[] { input, parseError.ToString() }));
            }
            finally
            {
                if (document != null)
                {
                    document.close();
                }
            }

            return extractedText;
        }
    }
}
另外附上PDF转Image,然后对Image进行OCR的代码。
转换PDF为Jpeg图片代码(GhostScript辅助类)
using System;
using System.Collections;
using System.Collections.Generic;
using System.Runtime.InteropServices;
using System.Text;

namespace eyuan
{
    /// <summary>
    /// Converts PDF pages to images by calling the Ghostscript API (gsdll32.dll)
    /// through P/Invoke.
    /// </summary>
    public class GhostscriptHandler
    {
        #region GhostScript Import
        /// <summary>
        /// Creates a new Ghostscript instance.
        /// This instance is passed to most other gsapi functions.
        /// The caller_handle will be provided to callback functions.
        /// At this stage, Ghostscript supports only one instance.
        /// </summary>
        /// <param name="pinstance">Receives the instance handle.</param>
        /// <param name="caller_handle">Handle passed back to callback functions.</param>
        /// <returns>The Ghostscript return code.</returns>
        [DllImport("gsdll32.dll", EntryPoint = "gsapi_new_instance")]
        private static extern int gsapi_new_instance(out IntPtr pinstance, IntPtr caller_handle);

        /// <summary>
        /// This is the important function that will perform the conversion.
        /// </summary>
        /// <param name="instance">Instance handle from gsapi_new_instance.</param>
        /// <param name="argc">Number of arguments.</param>
        /// <param name="argv">Pointer to an array of pointers to null terminated ANSI strings.</param>
        /// <returns>The Ghostscript return code.</returns>
        [DllImport("gsdll32.dll", EntryPoint = "gsapi_init_with_args")]
        private static extern int gsapi_init_with_args(IntPtr instance, int argc, IntPtr argv);

        /// <summary>
        /// Exit the interpreter.
        /// This must be called on shutdown if gsapi_init_with_args() has been called,
        /// and just before gsapi_delete_instance().
        /// </summary>
        /// <param name="instance">Instance handle from gsapi_new_instance.</param>
        /// <returns>The Ghostscript return code.</returns>
        [DllImport("gsdll32.dll", EntryPoint = "gsapi_exit")]
        private static extern int gsapi_exit(IntPtr instance);

        /// <summary>
        /// Destroy an instance of Ghostscript.
        /// Before you call this, Ghostscript must have finished.
        /// If Ghostscript has been initialised, you must call gsapi_exit before gsapi_delete_instance.
        /// </summary>
        /// <param name="instance">Instance handle from gsapi_new_instance.</param>
        [DllImport("gsdll32.dll", EntryPoint = "gsapi_delete_instance")]
        private static extern void gsapi_delete_instance(IntPtr instance);
        #endregion

        #region Fields
        // Ghostscript return code meaning "quit was executed"; per the Ghostscript API
        // documentation this is not an error.
        private const int GsErrorQuit = -101;

        private string _sDeviceFormat;
        private int _iWidth;
        private int _iHeight;
        private int _iResolutionX;
        private int _iResolutionY;
        private int _iJPEGQuality;
        private Boolean _bFitPage;
        private IntPtr _objHandle;
        #endregion

        #region Properties
        /// <summary>
        /// Ghostscript output device name (passed to -sDEVICE=).
        /// Overwritten by the deviceFormat argument on every <see cref="Convert"/> call.
        /// </summary>
        public string OutputFormat
        {
            get { return _sDeviceFormat; }
            set { _sDeviceFormat = value; }
        }

        /// <summary>
        /// Output width; emitted as the first half of the -g option when both
        /// <see cref="Width"/> and <see cref="Height"/> are positive.
        /// </summary>
        public int Width
        {
            get { return _iWidth; }
            set { _iWidth = value; }
        }

        /// <summary>
        /// Output height; emitted as the second half of the -g option when both
        /// <see cref="Width"/> and <see cref="Height"/> are positive.
        /// </summary>
        public int Height
        {
            get { return _iHeight; }
            set { _iHeight = value; }
        }

        /// <summary>
        /// Horizontal resolution (-r option).
        /// Overwritten by the width argument on every <see cref="Convert"/> call.
        /// </summary>
        public int ResolutionX
        {
            get { return _iResolutionX; }
            set { _iResolutionX = value; }
        }

        /// <summary>
        /// Vertical resolution (-r option).
        /// Overwritten by the height argument on every <see cref="Convert"/> call.
        /// </summary>
        public int ResolutionY
        {
            get { return _iResolutionY; }
            set { _iResolutionY = value; }
        }

        /// <summary>
        /// When true, -dPDFFitPage is added to the Ghostscript arguments.
        /// </summary>
        public Boolean FitPage
        {
            get { return _bFitPage; }
            set { _bFitPage = value; }
        }

        /// <summary>
        /// Quality of compression of JPG (1-100). Values outside that range are ignored.
        /// </summary>
        public int JPEGQuality
        {
            get { return _iJPEGQuality; }
            set { _iJPEGQuality = value; }
        }
        #endregion

        #region Construction
        /// <summary>
        /// Creates a handler whose caller handle is passed to gsapi_new_instance.
        /// </summary>
        /// <param name="objHandle">Caller handle forwarded to Ghostscript.</param>
        public GhostscriptHandler(IntPtr objHandle)
        {
            _objHandle = objHandle;
        }

        /// <summary>
        /// Creates a handler with a zero caller handle.
        /// </summary>
        public GhostscriptHandler()
        {
            _objHandle = IntPtr.Zero;
        }
        #endregion

        #region String handling
        /// <summary>
        /// Converts a Unicode string to a null terminated ANSI byte array for Ghostscript.
        /// </summary>
        /// <param name="str">Unicode string.</param>
        /// <returns>The encoded bytes followed by a single 0 terminator.</returns>
        private byte[] StringToAnsiZ(string str)
        {
            // Encode with the system default encoding (the ANSI code page on .NET
            // Framework) instead of truncating each char to a byte, which corrupted
            // any character above U+00FF (e.g. Chinese file names).
            byte[] encoded = Encoding.Default.GetBytes(str);
            byte[] ansi = new byte[encoded.Length + 1];
            Array.Copy(encoded, ansi, encoded.Length);
            // The extra trailing element is already 0 and serves as the terminator.
            return ansi;
        }
        #endregion

        #region Convert
        /// <summary>
        /// Converts pages of a PDF file to an image file using Ghostscript.
        /// Ghostscript failures are logged through <c>LogHandler.LogWrite</c>.
        /// </summary>
        /// <param name="inputFile">Path of the input PDF file.</param>
        /// <param name="outputFile">Path of the output image (value of -sOutputFile).</param>
        /// <param name="firstPage">First page to convert.</param>
        /// <param name="lastPage">Last page to convert.</param>
        /// <param name="deviceFormat">Ghostscript device name, e.g. "jpeg".</param>
        /// <param name="width">
        /// Despite the name, this value is used as the horizontal resolution (-r).
        /// NOTE(review): confirm this is intended; pixel size comes from the
        /// <see cref="Width"/> property instead.
        /// </param>
        /// <param name="height">
        /// Despite the name, this value is used as the vertical resolution (-r).
        /// </param>
        public void Convert(string inputFile, string outputFile,
            int firstPage, int lastPage, string deviceFormat, int width, int height)
        {
            if (!System.IO.File.Exists(inputFile))
            {
                LogHandler.LogWrite(string.Format("文件{0}不存在", inputFile));
                return;
            }

            string[] sArgs = GetGeneratedArgs(inputFile, outputFile,
                firstPage, lastPage, deviceFormat, width, height);

            int argCount = sArgs.Length;
            GCHandle[] argHandles = new GCHandle[argCount];
            IntPtr[] argPointers = new IntPtr[argCount];
            GCHandle argvHandle = new GCHandle();
            IntPtr gsInstance = IntPtr.Zero;
            bool instanceCreated = false;

            try
            {
                // Pin one null terminated ANSI buffer per argument and record its address.
                for (int i = 0; i < argCount; i++)
                {
                    argHandles[i] = GCHandle.Alloc(StringToAnsiZ(sArgs[i]), GCHandleType.Pinned);
                    argPointers[i] = argHandles[i].AddrOfPinnedObject();
                }

                // Pin the pointer array itself so it can be passed as argv.
                argvHandle = GCHandle.Alloc(argPointers, GCHandleType.Pinned);

                int result = gsapi_new_instance(out gsInstance, _objHandle);
                if (result < 0)
                {
                    // Without a valid instance none of the other gsapi calls are safe.
                    LogHandler.LogWrite(string.Format("创建Ghostscript实例失败,返回码:{0}", result));
                    return;
                }
                instanceCreated = true;

                try
                {
                    result = gsapi_init_with_args(gsInstance, argCount, argvHandle.AddrOfPinnedObject());
                    if (result < 0 && result != GsErrorQuit)
                    {
                        LogHandler.LogWrite(string.Format("PDF文件{0}转换失败,Ghostscript返回码:{1}", inputFile, result));
                    }
                }
                catch (Exception ex)
                {
                    LogHandler.LogWrite(string.Format("PDF文件{0}转换失败.\n错误:{1}", new string[] { inputFile, ex.ToString() }));
                }
            }
            finally
            {
                // Shut Ghostscript down before releasing the memory it was given.
                if (instanceCreated)
                {
                    gsapi_exit(gsInstance);
                    gsapi_delete_instance(gsInstance);
                }

                if (argvHandle.IsAllocated)
                {
                    argvHandle.Free();
                }

                // Free every pinned argument. The previous loop was bounded by the
                // Ghostscript return code, so on success (0) nothing was released.
                for (int i = 0; i < argCount; i++)
                {
                    if (argHandles[i].IsAllocated)
                    {
                        argHandles[i].Free();
                    }
                }
            }
        }
        #endregion

        #region Argument generation
        /// <summary>
        /// Builds the Ghostscript command line for one conversion.
        /// Side effect: stores deviceFormat, width and height in the device format and
        /// resolution fields, replacing any values set through the properties.
        /// </summary>
        /// <param name="inputFile">Path of the input PDF file (last argument).</param>
        /// <param name="outputFile">Path of the output file (-sOutputFile).</param>
        /// <param name="firstPage">Value for -dFirstPage.</param>
        /// <param name="lastPage">Value for -dLastPage.</param>
        /// <param name="deviceFormat">Value for -sDEVICE.</param>
        /// <param name="width">Stored as the horizontal resolution (-r).</param>
        /// <param name="height">Stored as the vertical resolution (-r).</param>
        /// <returns>Arguments in the order Ghostscript receives them.</returns>
        private string[] GetGeneratedArgs(string inputFile, string outputFile,
            int firstPage, int lastPage, string deviceFormat, int width, int height)
        {
            this._sDeviceFormat = deviceFormat;
            this._iResolutionX = width;
            this._iResolutionY = height;

            List<string> args = new List<string>();

            // Fixed arguments.
            args.Add("pdf2img");            // argv[0]; has little real use
            args.Add("-dNOPAUSE");          // do not prompt and pause for each page
            args.Add("-dBATCH");            // keep gs from going into interactive mode
            args.Add("-dPARANOIDSAFER");    // run in safe mode
            args.Add("-sDEVICE=" + _sDeviceFormat); // output device
            args.Add("-q");                 // keep gs from writing to standard output
            args.Add("-dQUIET");
            args.Add("-dNOPROMPT");         // disable prompts for user interaction
            args.Add("-dMaxBitmap=500000000"); // set high for better performance
            args.Add(String.Format("-dFirstPage={0}", firstPage));
            args.Add(String.Format("-dLastPage={0}", lastPage));
            args.Add("-dAlignToPixels=0");
            args.Add("-dGridFitTT=0");
            args.Add("-dTextAlphaBits=4");
            args.Add("-dGraphicsAlphaBits=4");

            // Optional arguments.
            // The device name handed to -sDEVICE is "jpeg" (plus variants such as
            // "jpeggray"); "jpg" is kept so existing callers behave as before.
            bool isJpegDevice = _sDeviceFormat != null
                && (_sDeviceFormat == "jpg"
                    || _sDeviceFormat.StartsWith("jpeg", StringComparison.Ordinal));
            if (isJpegDevice && _iJPEGQuality > 0 && _iJPEGQuality < 101)
            {
                args.Add("-dJPEGQ=" + _iJPEGQuality);
            }
            if (_iWidth > 0 && _iHeight > 0)
            {
                args.Add("-g" + _iWidth + "x" + _iHeight);
            }
            if (_bFitPage)
            {
                args.Add("-dPDFFitPage");
            }
            if (_iResolutionX > 0)
            {
                if (_iResolutionY > 0)
                {
                    args.Add("-r" + _iResolutionX + "x" + _iResolutionY);
                }
                else
                {
                    args.Add("-r" + _iResolutionX);
                }
            }

            // Output file and input file always come last.
            args.Add(string.Format("-sOutputFile={0}", outputFile));
            args.Add(string.Format("{0}", inputFile));

            return args.ToArray();
        }
        #endregion
    }
}
OCR,识别Image代码(Asprise辅助类)
using System;
using System.Collections.Generic;
using System.Runtime.InteropServices;
using System.Text;

namespace PDFCaptureService
{
    /// <summary>
    /// P/Invoke wrapper around the native AspriseOCR.dll library.
    /// </summary>
    public static class AspriseOCRHandler
    {
        #region Native imports
        [DllImport("AspriseOCR.dll", EntryPoint = "OCR", CallingConvention = CallingConvention.Cdecl)]
        public static extern IntPtr OCR(string file, int type);

        [DllImport("AspriseOCR.dll", EntryPoint = "OCRpart", CallingConvention = CallingConvention.Cdecl)]
        static extern IntPtr OCRpart(string file, int type, int startX, int startY, int width, int height);

        [DllImport("AspriseOCR.dll", EntryPoint = "OCRBarCodes", CallingConvention = CallingConvention.Cdecl)]
        static extern IntPtr OCRBarCodes(string file, int type);

        [DllImport("AspriseOCR.dll", EntryPoint = "OCRpartBarCodes", CallingConvention = CallingConvention.Cdecl)]
        static extern IntPtr OCRpartBarCodes(string file, int type, int startX, int startY, int width, int height);
        #endregion

        /// <summary>
        /// Runs OCR on an image file and returns the recognised text.
        /// </summary>
        /// <param name="fileName">Path of the image file to recognise.</param>
        /// <returns>
        /// The text behind the pointer returned by the native OCR call, read as an
        /// ANSI string.
        /// </returns>
        public static string ReadImage(string fileName)
        {
            // NOTE(review): -1 is passed as the image type; presumably it asks the
            // library to detect the type itself - confirm against the Asprise docs.
            return Marshal.PtrToStringAnsi(OCR(fileName, -1));
        }
    }
}
调用示例
// Render the PDF to an image with Ghostscript, then OCR that image.
GhostscriptHandler ghostscriptHandler = new GhostscriptHandler();
string tempJpgFileName = string.Format(GhostScriptImageName, Guid.NewGuid().ToString());
int pdfPageCount = ITextSharpHandler.GetPdfPageCount(fileName);

// GetPdfPageCount returns -1 when the file is missing or cannot be opened;
// do not pass that on to Ghostscript as the last page number.
if (pdfPageCount > 0)
{
    // NOTE(review): unless GhostScriptImageName contains a page placeholder such as
    // %d, every page is presumably written to the same output file - confirm.
    ghostscriptHandler.Convert(fileName, tempJpgFileName, 1, pdfPageCount, "jpeg", 100, 100);

    // OCR the image Ghostscript produced, not the source PDF.
    fileContent = AspriseOCRHandler.ReadImage(tempJpgFileName);
}