PDFBox 解析PDF文档

原文地址


package com.wss.pdfbox;

import java.io.File;  
import java.io.FileInputStream;  
import java.io.InputStream;  
import java.text.SimpleDateFormat;  
import java.util.Calendar;  
import java.util.Iterator;  
import java.util.List;  
import java.util.Map;  
import java.util.Set;  
  
import org.apache.pdfbox.pdmodel.PDDocument;  
import org.apache.pdfbox.pdmodel.PDDocumentCatalog;  
import org.apache.pdfbox.pdmodel.PDDocumentInformation;  
import org.apache.pdfbox.pdmodel.PDPage;  
import org.apache.pdfbox.pdmodel.PDResources;  
import org.apache.pdfbox.pdmodel.graphics.xobject.PDXObjectImage;  
import org.apache.pdfbox.util.PDFTextStripper;  
  
/**
 * Demonstrates reading a PDF with Apache PDFBox: prints the document
 * metadata and the extracted text to standard output, and writes every
 * image found in the page resources to disk.
 *
 * @date 2015-09-17
 */
public class pdfBoxDemo {

    /** Pattern used by {@link #dateFormat(Calendar)}. */
    public static final String DATE_FORMAT = "yyyy-MM-dd HH:mm:ss";

    /**
     * Parses a PDF document: prints its metadata and text content, then
     * exports the images referenced by each page.
     *
     * @param pdfPath     path of the PDF file to read
     * @param imgSavePath prefix for exported image files; a running counter
     *                    (starting at 1) is appended to it for each image
     * @throws Exception if the file cannot be read or parsed, or if an image
     *                   cannot be written
     */
    public static void pdfParse( String pdfPath, String imgSavePath ) throws Exception
    {
        InputStream input = new FileInputStream( new File( pdfPath ) );
        try
        {
            PDDocument document = PDDocument.load( input );
            try
            {
                printDocumentInfo( document.getDocumentInformation() );
                printTextContent( document );
                exportImages( document.getDocumentCatalog(), imgSavePath );
            }
            finally
            {
                document.close();
            }
        }
        finally
        {
            // Nested finally blocks guarantee the stream is closed even when
            // closing the document throws (and vice versa).
            input.close();
        }
    }

    /** Prints the document information dictionary entries to standard output. */
    private static void printDocumentInfo( PDDocumentInformation info ) throws Exception
    {
        System.out.println( "标题:" + info.getTitle() );
        System.out.println( "主题:" + info.getSubject() );
        System.out.println( "作者:" + info.getAuthor() );
        System.out.println( "关键字:" + info.getKeywords() );

        System.out.println( "应用程序:" + info.getCreator() );
        System.out.println( "pdf 制作程序:" + info.getProducer() );

        // This line previously reused the author label for the trapped entry.
        System.out.println( "陷印信息:" + info.getTrapped() );

        System.out.println( "创建时间:" + dateFormat( info.getCreationDate() ));
        System.out.println( "修改时间:" + dateFormat( info.getModificationDate()));
    }

    /** Extracts the text of the whole document and prints it to standard output. */
    private static void printTextContent( PDDocument document ) throws Exception
    {
        PDFTextStripper stripper = new PDFTextStripper();
        System.out.println( "内容:" + stripper.getText( document ) );
    }

    /** Writes every image found in the page resources to {@code imgSavePath + counter}. */
    private static void exportImages( PDDocumentCatalog catalog, String imgSavePath ) throws Exception
    {
        List<?> pages = catalog.getAllPages();
        int count = 1;
        for( Object element : pages )
        {
            PDPage page = ( PDPage ) element;
            if( null == page )
                continue;

            // Skip pages for which no resources are returned.
            PDResources resources = page.findResources();
            if( null == resources )
                continue;

            Map<?, ?> images = resources.getImages();
            if( null == images )
                continue;

            for( Object value : images.values() )
            {
                if( null == value )
                    continue;
                PDXObjectImage image = ( PDXObjectImage ) value;
                image.write2file( imgSavePath + count );
                count++;
            }
        }
    }

    /**
     * Formats a calendar value using {@link #DATE_FORMAT}.
     *
     * @param calendar the time to format; may be {@code null}
     * @return the formatted time, or {@code null} when {@code calendar} is {@code null}
     * @throws Exception declared for backward compatibility with existing callers
     */
    public static String dateFormat( Calendar calendar ) throws Exception
    {
        if( null == calendar )
            return null;
        // A new formatter per call: SimpleDateFormat is not thread-safe.
        SimpleDateFormat format = new SimpleDateFormat( DATE_FORMAT );
        return format.format( calendar.getTime() );
    }

    /** Runs the demo against a hard-coded sample file and output directory. */
    public static void main( String [] args ) throws Exception{
        pdfParse("H:/PDF201508/test.pdf","H:/PDF201508/");
    }
}


你可能感兴趣的:(pdf,pdfbox)