使用pdfbox提取pdf文件中的字符信息

前段时间使用了一下 pdfbox(1.6.0)的文本提取功能,发现很好用,但是它能给出的比较准确的结果只有行的粒度。后来又有了定位文章题目、章节标题、自然段落的需求,而 pdfbox 目前好像没有这方面的支持(尤其是对于中文的期刊论文而言,排版情况很复杂,如一页中存在多篇文章混排等),只能先从比较低层次的字符提取入手,同时需要保留字符的位置、大小、字体等信息。pdfbox 源码中的一个小例子 PrintTextLocations 比较接近,但未给出字体信息,本人这里仿照它重新定义了一个 PrintTextLocatins2 类,代码如下:

import org.apache.pdfbox.util.PDFTextStripper;
import org.apache.pdfbox.util.TextPosition;
import java.io.IOException;

public class PrintTextLocatins2 extends PDFTextStripper{
	
	private static int BOLD_F_NUM = 2;
	private static String[] BOLD_FLAGS = {"Bold", "CAJ FNT04"};
	private static int ITALIC_F_NUM = 2;
	private static String[] ITALIC_FLAGS = {"Italic", "CAJ FNT03"};

	private static boolean IsBold(String font)
	{
		int i;
		for (i = 0; i < BOLD_F_NUM; i++)
			if (font.contains(BOLD_FLAGS[i]))
				return true;
		return false;
	}
	
	private static boolean IsItalic(String font)
	{
		int i;
		for (i = 0; i < ITALIC_F_NUM; i++)
			if (font.contains(ITALIC_FLAGS[i]))
				return true;
		return false;
	}
	
    public PrintTextLocatins2() throws IOException
    {
        super.setSortByPosition( false );
    }
	
    protected void processTextPosition( TextPosition text )
    {
    	//PDFontDescriptor fd = text.getFont().getFontDescriptor();
    	
        System.out.println( "String[" + 
        		text.getXDirAdj() + "," +
                text.getYDirAdj() + 
                " fs=" + text.getFontSize() + 
                " xscale=" + text.getXScale() + 
                " height=" + text.getHeightDir() + 
                " space=" + text.getWidthOfSpace() + 
                " width=" + text.getWidthDirAdj() + 
                " subfont=" + text.getFont().getSubType() + 
                " basefont=" + text.getFont().getBaseFont() +
                " isBold=" + IsBold(text.getFont().getBaseFont()) +
                " isItalic=" + IsItalic(text.getFont().getBaseFont()) +
                "]" + 
                text.getCharacter() );
    }

    /**
     * This will print the usage for this document.
     */
    private static void usage()
    {
        System.err.println( "Usage: java org.apache.pdfbox.examples.pdmodel.PrintTextLocations " );
    }
}

使用方法:

public class CLayoutTest {

	public void printTextLocations(String file) throws IOException
	{
		String pdfFile = file;

		PDDocument document = null;
		int file_len;
		
		try{
			document = PDDocument.load(pdfFile, true);
            if( document.isEncrypted() )
            {
                try
                {
                    document.decrypt( "" );
                }
                catch(InvalidPasswordException e )
                {
                    System.err.println( "Error: Document is encrypted with a password." );
                    System.exit( 1 );
                }
            }
            
            file_len = pdfFile.length();
			
            PrintStream old = System.out;
		    try
		    {
		        PrintStream out = new PrintStream(pdfFile.substring(0, file_len - 4) + "_layout.txt");
		        System.setOut(out);
		    }
		    catch(FileNotFoundException e)
		    {
		        e.printStackTrace();
		    }
			
            PrintTextLocatins2 printer = new PrintTextLocatins2();
            
            List allPages = document.getDocumentCatalog().getAllPages();
            for( int i=0; i" );
                
                PDPage page = (PDPage)allPages.get( i );
                PDStream contents = page.getContents();
                if( contents != null )
                {
                   printer.processStream(page, page.findResources(), page.getContents().getStream());
                }
                
                System.out.println("" );
            }
            System.setOut(old);
		}
		catch(Exception e){
			System.out.println(e.toString());
		}
		finally{
			if (document != null){
				document.close();
			}
		}
	}
	
	public static void main(String[] args) {
		// TODO Auto-generated method stub
		CLayoutTest pb_t = new CLayoutTest();
		try{
			pb_t.printTextLocations("E:\\eclipse\\workspace\\pdf\\单剂量冰片及单剂量复方制剂中冰片的药物动力学比较研究.pdf");
		}
		catch(Exception e){
			System.out.println(e.toString());
		}
	}

}





你可能感兴趣的:(pdfbox)