使用pdfbox读取和解析PDF文件,分页头(header),页内容(content),页尾(tail)

导入PDFBox的jar包。我是通过Maven引入依赖的;其中fontbox是PDFBox的字体处理库,用于支持中文等字体。
pom.xml

<dependency>
  <groupId>org.apache.pdfbox</groupId>
  <artifactId>pdfbox</artifactId>
  <version>2.0.4</version>
</dependency>

<dependency>
  <groupId>org.apache.pdfbox</groupId>
  <artifactId>fontbox</artifactId>
  <version>2.0.8</version>
</dependency>
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.text.TextPosition;

import java.io.File;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;

/**
 * Reads the PDF at {@code D:/test.pdf} page by page and prints the body text of
 * every page to standard output.
 *
 * <p>For each page the stripper is run on that single page, then the characters
 * of article section 1 are walked one by one. Characters sharing the same Y
 * coordinate are buffered as one line; when the Y coordinate changes the buffer
 * is printed, and a line break is emitted if the vertical jump is large.
 * Processing of a page stops as soon as a line starts with {@code "表格 "} or
 * {@code "Table "}, and that caption prefix is not printed.
 *
 * <p>Per the original author's notes (not verifiable from this snippet): section 0
 * holds the page header, section 1 the page body, section 2 the page number, and
 * tables/images are placed at the end of the body section.
 *
 * <p>Pages for which the stripper reports fewer than two article sections are
 * skipped instead of failing with an {@link IndexOutOfBoundsException}.
 *
 * @throws IOException if the document cannot be loaded, parsed, or closed
 */
public static void getTextFromPdf() throws IOException {
    String dest = "D:/test.pdf";

    // try-with-resources: the document is closed even when stripping throws.
    try (PDDocument document = PDDocument.load(new File(dest))) {
        // Total number of pages.
        int pages = document.getNumberOfPages();
        System.out.println("总页:" + pages);

        PDFTextStripper2 stripper2 = new PDFTextStripper2();
        for (int i = 1; i <= pages; i++) {
            // Restrict the stripper to the current page and run it; the returned
            // text is ignored, only the collected TextPosition lists are used.
            stripper2.setStartPage(i);
            stripper2.setEndPage(i);
            stripper2.getText(document);

            List<List<TextPosition>> ll = stripper2.getCharactersByArticle();
            // ll.get(0) would be the header and ll.get(2) the page-number section
            // according to the original author; only the body (index 1) is read.
            if (ll.size() < 2) {
                // No body section reported for this page: nothing to print.
                continue;
            }
            List<TextPosition> ls = ll.get(1);

            float y = 0;
            StringBuilder sentence = new StringBuilder();
            for (TextPosition tp : ls) {
                String c = tp.getUnicode();

                // A change of Y coordinate is treated as the start of a new line.
                if (y != tp.getY()) {
                    System.out.print(sentence.toString());

                    // Vertical distance between the previous line and this one.
                    int lineGap = (int) (tp.getY() - y);
                    if (lineGap > 11 || lineGap < -10) {
                        System.out.println();
                    }
                    y = tp.getY();
                    sentence.setLength(0);
                }

                sentence.append(c);

                // Special handling for bullets: follow them with a space.
                if ("•".equals(c)) {
                    sentence.append(" ");
                }

                // Stop at the first table caption; tables are not printed.
                String current = sentence.toString();
                if (current.startsWith("表格 ") || current.startsWith("Table ")) {
                    // Drop the buffered caption prefix so the flush below does not print it.
                    sentence.setLength(0);
                    break;
                }
            }
            // Flush whatever remains of the last line on this page.
            if (sentence.length() > 0) {
                System.out.print(sentence.toString());
            }
        }
    }
}

PDFTextStripper2.java

/**
 * Thin {@code PDFTextStripper} subclass that exists only so callers can read the
 * {@code TextPosition} data collected while stripping.
 *
 * <p>It adds no behavior of its own: {@link #getCharactersByArticle()} is declared
 * {@code public} here and simply forwards to the superclass (where, per the PDFBox
 * 2.x API docs, the method is presumably not publicly accessible — confirm against
 * the PDFBox version in use).
 */
public class PDFTextStripper2 extends PDFTextStripper {

    /**
     * Creates a stripper with the superclass defaults.
     *
     * @throws IOException declared because the superclass constructor may throw it
     */
    public PDFTextStripper2() throws IOException {
        super();
    }

    /**
     * Exposes the superclass's per-article character lists.
     *
     * @return the lists returned by the superclass implementation, unchanged
     */
    @Override
    public List<List<TextPosition>> getCharactersByArticle() {
        final List<List<TextPosition>> articles = super.getCharactersByArticle();
        return articles;
    }
}

你可能感兴趣的:(java)