Java 中使用 iText(itextpdf)读取 PDF 文件,并将文字输出到 TXT 文件中

依赖包

bcpg-jdk15on-1.60.jar
itextpdf-5.5.13.jar
jxl-2.6.12.jar

测试代码

package com.nantian.pdf;

import java.io.FileOutputStream;
import java.io.IOException;
import java.io.PrintWriter;

import com.itextpdf.text.Rectangle;
import com.itextpdf.text.pdf.PdfReader;
import com.itextpdf.text.pdf.parser.FilteredTextRenderListener;
import com.itextpdf.text.pdf.parser.LocationTextExtractionStrategy;
import com.itextpdf.text.pdf.parser.PdfTextExtractor;
import com.itextpdf.text.pdf.parser.RegionTextRenderFilter;
import com.itextpdf.text.pdf.parser.RenderFilter;
import com.itextpdf.text.pdf.parser.TextExtractionStrategy;

/**
 * Small demo that extracts the text of a PDF with iText 5 and writes it to a
 * plain-text file, either for whole pages or for a rectangular region of each page.
 */
public class PdfTxt {

    /**
     * Extracts the text of a hard-coded PDF into a hard-coded text file.
     *
     * @param args ignored
     * @throws IOException if the output file cannot be created
     */
    public static void main(String[] args) throws IOException {
        // Destination text file.
        String outputPath = "C:\\Users\\lwj\\Desktop\\中国银行高风险理财产品合规销售标准用语.txt";
        // Encode explicitly as UTF-8: a PrintWriter over a bare FileOutputStream uses the
        // platform default charset, so the extracted (Chinese) text would be encoded
        // differently from one machine/JDK to the next.
        PrintWriter writer = new PrintWriter(outputPath, "UTF-8");
        // Source PDF to read the text from.
        String fileName = "C:\\Users\\lwj\\Desktop\\中国银行高风险理财产品合规销售标准用语.pdf";

        // Extract the text of every full page. readPdf closes the writer when it is done.
        readPdf(writer, fileName);

        // Alternative: extract only a rectangular region of every page.
        // readPdf_filter(writer, fileName);
    }

    /**
     * Writes the text of every page of a PDF to {@code writer}, then closes the writer.
     *
     * <p>Best effort: any failure is printed to standard error and nothing is written,
     * because the text is only written after all pages have been extracted. The writer
     * is closed whether or not extraction succeeded.
     *
     * @param writer   destination for the extracted text; always closed by this method
     * @param fileName path of the PDF to read
     */
    public static void readPdf(PrintWriter writer, String fileName) {
        PdfReader reader = null;
        try {
            reader = new PdfReader(fileName);
            int pageNum = reader.getNumberOfPages();
            // StringBuilder avoids re-copying the accumulated text for every page.
            StringBuilder pageContent = new StringBuilder();
            // iText page numbers are 1-based.
            for (int i = 1; i <= pageNum; i++) {
                pageContent.append(PdfTextExtractor.getTextFromPage(reader, i));
            }
            writer.write(pageContent.toString());
        } catch (Exception e) {
            // Deliberately best-effort: report the problem and fall through to cleanup.
            e.printStackTrace();
        } finally {
            closeQuietly(reader);
            writer.close();
        }
    }

    /**
     * Writes the text found inside a fixed rectangle of every page of a PDF to
     * {@code writer}, then closes the writer.
     *
     * <p>Best effort: any failure is printed to standard error and nothing is written.
     * The writer is closed whether or not extraction succeeded.
     *
     * @param writer   destination for the extracted text; always closed by this method
     * @param fileName path of the PDF to read
     */
    public static void readPdf_filter(PrintWriter writer, String fileName) {
        PdfReader reader = null;
        try {
            // The origin is the lower-left corner of the page; the arguments are
            // left, bottom, right, top. For a narrow strip near the bottom of the
            // page use e.g. new Rectangle(90, 0, 450, 40).
            Rectangle rect = new Rectangle(0, 0, 450, 850);
            RenderFilter filter = new RegionTextRenderFilter(rect);
            reader = new PdfReader(fileName);
            int pageNum = reader.getNumberOfPages();
            StringBuilder pageContent = new StringBuilder();
            for (int i = 1; i <= pageNum; i++) {
                // A fresh strategy per page, so text collected for one page is not
                // carried over into the next.
                TextExtractionStrategy strategy =
                        new FilteredTextRenderListener(new LocationTextExtractionStrategy(), filter);
                pageContent.append(PdfTextExtractor.getTextFromPage(reader, i, strategy));
            }
            writer.write(pageContent.toString());
        } catch (Exception e) {
            // Deliberately best-effort: report the problem and fall through to cleanup.
            e.printStackTrace();
        } finally {
            closeQuietly(reader);
            writer.close();
        }
    }

    /**
     * Closes {@code reader} if it was opened, reporting (not propagating) any failure so
     * that the caller's remaining cleanup still runs.
     */
    private static void closeQuietly(PdfReader reader) {
        if (reader == null) {
            return;
        }
        try {
            reader.close();
        } catch (Exception e) {
            e.printStackTrace();
        }
    }

}

你可能感兴趣的:(小功能,itext,java)