将PDF文件解析为字符串




将PDF文件解析为字符串

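所需jar如下(注意:以下代码使用的是 PDFBox 1.8.x 的 API;在 2.x 及以上版本中,PDFTextStripper 已移至 org.apache.pdfbox.text 包,PDDocument.load(String) 也已被移除,代码需相应调整):
     pdfbox-app-1.8.6.jar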


import java.io.FileInputStream;
import java.io.IOException;

import org.apache.pdfbox.pdfparser.PDFParser;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.util.PDFTextStripper;

/**
 * Small utility that extracts the text content of a PDF file as a {@code String}
 * using the Apache PDFBox 1.8.x API ({@code PDDocument}, {@code PDFParser},
 * {@code PDFTextStripper}).
 */
public class PdfParser {

	/**
	 * Loads the PDF at the given path and returns its text content.
	 *
	 * <p>This method is best-effort: if an {@link IOException} occurs while loading
	 * the document or extracting its text, the stack trace is printed and an empty
	 * string is returned instead of propagating the exception.
	 *
	 * @param file path of the PDF file to read
	 * @return the extracted text, or {@code ""} if an {@code IOException} occurred
	 */
	public static String getText(String file) {
		String text = "";
		PDDocument pdfdoc = null;
		try {
			pdfdoc = PDDocument.load(file);
			PDFTextStripper stripper = new PDFTextStripper("UTF-8");
			// Extract the document's text as a single string.
			text = stripper.getText(pdfdoc);
		} catch (IOException e) {
			e.printStackTrace();
		} finally {
			// Always release the document, even when loading or extraction failed.
			try {
				if (pdfdoc != null) {
					pdfdoc.close();
				}
			} catch (IOException e) {
				e.printStackTrace();
			}
		}
		return text;
	}

	/**
	 * Demo entry point: parses a PDF from a file stream and prints its text to
	 * standard output. (For the path-based alternative, call
	 * {@link #getText(String)} directly.)
	 *
	 * @param args optional; {@code args[0]} is the path of the PDF to parse. When no
	 *             argument is given, the original sample path is used.
	 * @throws Exception if the file cannot be opened, parsed, or closed
	 */
	public static void main(String[] args) throws Exception {
		String path = args.length > 0 ? args[0] : "E:\\学习资料\\PDF\\dfsdfsd.pdf";

		// Parse via an input stream rather than a path.
		FileInputStream fis = new FileInputStream(path);
		PDDocument doc = null;
		try {
			PDFParser p = new PDFParser(fis);
			p.parse();
			doc = p.getPDDocument();
			PDFTextStripper ts = new PDFTextStripper();
			String s = ts.getText(doc);
			System.out.println(s);
		} finally {
			// Close the document first, then the stream; the nested finally
			// guarantees the stream is closed even if closing the document throws.
			try {
				if (doc != null) {
					doc.close();
				}
			} finally {
				fis.close();
			}
		}
	}
}

你可能感兴趣的:(字符串)