Java 解析 PDF:使用 PDFBox 读取 PDF 内容

import java.io.ByteArrayOutputStream;
import java.io.File;
import java.io.OutputStreamWriter;

import org.pdfbox.pdmodel.PDDocument;
import org.pdfbox.util.PDFTextStripper;

public class Pdftext {
 public static String getTxt(File f) throws Exception {
  String ts = "";
  try {
   String temp = "";
   PDDocument pdfdocument = PDDocument.load(f);

   ByteArrayOutputStream out = new ByteArrayOutputStream();
   OutputStreamWriter writer = new OutputStreamWriter(out);
   PDFTextStripper stripper = new PDFTextStripper();

   stripper.writeText(pdfdocument.getDocument(), writer);

   pdfdocument.close();
   out.close();
   writer.close();
   byte[] contents = out.toByteArray();
   ts = new String(contents);
   System.out.println(f.getName() + "length is:" + contents.length
     + "\n");
  } catch (Exception e) {
   e.printStackTrace();
  } finally {
   return ts;
  }
 }

 public static void main(String[] args) throws Exception {
  
     File file = new File("d:/hello.pdf"); 
     System.out.println(Pdftext.getTxt(file));
  
  
/*
  File file = new File("d:/hello.pdf");
  FileInputStream fis = new FileInputStream(file);
  BufferedInputStream bis = new BufferedInputStream(fis);
  PDFParser parser = new PDFParser(bis);

  //
  parser.parse();
  PDDocument document = parser.getPDDocument();

  PDFTextStripper stripper = new PDFTextStripper();
  String s = stripper.getText(document);

  // ////////////
  document.close();// /////////
  bis.close();

  // //////////
  File ff = new File("d:/hello.pdf");
  ff.createNewFile();

  if (ff.exists())

  {
   ff.createNewFile();
  }

  FileWriter fw = new FileWriter(ff);

  BufferedWriter bw = new BufferedWriter(fw);

  bw.write(s);
  bw.close();*/

 }

}

你可能感兴趣的:(Java,Java,网页抓取)