纯文本格式的pdf解析

纯文本格式的pdf解析出来了,解析纯文本的代码如下:
package pdfbox;
import java.io.ByteArrayOutputStream;
import java.io.FileWriter;
import java.io.IOException;
import java.io.PrintWriter;
import java.io.OutputStreamWriter;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.util.PDFTextStripper;
import org.apache.*;

public class pdf2 {
public static String getText(String file){
String s="";
String pdffile=file;
PDDocument pdfdoc=null;
try {
pdfdoc=PDDocument.load(pdffile);
PDFTextStripper stripper=new PDFTextStripper();
s=stripper.getText(pdfdoc);  
 
} catch (IOException e) {  
// TODO Auto-generated catch block
e.printStackTrace();
}
finally{
try {
if (pdfdoc!=null){  
pdfdoc.close();

}catch (IOException e) {
// TODO Auto-generated catch block
e.printStackTrace();
}  

return s;  
}
public static void toTextFile(String doc,String filename) throws Exception{
String pdffile=doc;
PDDocument pdfdoc=PDDocument.load(doc);
try {
pdfdoc=PDDocument.load(pdffile);
PDFTextStripper stripper=new PDFTextStripper();
PrintWriter pw=new PrintWriter(new FileWriter(filename));
stripper.writeText(pdfdoc, pw);
 
} catch (IOException e) {  
// TODO Auto-generated catch block
e.printStackTrace();
}
finally{
try {
if (pdfdoc!=null){  
pdfdoc.close();

}catch (IOException e) {
// TODO Auto-generated catch block
e.printStackTrace();
}  

 
}
/**
* @param args
*/
public static void main(String[] args) {
// TODO Auto-generated method stub
try {
String sc=getText("E:/solution.pdf");  
System.out.print(sc);
toTextFile("E:/solution.pdf","E:/solution.txt");
} catch (Exception e) {
// TODO Auto-generated catch block
e.printStackTrace();
}
 
}

}

org.apache.pdfbox.pdmodel.PDDocument  
用的是pdfbox-1.2.1.zip

你可能感兴趣的:(java,apache)