Java 解析 PDF、Word、PPT、Excel

用 PDFBox 的 jar 包来解析 PDF:

import java.io.ByteArrayOutputStream;
import java.io.File;
import java.io.OutputStreamWriter;

import org.pdfbox.pdmodel.PDDocument;
import org.pdfbox.util.PDFTextStripper;

public class Pdf2text { 
    public static String getTxt(File f) throws Exception {   
        String ts="";   
        try{   
            String temp = "";   
            PDDocument pdfdocument = PDDocument.load(f);
          
            ByteArrayOutputStream out = new ByteArrayOutputStream();   
            OutputStreamWriter writer = new OutputStreamWriter(out);   
            PDFTextStripper stripper = new PDFTextStripper();
          
            stripper.writeText(pdfdocument.getDocument(), writer);  
          
            pdfdocument.close();   
            out.close();   
            writer.close();   
            byte[] contents = out.toByteArray();   
            ts = new String(contents);   
            System.out.println(f.getName() + "length is:" + contents.length + "\n");   
        }catch(Exception e){   
            e.printStackTrace();   
        }   
        finally{   
            return ts;   
        }   
    } 
     
    public static void main(String[] args){ 
        File file = new File("E:/600536_2008_zzy.pdf"); 
        try { 
            System.out.println(Pdf2text.getTxt(file)); 
        } catch (Exception e) { 
            // TODO 自动生成 catch 块 
            e.printStackTrace(); 
        } 
    } 
}
======================

Word、Excel 和 PPT 都用 POI 的 jar 包来解析:

    import java.io.File;  
     
    import org.apache.poi.POITextExtractor;  
    import org.apache.poi.extractor.ExtractorFactory;  
     
    public class DocxParser {  
     
        /**
          * @param args
          */ 
        public static void main(String[] args) {  
            try {  
                 File inputFile = new File("D:\\test.docx");  
                //File inputFile = new File("D:\\test.pptx");  
                //File inputFile = new File("D:\\test.xlsx");  
                //File inputFile = new File("D:\\test.xls");  
                //File inputFile = new File("D:\\test.doc");  
                //File inputFile = new File("D:\\test.ppt");  
                 POITextExtractor extractor = ExtractorFactory  
                         .createExtractor(inputFile);  
                 System.out.println("Document Text: ");  
                 System.out.println("====================");  
                 System.out.println(extractor.getText());  
                 System.out.println("====================");  
             } catch (Exception ex) {  
                 ex.printStackTrace();  
             }  
         }  
     
    } 

import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;

import org.apache.poi.hwpf.extractor.WordExtractor;

public class Word2text {

public static void main(String[] args) {
   File file = new File("E:\\2009.doc");
   try {
    FileInputStream fis = new FileInputStream(file);
    WordExtractor wordExtractor = new WordExtractor(fis);
    System.out.println("【 使用getText()方法提取的Word文件的内容如下所示:】");
    System.out.println(wordExtractor.getText());
   } catch (FileNotFoundException e) {
    e.printStackTrace();
   } catch (IOException e) {
    e.printStackTrace();
   }
}
}

import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.InputStream;

import org.apache.poi.hslf.HSLFSlideShow;
import org.apache.poi.hslf.model.TextRun;
import org.apache.poi.hslf.model.Slide;
import org.apache.poi.hslf.usermodel.SlideShow;

public class Ppt2text {

/**
* @param args
* @throws FileNotFoundException
*/
public static void main(String[] args) throws FileNotFoundException {
   File file = new File("E:\\1025681983.ppt");
   InputStream fis = new FileInputStream(file);
   try {
    getDocument(fis);
  
   } catch (Exception e) {
  
    e.printStackTrace();
   }
}

public static void getDocument(InputStream is) throws Exception {
   StringBuffer content = new StringBuffer("");
   try {
    SlideShow ss = new SlideShow(new HSLFSlideShow(is));// is
                 // 为文件的InputStream,建立SlideShow
    Slide[] slides = ss.getSlides();// 获得每一张幻灯片
    for (int i = 0; i < slides.length; i++) {
     TextRun[] t = slides[i].getTextRuns();// 为了取得幻灯片的文字内容,建立TextRun
     for (int j = 0; j < t.length; j++) {
      content.append(t[j].getText());// 这里会将文字内容加到content中去
     }
     content.append(slides[i].getTitle());
    }
    String str = new String(content);
    System.out.println(str.toString());
  
   } catch (Exception ex) {
    System.out.println(ex.toString());
   }
 
}

}
=============

Excel 也可以用 jxl 的 jar 包来解析:

import java.io.File;

import jxl.Cell;
import jxl.CellType;
import jxl.DateCell;
import jxl.NumberCell;
import jxl.Sheet;
import jxl.Workbook;


public class Excel2text {
public static void main(String args[]) {

   try {

    Workbook workbook = null;

    try {
     workbook = Workbook.getWorkbook(new File("e:\\Dealerlist_3.xls"));
    } catch (Exception e) {
     throw new Exception("file to import not found!");
    }

    Sheet sheet = workbook.getSheet(0);
    Cell cell = null;

    int columnCount = 3;
    int rowCount = sheet.getRows();
    for (int i = 0; i < rowCount; i++) {
     for (int j = 0; j < columnCount; j++) {
      // 注意,这里的两个参数,第一个是表示列的,第二才表示行
      cell = sheet.getCell(j, i);
      // 要根据单元格的类型分别做处理,否则格式化过的内容可能会不正确
      if (cell.getType() == CellType.NUMBER) {
       System.out.print(((NumberCell) cell).getValue());
      } else if (cell.getType() == CellType.DATE) {
       System.out.print(((DateCell) cell).getDate());
      } else {
       System.out.print(cell.getContents());
      }

      // System.out.print(cell.getContents());
      System.out.print("\t");
     }
     System.out.print("\n");
    }
    // 关闭它,否则会有内存泄露
    workbook.close();
   } catch (Exception e) {

   }

}
}

import java.io.*;
import jxl.*;
import jxl.write.*;
import jxl.format.*;

public class Text2Excel {
public static void main(String args[]) {

   try {

    File tempFile = new File("e:" + java.io.File.separator
      + "output00.xls");
    System.out.println("e:" + java.io.File.separator + "output00.xls");

    WritableWorkbook workbook = Workbook.createWorkbook(tempFile);
    WritableSheet sheet = workbook.createSheet("TestCreateExcel", 0);

    // 一些临时变量,用于写到excel中
    Label l = null;
    jxl.write.Number n = null;
    jxl.write.DateTime d = null;

    // 预定义的一些字体和格式,同一个Excel中最好不要有太多格式
    WritableFont headerFont = new WritableFont(WritableFont.ARIAL, 12,
      WritableFont.BOLD, false, UnderlineStyle.NO_UNDERLINE,
      jxl.format.Colour.BLUE);
    WritableCellFormat headerFormat = new WritableCellFormat(headerFont);

    WritableFont titleFont = new WritableFont(WritableFont.ARIAL, 10,
      WritableFont.NO_BOLD, false, UnderlineStyle.NO_UNDERLINE,
      jxl.format.Colour.RED);
    WritableCellFormat titleFormat = new WritableCellFormat(titleFont);

    WritableFont detFont = new WritableFont(WritableFont.ARIAL, 10,
      WritableFont.NO_BOLD, false, UnderlineStyle.NO_UNDERLINE,
      jxl.format.Colour.BLACK);
    WritableCellFormat detFormat = new WritableCellFormat(detFont);

    NumberFormat nf = new NumberFormat("0.00000"); // 用于Number的格式
    WritableCellFormat priceFormat = new WritableCellFormat(detFont, nf);

    DateFormat df = new DateFormat("yyyy-MM-dd");// 用于日期的
    WritableCellFormat dateFormat = new WritableCellFormat(detFont, df);

    // 剩下的事情,就是用上面的内容和格式创建一些单元格,再加到sheet中
    l = new Label(0, 0, "用于测试的Excel文件", headerFormat);
    sheet.addCell(l);

    // add Title
    int column = 0;
    l = new Label(column++, 2, "标题", titleFormat);
    sheet.addCell(l);
    l = new Label(column++, 2, "日期", titleFormat);
    sheet.addCell(l);
    l = new Label(column++, 2, "货币", titleFormat);
    sheet.addCell(l);
    l = new Label(column++, 2, "价格", titleFormat);
    sheet.addCell(l);

    // add detail
    int i = 0;
    column = 0;
    l = new Label(column++, i + 3, "标题 " + i, detFormat);
    sheet.addCell(l);
    d = new DateTime(column++, i + 3, new java.util.Date(), dateFormat);
    sheet.addCell(d);
    l = new Label(column++, i + 3, "CNY", detFormat);
    sheet.addCell(l);
    n = new jxl.write.Number(column++, i + 3, 5.678, priceFormat);
    sheet.addCell(n);

    i++;
    column = 0;
    l = new Label(column++, i + 3, "标题 " + i, detFormat);
    sheet.addCell(l);
    d = new DateTime(column++, i + 3, new java.util.Date(), dateFormat);
    sheet.addCell(d);
    l = new Label(column++, i + 3, "SGD", detFormat);
    sheet.addCell(l);
    n = new jxl.write.Number(column++, i + 3, 98832, priceFormat);
    sheet.addCell(n);

    // 设置列的宽度
    column = 0;
    sheet.setColumnView(column++, 20);
    sheet.setColumnView(column++, 20);
    sheet.setColumnView(column++, 10);
    sheet.setColumnView(column++, 20);

    workbook.write();
    workbook.close();
   } catch (Exception e) {
            e.printStackTrace();
   }

}
}

你可能感兴趣的:(java,apache,单元测试,Excel,J#)