用pdfbox的jar包来解析pdf:
import java.io.ByteArrayOutputStream;
import java.io.File;
import java.io.OutputStreamWriter;
import org.pdfbox.pdmodel.PDDocument;
import org.pdfbox.util.PDFTextStripper;
/**
 * Extracts the plain text of a PDF file with PDFBox and prints it.
 */
public class Pdf2text {

    /**
     * Charset used both to encode the stripper output and to decode it again.
     * Fixing it explicitly avoids losing characters that the platform default
     * charset cannot represent.
     */
    private static final String ENCODING = "UTF-8";

    /**
     * Reads the given PDF file and returns its text content.
     *
     * <p>Any {@link Exception} raised while loading or stripping the document
     * is printed to standard error and an empty string is returned instead.
     * The document is always closed, even when extraction fails.
     *
     * @param f the PDF file to read
     * @return the extracted text, or an empty string if extraction failed
     * @throws Exception kept in the signature for existing callers; every
     *         {@code Exception} raised inside is caught and reported here
     */
    public static String getTxt(File f) throws Exception {
        String ts = "";
        PDDocument pdfdocument = null;
        try {
            pdfdocument = PDDocument.load(f);
            ByteArrayOutputStream out = new ByteArrayOutputStream();
            OutputStreamWriter writer = new OutputStreamWriter(out, ENCODING);
            PDFTextStripper stripper = new PDFTextStripper();
            stripper.writeText(pdfdocument.getDocument(), writer);
            // Closing the writer flushes its buffered characters into 'out'
            // before the bytes are read back.
            writer.close();
            byte[] contents = out.toByteArray();
            ts = new String(contents, ENCODING);
            System.out.println(f.getName() + "length is:" + contents.length + "\n");
        } catch (Exception e) {
            e.printStackTrace();
        } finally {
            // Release the document even when loading/stripping failed.
            if (pdfdocument != null) {
                try {
                    pdfdocument.close();
                } catch (Exception e) {
                    e.printStackTrace();
                }
            }
        }
        // Returning after (not inside) the finally block lets Errors propagate
        // instead of being silently discarded.
        return ts;
    }

    /**
     * Extracts and prints the text of a fixed sample PDF.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        File file = new File("E:/600536_2008_zzy.pdf");
        try {
            System.out.println(Pdf2text.getTxt(file));
        } catch (Exception e) {
            // TODO auto-generated catch block
            e.printStackTrace();
        }
    }
}
======================
Word、Excel 和 PPT 都可以用 POI 的 jar 包来解析:
import java.io.File;
import org.apache.poi.POITextExtractor;
import org.apache.poi.extractor.ExtractorFactory;
/**
 * Prints the text of an Office document obtained through POI's
 * {@code ExtractorFactory}.
 */
public class DocxParser {

    /**
     * Creates an extractor for a fixed input file and prints its text between
     * two separator lines. Any exception is reported with a stack trace.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        // The original listing also carried commented-out alternative inputs:
        // test.pptx, test.xlsx, test.xls, test.doc and test.ppt (same D: folder).
        final String rule = "====================";
        final File source = new File("D:\\test.docx");
        try {
            final POITextExtractor textSource = ExtractorFactory.createExtractor(source);
            System.out.println("Document Text: ");
            System.out.println(rule);
            System.out.println(textSource.getText());
            System.out.println(rule);
        } catch (Exception failure) {
            failure.printStackTrace();
        }
    }
}
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import org.apache.poi.hwpf.extractor.WordExtractor;
/**
 * Prints the text of a legacy Word (.doc) file using POI's
 * {@code WordExtractor}.
 */
public class Word2text {

    /**
     * Opens a fixed .doc file, prints its extracted text and closes the
     * input stream afterwards. I/O failures are reported with a stack trace.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        File file = new File("E:\\2009.doc");
        FileInputStream fis = null;
        try {
            fis = new FileInputStream(file);
            WordExtractor wordExtractor = new WordExtractor(fis);
            System.out.println("【 使用getText()方法提取的Word文件的内容如下所示:】");
            System.out.println(wordExtractor.getText());
        } catch (FileNotFoundException e) {
            e.printStackTrace();
        } catch (IOException e) {
            e.printStackTrace();
        } finally {
            // The stream was previously never closed; release it on every path.
            if (fis != null) {
                try {
                    fis.close();
                } catch (IOException e) {
                    e.printStackTrace();
                }
            }
        }
    }
}
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.InputStream;
import org.apache.poi.hslf.HSLFSlideShow;
import org.apache.poi.hslf.model.TextRun;
import org.apache.poi.hslf.model.Slide;
import org.apache.poi.hslf.usermodel.SlideShow;
/**
 * Prints the text of a legacy PowerPoint (.ppt) file using POI HSLF.
 */
public class Ppt2text {

    /**
     * Opens a fixed .ppt file, prints its text and closes the stream.
     *
     * @param args unused
     * @throws FileNotFoundException if the input file cannot be opened
     */
    public static void main(String[] args) throws FileNotFoundException {
        File file = new File("E:\\1025681983.ppt");
        InputStream fis = new FileInputStream(file);
        try {
            getDocument(fis);
        } catch (Exception e) {
            e.printStackTrace();
        } finally {
            // The stream was previously left open; release it on every path.
            try {
                fis.close();
            } catch (java.io.IOException e) {
                e.printStackTrace();
            }
        }
    }

    /**
     * Reads a slide show from the stream and prints the concatenated text of
     * every slide: all text runs of a slide followed by its title, if any.
     *
     * <p>Failures while reading are caught and their {@code toString()} is
     * printed to standard output; the stream is not closed by this method.
     *
     * @param is input stream of the .ppt file
     * @throws Exception kept in the signature for existing callers; every
     *         {@code Exception} raised inside is caught and reported here
     */
    public static void getDocument(InputStream is) throws Exception {
        StringBuffer content = new StringBuffer();
        try {
            // Build the SlideShow from the file's InputStream.
            SlideShow ss = new SlideShow(new HSLFSlideShow(is));
            // Fetch every slide.
            Slide[] slides = ss.getSlides();
            for (int i = 0; i < slides.length; i++) {
                // The text runs carry the slide's text content.
                TextRun[] t = slides[i].getTextRuns();
                for (int j = 0; j < t.length; j++) {
                    content.append(t[j].getText());
                }
                // Skip a missing title: appending null would add the
                // literal text "null" to the output.
                String title = slides[i].getTitle();
                if (title != null) {
                    content.append(title);
                }
            }
            System.out.println(content.toString());
        } catch (Exception ex) {
            System.out.println(ex.toString());
        }
    }
}
=============
Excel 文件也可以用 jxl 的 jar 包来解析:
import java.io.File;
import jxl.Cell;
import jxl.CellType;
import jxl.DateCell;
import jxl.NumberCell;
import jxl.Sheet;
import jxl.Workbook;
/**
 * Prints the first columns of the first sheet of an .xls file using jxl.
 */
public class Excel2text {

    /**
     * Opens a fixed workbook and prints up to three columns of every row of
     * its first sheet, tab-separated, one row per line. Failures are reported
     * with a stack trace and the workbook is always closed.
     *
     * @param args unused
     */
    public static void main(String args[]) {
        Workbook workbook = null;
        try {
            try {
                workbook = Workbook.getWorkbook(new File("e:\\Dealerlist_3.xls"));
            } catch (Exception e) {
                // Keep the original cause so the real reason is not lost.
                throw new Exception("file to import not found!", e);
            }
            Sheet sheet = workbook.getSheet(0);
            // Print at most 3 columns, but never more than the sheet has,
            // so narrower sheets do not index out of range.
            int columnCount = Math.min(3, sheet.getColumns());
            int rowCount = sheet.getRows();
            for (int i = 0; i < rowCount; i++) {
                for (int j = 0; j < columnCount; j++) {
                    // Note: getCell takes (column, row), in that order.
                    Cell cell = sheet.getCell(j, i);
                    // Handle each cell type separately; otherwise formatted
                    // content may come out wrong.
                    if (cell.getType() == CellType.NUMBER) {
                        System.out.print(((NumberCell) cell).getValue());
                    } else if (cell.getType() == CellType.DATE) {
                        System.out.print(((DateCell) cell).getDate());
                    } else {
                        System.out.print(cell.getContents());
                    }
                    System.out.print("\t");
                }
                System.out.print("\n");
            }
        } catch (Exception e) {
            // Previously swallowed silently; report the failure instead.
            e.printStackTrace();
        } finally {
            // Close the workbook on every path, otherwise memory leaks.
            if (workbook != null) {
                workbook.close();
            }
        }
    }
}
import java.io.*;
import jxl.*;
import jxl.write.*;
import jxl.format.*;
/**
 * Writes a small sample .xls workbook using jxl.
 */
public class Text2Excel {

    /**
     * Creates a workbook with one sheet holding a header cell, a title row
     * and two detail rows (label, date, currency, price), sets the column
     * widths and writes it to disk. Failures are reported with a stack trace
     * and the workbook is closed on every path.
     *
     * @param args unused
     */
    public static void main(String args[]) {
        WritableWorkbook workbook = null;
        try {
            String outputPath = "e:" + java.io.File.separator + "output00.xls";
            File tempFile = new File(outputPath);
            System.out.println(outputPath);
            workbook = Workbook.createWorkbook(tempFile);
            WritableSheet sheet = workbook.createSheet("TestCreateExcel", 0);

            // Predefined fonts and formats; best to keep the number of
            // formats in one workbook small.
            WritableFont headerFont = new WritableFont(WritableFont.ARIAL, 12,
                    WritableFont.BOLD, false, UnderlineStyle.NO_UNDERLINE,
                    jxl.format.Colour.BLUE);
            WritableCellFormat headerFormat = new WritableCellFormat(headerFont);
            WritableFont titleFont = new WritableFont(WritableFont.ARIAL, 10,
                    WritableFont.NO_BOLD, false, UnderlineStyle.NO_UNDERLINE,
                    jxl.format.Colour.RED);
            WritableCellFormat titleFormat = new WritableCellFormat(titleFont);
            WritableFont detFont = new WritableFont(WritableFont.ARIAL, 10,
                    WritableFont.NO_BOLD, false, UnderlineStyle.NO_UNDERLINE,
                    jxl.format.Colour.BLACK);
            WritableCellFormat detFormat = new WritableCellFormat(detFont);
            NumberFormat nf = new NumberFormat("0.00000"); // format for numbers
            WritableCellFormat priceFormat = new WritableCellFormat(detFont, nf);
            DateFormat df = new DateFormat("yyyy-MM-dd"); // format for dates
            WritableCellFormat dateFormat = new WritableCellFormat(detFont, df);

            // Build cells from the content and formats above and add them
            // to the sheet.
            sheet.addCell(new Label(0, 0, "用于测试的Excel文件", headerFormat));

            // Title row (row index 2).
            String[] titles = { "标题", "日期", "货币", "价格" };
            for (int column = 0; column < titles.length; column++) {
                sheet.addCell(new Label(column, 2, titles[column], titleFormat));
            }

            // Detail rows, starting at row index 3.
            String[] currencies = { "CNY", "SGD" };
            double[] prices = { 5.678, 98832 };
            for (int i = 0; i < currencies.length; i++) {
                int row = i + 3;
                sheet.addCell(new Label(0, row, "标题 " + i, detFormat));
                sheet.addCell(new DateTime(1, row, new java.util.Date(), dateFormat));
                sheet.addCell(new Label(2, row, currencies[i], detFormat));
                sheet.addCell(new jxl.write.Number(3, row, prices[i], priceFormat));
            }

            // Set the column widths.
            int[] widths = { 20, 20, 10, 20 };
            for (int column = 0; column < widths.length; column++) {
                sheet.setColumnView(column, widths[column]);
            }

            workbook.write();
        } catch (Exception e) {
            e.printStackTrace();
        } finally {
            // Previously the workbook stayed open if any step above threw.
            if (workbook != null) {
                try {
                    workbook.close();
                } catch (Exception e) {
                    e.printStackTrace();
                }
            }
        }
    }
}