使用 Apache POI 读取 Word 文档(.doc / .docx)中的文本内容

package com.test.document.util;

import java.io.File;

import java.io.FileInputStream;

import java.io.IOException;

import java.io.InputStream;

import org.apache.poi.POIXMLDocument;

import org.apache.poi.POIXMLTextExtractor;

import org.apache.poi.hwpf.extractor.WordExtractor;

import org.apache.poi.openxml4j.opc.OPCPackage;

import org.apache.poi.xwpf.extractor.XWPFWordExtractor;

/**
 * Utility for extracting the plain text of Microsoft Word documents with Apache POI.
 *
 * <p>The format is chosen from the file extension: {@code .doc} files go through
 * {@link WordExtractor}, {@code .docx} files through {@link XWPFWordExtractor}.
 * The original author notes this was written against POI 3.8.
 */
public class ReadWordUtil {

    /** Sample document read by {@link #main(String[])} when no argument is given. */
    private static final String DEFAULT_PATH = "c:\\Oracle安装和配置.doc";

    /**
     * Prints the text of a Word document to standard output.
     *
     * @param args optional; {@code args[0]} is the path of the document to read.
     *             When absent, the built-in sample path is used.
     * @throws Exception declared for compatibility with the original signature
     */
    public static void main(String[] args) throws Exception {
        String path = (args != null && args.length > 0) ? args[0] : DEFAULT_PATH;

        String content = readWord(path);

        System.out.println(content);
    }

    /**
     * Reads the text of a Word document using Apache POI.
     *
     * <p>This is a best-effort call: any failure while opening or parsing the
     * file is printed to standard error and an empty string is returned.
     *
     * @param path path of the Word document; the extension ({@code doc} or
     *             {@code docx}, compared case-insensitively) selects the parser
     * @return the trimmed document text, or an empty string when {@code path}
     *         is {@code null}, the extension is not supported, or reading fails
     */
    public static String readWord(String path) {
        if (path == null) {
            return "";
        }

        // With no '.', lastIndexOf yields -1 and the whole path becomes the
        // "suffix", which then matches neither branch below.
        String suffix = path.substring(path.lastIndexOf('.') + 1);

        try {
            if ("doc".equalsIgnoreCase(suffix)) {
                return readDoc(path);
            }
            if ("docx".equalsIgnoreCase(suffix)) {
                return readDocx(path);
            }
        } catch (Exception e) {
            // Deliberately best-effort: report the problem and fall through to
            // the empty result instead of propagating to the caller.
            e.printStackTrace();
        }

        return "";
    }

    /**
     * Extracts the text of a Word 2003 ({@code .doc}) file.
     * Original author's note: images are not read.
     */
    private static String readDoc(String path) throws IOException {
        InputStream is = new FileInputStream(new File(path));
        try {
            WordExtractor ex = new WordExtractor(is);
            return ex.getText().trim();
        } finally {
            try {
                is.close();
            } catch (IOException e) {
                // A failed close must not discard text that was already extracted.
                e.printStackTrace();
            }
        }
    }

    /**
     * Extracts the text of a Word 2007 ({@code .docx}) file.
     * Original author's note: images are not read, and table data ends up at
     * the end of the returned string.
     */
    private static String readDocx(String path) throws Exception {
        OPCPackage opcPackage = POIXMLDocument.openPackage(path);
        try {
            POIXMLTextExtractor extractor = new XWPFWordExtractor(opcPackage);
            return extractor.getText().trim();
        } finally {
            // Release the underlying file without saving: the package was only
            // opened for reading, and revert() closes it without writing back.
            opcPackage.revert();
        }
    }

}

你可能感兴趣的:(poi 读取word文档中内容)