用Apache POI提取Word文本

编程语言Java

POI版本为3.17,jar包可自行去官网下载。

 

达到的最终效果是:去除Word中的表格、超文本、页眉和页脚,取出第一张图片存入硬盘并返回其路径;同时也对“doc文件被直接修改扩展名为docx”以及“docx文件被直接修改扩展名为doc”这两种情况进行处理(通过捕获异常的方式)。

我们的项目需求是只要一张图片,你当然可以取任意张。但注意docx文件取出的图片不是顺序的,要想按顺序读出图片请参考:https://www.cnblogs.com/ct-csu/p/8178932.html

参考文献:

http://poi.apache.org/components/document/index.html

http://poi.apache.org/apidocs/dev/org/apache/poi/hwpf/

http://poi.apache.org/apidocs/dev/org/apache/poi/xwpf/

/**
 * Directory that extracted pictures are written to. Use '/' separators on Linux
 * (e.g. /home/java_pic/) and escaped backslashes on Windows.
 */
private static final String PICTURE_DIR = "/home/java_pic/";

/** Marker placed between paragraphs in the returned "txt" value. */
private static final String PARAGRAPH_SEPARATOR = "_|_";

/**
 * Cell/row terminator that the binary .doc format places in table paragraphs (U+0007).
 * NOTE(review): the literal at this spot was empty in the previous revision, most likely a
 * control character lost in transit; confirm that U+0007 is the intended marker.
 */
private static final char TABLE_CELL_MARK = '\u0007';

/**
 * Extracts the body text and the first embedded picture of a Word document.
 *
 * <p>The file is parsed according to its extension (".doc" as binary Word, ".docx" as OOXML;
 * the comparison ignores case). If the content turns out to be the other format, i.e. the
 * file was merely renamed, POI signals this with an exception and the file is re-parsed with
 * the other parser.
 *
 * <p>Only body paragraphs are read, which is intended to leave out headers, footers and
 * tables. The first picture, if any, is written to {@link #PICTURE_DIR} under a random name.
 *
 * @param path path of the Word file to read
 * @return the string form of a JSON object with the keys "txt" (normalised text, paragraphs
 *     separated by "_|_") and "picUrl" (path of the saved picture, or "" if the document has
 *     none). The object is empty when the path is null, is not an existing regular file, or
 *     does not end in .doc/.docx. If an I/O error occurs the error is reported on
 *     {@code System.err} and whatever was collected up to that point is returned.
 * @author lidw
 * @date 2018-12-25
 */
public String doWordExtract(String path) {
  JSONObject jsonObject = new JSONObject();
  if (path == null) {
    return jsonObject.toString();
  }
  File file = new File(path);
  if (!file.isFile()) { // isFile() is false for missing paths as well
    return jsonObject.toString();
  }
  try {
    if (hasExtension(path, ".doc")) {
      try {
        extractFromDoc(file, jsonObject);
      } catch (OfficeXmlFileException wrongFormat) {
        // A .docx file that was renamed to .doc: parse it as OOXML instead.
        extractFromDocx(file, jsonObject);
      }
    } else if (hasExtension(path, ".docx")) {
      try {
        extractFromDocx(file, jsonObject);
      } catch (OLE2NotOfficeXmlFileException wrongFormat) {
        // A .doc file that was renamed to .docx: parse it as binary Word instead.
        extractFromDoc(file, jsonObject);
      }
    } else {
      System.out.println("此文件不是word文件!");
    }
  } catch (IOException e) {
    // Best effort: keep what has been extracted so far, but do not hide the failure.
    System.err.println("Failed to extract Word content from " + path + ": " + e);
  }
  return jsonObject.toString();
}

/** Reads a binary .doc file and stores its text and first picture in {@code result}. */
private static void extractFromDoc(File file, JSONObject result) throws IOException {
  // A fresh stream per attempt, closed even when the constructor rejects the format.
  try (InputStream in = new FileInputStream(file);
      HWPFDocument doc = new HWPFDocument(in)) {
    WordExtractor extractor = new WordExtractor(doc);
    StringBuilder text = new StringBuilder();
    for (String paragraph : extractor.getParagraphText()) {
      // Paragraphs carrying the cell mark belong to a table and are skipped.
      if (paragraph.indexOf(TABLE_CELL_MARK) == -1) {
        text.append(WordExtractor.stripFields(paragraph));
      }
    }
    result.put("txt", normalizeText(text.toString()));

    List<Picture> pictures = doc.getPicturesTable().getAllPictures();
    if (pictures == null || pictures.isEmpty()) {
      result.put("picUrl", "");
      return;
    }
    Picture first = pictures.get(0);
    String picUrl = newPicturePath(first.suggestFileExtension());
    try (OutputStream out = new FileOutputStream(picUrl)) {
      first.writeImageContent(out);
    }
    result.put("picUrl", picUrl);
  }
}

/** Reads an OOXML .docx file and stores its text and first picture in {@code result}. */
private static void extractFromDocx(File file, JSONObject result) throws IOException {
  try (InputStream in = new FileInputStream(file);
      XWPFDocument docx = new XWPFDocument(in)) {
    StringBuilder text = new StringBuilder();
    for (XWPFParagraph paragraph : docx.getParagraphs()) {
      text.append(paragraph.getText()).append(PARAGRAPH_SEPARATOR);
    }
    result.put("txt", normalizeText(text.toString()));

    // Note: POI does not return .docx pictures in document order.
    List<XWPFPictureData> pictures = docx.getAllPictures();
    if (pictures == null || pictures.isEmpty()) {
      result.put("picUrl", "");
      return;
    }
    XWPFPictureData first = pictures.get(0);
    String picUrl = newPicturePath(first.suggestFileExtension());
    try (OutputStream out = new FileOutputStream(picUrl)) {
      out.write(first.getData());
    }
    result.put("picUrl", picUrl);
  }
}

/**
 * Removes cell marks, carriage returns and spaces, turns line feeds into the paragraph
 * separator and collapses consecutive separators into one.
 */
private static String normalizeText(String raw) {
  return raw.replaceAll("[\\x07\\r ]", "")
      .replace("\n", PARAGRAPH_SEPARATOR)
      // Match the separator as a whole token so a lone '_' or '|' in the text is kept as is.
      .replaceAll("(?:_\\|_)+", PARAGRAPH_SEPARATOR);
}

/** Builds a unique target path inside {@link #PICTURE_DIR} for a picture with the given extension. */
private static String newPicturePath(String extension) {
  return PICTURE_DIR + UUID.randomUUID() + "." + extension;
}

/** Tells whether {@code path} ends with {@code extension}, ignoring case. */
private static boolean hasExtension(String path, String extension) {
  int offset = path.length() - extension.length();
  // regionMatches returns false for a negative offset, i.e. when the path is too short.
  return path.regionMatches(true, offset, extension, 0, extension.length());
}

 

你可能感兴趣的:(实习所得)