poi读取段落demo

package com.ocr.word.poi;

import java.io.FileInputStream;
import java.util.List;

import org.apache.poi.openxml4j.opc.OPCPackage;
import org.apache.poi.xwpf.usermodel.XWPFDocument;
import org.apache.poi.xwpf.usermodel.XWPFParagraph;

/**
 * Demo: dumps basic properties of every paragraph of a .docx file to standard output
 * using the Apache POI XWPF API.
 */
public class ParagraphReader {
    /**
     * Opens the hard-coded document {@code D:\word\aaa.docx} and, for each paragraph,
     * prints its text, alignment, number of runs, style id, numbering format and the
     * word-wrap flag, followed by a separator line.
     *
     * <p>Any failure (missing file, malformed package, ...) is reported by printing the
     * stack trace; the method never throws.
     *
     * @param args ignored
     */
    public static void main(String[] args) {
        // try-with-resources: the document and the underlying stream are released even
        // when reading fails half-way through.
        try (FileInputStream fis = new FileInputStream("D:\\word\\aaa.docx");
             XWPFDocument xdoc = new XWPFDocument(OPCPackage.open(fis))) {

            List<XWPFParagraph> paragraphList = xdoc.getParagraphs();

            for (XWPFParagraph paragraph : paragraphList) {
                System.out.println(paragraph.getText());
                System.out.println(paragraph.getAlignment());
                // println (not print) so the run count is not glued to the style id below.
                System.out.println(paragraph.getRuns().size());
                System.out.println(paragraph.getStyle());

                // Numbering format of this paragraph, e.g. bullet or lowerLetter.
                System.out.println(paragraph.getNumFmt());

                System.out.println(paragraph.isWordWrapped());

                System.out.println("********************************************************************");
            }
        } catch (Exception ex) {
            ex.printStackTrace();
        }
    }
}

poi读取表格简单demo

package com.ocr.word.poi;

import java.io.FileInputStream;
import java.util.Iterator;
import java.util.List;

import org.apache.poi.openxml4j.opc.OPCPackage;
import org.apache.poi.xwpf.usermodel.IBodyElement;
import org.apache.poi.xwpf.usermodel.XWPFDocument;
import org.apache.poi.xwpf.usermodel.XWPFParagraph;
import org.apache.poi.xwpf.usermodel.XWPFTable;

/**
 * Demo: prints the text of every cell of every top-level table of a .docx file
 * using the Apache POI XWPF API.
 */
public class TableReader {
    /**
     * Walks the body elements of the hard-coded document {@code D:\word\aaa.docx} in
     * document order and, for each table encountered, prints its row count followed by
     * the text of each cell (row by row).
     *
     * <p>Any failure is reported by printing the stack trace; the method never throws.
     *
     * @param args ignored
     */
    public static void main(String[] args) {
        // try-with-resources: the document and the underlying stream are always released.
        try (FileInputStream fis = new FileInputStream("D:\\word\\aaa.docx");
             XWPFDocument xdoc = new XWPFDocument(OPCPackage.open(fis))) {

            Iterator<IBodyElement> bodyElementIterator = xdoc.getBodyElementsIterator();
            while (bodyElementIterator.hasNext()) {
                IBodyElement element = bodyElementIterator.next();
                if (!(element instanceof XWPFTable)) {
                    continue;
                }

                // Process the table element itself. Asking the enclosing body for all of
                // its tables here would print every table once per table in the document.
                XWPFTable table = (XWPFTable) element;
                System.out.println("Total Number of Rows of Table:" + table.getNumberOfRows());

                int rowCount = table.getRows().size();
                for (int i = 0; i < rowCount; i++) {
                    int cellCount = table.getRow(i).getTableCells().size();
                    for (int j = 0; j < cellCount; j++) {
                        System.out.println(table.getRow(i).getCell(j).getText());
                    }
                }
            }
        } catch (Exception ex) {
            ex.printStackTrace();
        }
    }
}

poi读取样式

package com.ocr.word.poi;
import java.io.FileInputStream;
import java.util.Iterator;
import java.util.List;

import org.apache.poi.openxml4j.opc.OPCPackage;
import org.apache.poi.xwpf.usermodel.XWPFDocument;
import org.apache.poi.xwpf.usermodel.XWPFParagraph;
import org.apache.poi.xwpf.usermodel.XWPFRun;

/**
 * Demo: prints character-level formatting of every run of every paragraph of a
 * .docx file using the Apache POI XWPF API.
 */
public class StyleReader {

    /**
     * Opens the hard-coded document {@code D:\word\aaa.docx} and, for each run of each
     * paragraph, prints whether it is bold, highlighted and capitalized, and its font
     * size. A separator line is printed after each paragraph.
     *
     * <p>Any failure is reported by printing the stack trace; the method never throws.
     *
     * @param args ignored
     */
    public static void main(String[] args) {
        // try-with-resources: the document and the underlying stream are always released.
        try (FileInputStream fis = new FileInputStream("D:\\word\\aaa.docx");
             XWPFDocument xdoc = new XWPFDocument(OPCPackage.open(fis))) {

            List<XWPFParagraph> paragraphList = xdoc.getParagraphs();

            for (XWPFParagraph paragraph : paragraphList) {
                for (XWPFRun rn : paragraph.getRuns()) {
                    System.out.println(rn.isBold());
                    System.out.println(rn.isHighlighted());
                    System.out.println(rn.isCapitalized());
                    System.out.println(rn.getFontSize());
                }

                System.out.println("********************************************************************");
            }
        } catch (Exception ex) {
            ex.printStackTrace();
        }
    }

}

poi读取doc和docx文本demo

package com.ocr.word;

import java.io.File;
import java.io.FileInputStream;
import java.io.InputStream;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;

import org.apache.poi.POIXMLDocument;
import org.apache.poi.hwpf.HWPFDocument;
import org.apache.poi.hwpf.extractor.WordExtractor;
import org.apache.poi.openxml4j.opc.OPCPackage;
import org.apache.poi.xwpf.extractor.XWPFWordExtractor;
import org.apache.poi.xwpf.usermodel.XWPFDocument;
import org.apache.poi.xwpf.usermodel.XWPFParagraph;
import org.apache.poi.xwpf.usermodel.XWPFRun;
import org.apache.poi.xwpf.usermodel.XWPFTable;
import org.apache.poi.xwpf.usermodel.XWPFTableCell;
import org.apache.poi.xwpf.usermodel.XWPFTableRow;
import org.springframework.util.FileCopyUtils;

/**
 * Demo: extracts the plain text of a Word document, either legacy {@code .doc}
 * (via POI HWPF) or {@code .docx} (via POI XWPF), and prints it.
 */
public class Readword {

    // Offset between full-width and half-width character code points.
    public static final char DBC_SBC_STEP = 65248;
    private static final String RESULT_DATA = "data";
    private static final String RESULT_ANGLE = "angle";
    private static final String RESULT_WORDS_INFO = "prism_wordsInfo";

    /**
     * Converts a string to a sequence of {@code \}{@code uXXXX} escapes, one per UTF-16
     * code unit.
     *
     * <p>Each escape is zero-padded to four hex digits so that the result is a valid
     * escape sequence for every character (e.g. {@code 'a'} becomes {@code \}{@code u0061}).
     *
     * @param str the text to convert; must not be {@code null}
     * @return the escaped representation; empty for an empty input
     */
    public static String stringToUnicode(String str) {
        StringBuilder sb = new StringBuilder(str.length() * 6);
        for (int i = 0; i < str.length(); i++) {
            sb.append(String.format("\\u%04x", (int) str.charAt(i)));
        }
        return sb.toString();
    }

    /**
     * Prints the text of a paragraph followed by its alignment. An empty paragraph
     * (no runs) is reported as a bare line break; a paragraph whose runs carry no text
     * prints nothing.
     *
     * @param paragraph the paragraph to print
     */
    private static void getParagraphText(XWPFParagraph paragraph) {
        // All runs (formatted text fragments) of the paragraph.
        List<XWPFRun> runs = paragraph.getRuns();
        if (runs.isEmpty()) {
            System.out.println("按了回车(新段落)");
            return;
        }
        StringBuilder runText = new StringBuilder();
        for (XWPFRun run : runs) {
            runText.append(run.text());
        }
        if (runText.length() > 0) {
            runText.append(",对齐方式:").append(paragraph.getAlignment().name());
            System.out.println(runText);
        }
    }

    /**
     * Prints the content of every cell of a table, paragraph by paragraph.
     *
     * @param table the table to print
     */
    private static void getTabelText(XWPFTable table) {
        List<XWPFTableRow> rows = table.getRows();

        for (XWPFTableRow row : rows) {
            List<XWPFTableCell> cells = row.getTableCells();
            for (XWPFTableCell cell : cells) {
                // cell.getText() would be simpler but loses per-paragraph alignment.
                // A cell behaves like a small document: it can hold paragraphs and tables.
                List<XWPFParagraph> paragraphs = cell.getParagraphs();
                for (XWPFParagraph paragraph : paragraphs) {
                    getParagraphText(paragraph);
                }
            }
        }
    }

    /**
     * Reads the hard-coded document, prints each paragraph as it is read, then prints
     * the accumulated text and a map of "paragraph label -> paragraph text".
     *
     * <p>Any failure is reported by printing the stack trace; the method never throws.
     *
     * @param args ignored
     */
    public static void main(String[] args) {
        String filePath = "D:\\word\\aaa.doc";
        StringBuilder wordMapStr = new StringBuilder();

        // Insertion-ordered so paragraphs are listed in document order.
        Map<String, String> wordMap = new LinkedHashMap<>();
        try {
            if (filePath.endsWith(".doc")) {
                // try-with-resources: the stream is closed even if extraction fails.
                try (InputStream fis = new FileInputStream(new File(filePath))) {
                    // HWPF WordExtractor pulls plain text paragraphs out of a legacy .doc.
                    WordExtractor wordExtractor = new WordExtractor(fis);
                    int i = 1;
                    for (String words : wordExtractor.getParagraphText()) {
                        System.out.println(words);
                        wordMap.put("DOC文档,第(" + i + ")段内容", words);
                        // NOTE(review): the original called words.replaceAll("", "") here,
                        // which is a no-op; an invisible control character was probably
                        // lost from the pattern. Confirm which character should be stripped.
                        wordMapStr.append(words).append("\n");
                        i++;
                    }
                }
            } else if (filePath.endsWith(".docx")) {
                // Work on a private copy: openPackage opens the file read-write.
                // A unique temp file avoids clobbering an unrelated "tempFile.docx".
                File uFile = File.createTempFile("readword-", ".docx");
                try {
                    FileCopyUtils.copy(new File(filePath), uFile);
                    // Close the document before deleting the copy; an open handle makes
                    // the delete fail on Windows.
                    try (XWPFDocument document =
                            new XWPFDocument(POIXMLDocument.openPackage(uFile.getPath()))) {
                        List<XWPFParagraph> paras = document.getParagraphs();
                        int i = 1;
                        for (XWPFParagraph paragraph : paras) {
                            String words = paragraph.getText();
                            System.out.println(words);
                            wordMap.put("DOCX文档,第(" + i + ")段内容", words + "\n");
                            wordMapStr.append(words);
                            i++;
                        }
                        for (XWPFTable table : document.getTables()) {
                            wordMapStr.append(table.getText());
                        }
                    }
                } finally {
                    uFile.delete();
                }
            }
        } catch (Exception e) {
            e.printStackTrace();
        }
        System.out.println("-->" + wordMapStr.toString());
        System.out.println(wordMap);
    }

}