1.明确几个概念:
Range:它表示一个范围,这个范围可以是整个文档,也可以是里面的某一小节(Section),也可以是某一个段落(Paragraph),还可以是拥有共同属性的一段文本(CharacterRun)。
Section:word文档的一个小节,一个word文档可以由多个小节构成。
Paragraph:word文档的一个段落,一个小节可以由多个段落构成。
CharacterRun:具有相同属性的一段文本,一个段落可以由多个CharacterRun组成。
Table:一个表格。
TableRow:表格对应的行。
TableCell:表格对应的单元格。
2.依赖包:
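<dependency>
    <groupId>org.apache.poi</groupId>
    <artifactId>poi-ooxml</artifactId>
    <version>4.1.1</version>
</dependency>
<dependency>
    <groupId>org.apache.poi</groupId>
    <artifactId>poi-scratchpad</artifactId>
    <version>4.1.1</version>
</dependency>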
3.读取doc型文档
1)使用HWPFDocument 读取
import org.apache.poi.hpsf.DocumentSummaryInformation;
import org.apache.poi.hpsf.SummaryInformation;
import org.apache.poi.hwpf.HWPFDocument;
import org.apache.poi.hwpf.extractor.WordExtractor;
import org.apache.poi.hwpf.usermodel.*;
import org.apache.poi.xwpf.extractor.XWPFWordExtractor;
import org.apache.poi.xwpf.usermodel.XWPFDocument;
import org.apache.poi.xwpf.usermodel.XWPFPictureData;
import org.junit.Test;
import java.io.*;
import java.util.List;
/**
 * Demonstrates reading a binary Word (.doc) file through {@code HWPFDocument}.
 *
 * <p>The single test prints the document text, its bookmarks, the paragraphs and
 * sections of the overall range, every table cell, and every paragraph that
 * belongs to a list. All output goes to standard output.
 */
public class WordTest {

    /**
     * Opens the sample .doc file and prints its text, bookmarks, range
     * information, tables and list paragraphs.
     *
     * @throws Exception if the file cannot be opened or any read step fails
     */
    @Test
    public void testReadByDoc() throws Exception {
        // try-with-resources closes the document and the stream even when one
        // of the print steps below throws; the original only closed the stream
        // on the success path and never closed the document.
        try (InputStream is = new FileInputStream("C:\\Users\\阿劼\\Desktop\\11.doc");
             HWPFDocument doc = new HWPFDocument(is)) {
            // Print the document text.
            // NOTE(review): the original author remarked that this step did not
            // return any text for their file - not confirmed here.
            System.out.println("=========================文本信息==========================");
            System.out.println("-------------使用getDocumentText()获取文本---------------");
            System.out.println(doc.getDocumentText());
            System.out.println("-----------------使用getText()获取文本-------------------");
            System.out.println(doc.getText());

            // Bookmark information.
            this.printInfo(doc.getBookmarks());

            Range range = doc.getRange();
            // Paragraph and section information of the range.
            this.printInfo(range);
            // Tables.
            this.readTable(range);
            // List paragraphs.
            this.readList(range);
        }
    }

    /**
     * Closes the given input stream, printing the stack trace if closing fails.
     *
     * <p>{@link #testReadByDoc()} relies on try-with-resources instead; this
     * helper is kept for code that manages a stream manually.
     *
     * @param is the stream to close; {@code null} is ignored
     */
    private void closeStream(InputStream is) {
        if (is != null) {
            try {
                is.close();
            } catch (IOException e) {
                e.printStackTrace();
            }
        }
    }

    /**
     * Prints the number of bookmarks and, for each bookmark, its name and its
     * start and end positions.
     *
     * @param bookmarks the bookmarks to print
     */
    private void printInfo(Bookmarks bookmarks) {
        int count = bookmarks.getBookmarksCount();
        System.out.println("=========================书签信息==========================");
        System.out.println("书签数量:" + count);
        for (int i = 0; i < count; i++) {
            Bookmark bookmark = bookmarks.getBookmark(i);
            System.out.println("书签" + (i + 1) + "的名称是:" + bookmark.getName());
            System.out.println("开始位置:" + bookmark.getStart());
            System.out.println("结束位置:" + bookmark.getEnd());
        }
    }

    /**
     * Prints every table found in the range: its start and end offsets followed
     * by the trimmed text of each cell, labelled with 1-based row and column
     * numbers.
     *
     * @param range the range to scan for tables
     */
    private void readTable(Range range) {
        System.out.println("=========================表格信息==========================");
        // Walk over all tables inside the range.
        TableIterator tableIter = new TableIterator(range);
        while (tableIter.hasNext()) {
            Table table = tableIter.next();
            int start = table.getStartOffset();
            int end = table.getEndOffset();
            System.out.printf("开始位置%d,结束位置%d\r\n", start, end);

            int rowNum = table.numRows();
            for (int j = 0; j < rowNum; j++) {
                TableRow row = table.getRow(j);
                int cellNum = row.numCells();
                for (int k = 0; k < cellNum; k++) {
                    TableCell cell = row.getCell(k);
                    // Text of the cell, trimmed before printing.
                    System.out.printf("第%d行第%d列的内容是: %s", j + 1, k + 1, cell.text().trim());
                    System.out.println();
                }
            }
        }
    }

    /**
     * Prints the text of every paragraph in the range that reports itself as
     * being part of a list.
     *
     * @param range the range whose paragraphs are inspected
     */
    private void readList(Range range) {
        System.out.println("=========================列表信息==========================");
        int num = range.numParagraphs();
        for (int i = 0; i < num; i++) {
            Paragraph paragraph = range.getParagraph(i);
            if (paragraph.isInList()) {
                System.out.println("list : " + paragraph.text());
            }
        }
    }

    /**
     * Prints the paragraph count and each paragraph's text, then the section
     * count and each section's text.
     *
     * @param range the range to describe
     */
    private void printInfo(Range range) {
        System.out.println("=========================Range信息==========================");
        System.out.println("-------------------------段落信息-------------------------");
        int paraNum = range.numParagraphs();
        System.out.println("段落数为 : " + paraNum);
        for (int i = 0; i < paraNum; i++) {
            System.out.println("段落" + (i + 1) + "内容为:" + range.getParagraph(i).text());
        }

        System.out.println("-------------------------小节信息-------------------------");
        int secNum = range.numSections();
        // The section label must report the section count; the original printed
        // the paragraph count here and the section count on a separate bare line.
        System.out.println("小节数为 : " + secNum);
        for (int i = 0; i < secNum; i++) {
            Section section = range.getSection(i);
            System.out.println(section.text());
        }
    }
}
2)使用WordExtractor 读取
import org.apache.poi.hpsf.DocumentSummaryInformation;
import org.apache.poi.hpsf.SummaryInformation;
import org.apache.poi.hwpf.HWPFDocument;
import org.apache.poi.hwpf.extractor.WordExtractor;
import org.apache.poi.hwpf.usermodel.*;
import org.apache.poi.xwpf.extractor.XWPFWordExtractor;
import org.apache.poi.xwpf.usermodel.XWPFDocument;
import org.apache.poi.xwpf.usermodel.XWPFPictureData;
import org.junit.Test;
import java.io.*;
import java.util.List;
/**
 * Demonstrates reading a binary Word (.doc) file through {@code WordExtractor}.
 *
 * <p>The single test prints the full text, header and footer text, metadata
 * text, each paragraph, and selected summary properties to standard output.
 */
public class WordTest {

    /**
     * Opens the sample .doc file with a {@code WordExtractor} and prints its
     * text, header, footer, metadata, paragraphs and summary properties.
     *
     * @throws Exception if the file cannot be opened or any read step fails
     */
    @Test
    public void readByExtractorTest() throws Exception {
        // try-with-resources closes the extractor and the stream even when one
        // of the print steps below throws; the original only closed the stream
        // on the success path and never closed the extractor.
        try (InputStream is = new FileInputStream("C:\\Users\\阿劼\\Desktop\\11.doc");
             WordExtractor extractor = new WordExtractor(is)) {
            // All text of the document.
            System.out.println("---------------文档中所有文本----------------");
            System.out.println(extractor.getText());
            // Header text.
            System.out.println("-------------------页眉-----------------");
            System.out.println(extractor.getHeaderText());
            // Footer text.
            System.out.println("------------------页脚------------------");
            System.out.println(extractor.getFooterText());
            // Text produced by the metadata extractor.
            System.out.println("------------------元数据信息-------------------");
            System.out.println(extractor.getMetadataTextExtractor().getText());
            // Text of each paragraph.
            System.out.println("=======================每个段落信息=========================");
            String[] paraTexts = extractor.getParagraphText();
            for (int i = 0; i < paraTexts.length; i++) {
                System.out.println("------------------段落" + (i + 1) + "----------------");
                System.out.println("Paragraph " + (i + 1) + " : " + paraTexts[i]);
            }
            // Summary properties of the document.
            this.printInfo(extractor.getSummaryInformation());
            // Document-summary properties of the document.
            this.printInfo(extractor.getDocSummaryInformation());
        }
    }

    /**
     * Prints author, character count, page count, title and subject from the
     * given summary information.
     *
     * @param info the summary information; when {@code null} only a notice is
     *             printed after the section banner
     */
    private void printInfo(SummaryInformation info) {
        System.out.println("===================从getSummaryInformation中获取信息===============");
        if (info == null) {
            // Guard against a missing property set instead of failing with a
            // NullPointerException.
            System.out.println("未读取到SummaryInformation");
            return;
        }
        // Author.
        System.out.println("---------------------作者----------------------");
        System.out.println(info.getAuthor());
        // Character count.
        System.out.println("---------------------字符----------------------");
        System.out.println(info.getCharCount());
        // Page count.
        System.out.println("---------------------页数----------------------");
        System.out.println(info.getPageCount());
        // Title.
        System.out.println("---------------------标题----------------------");
        System.out.println(info.getTitle());
        // Subject.
        System.out.println("---------------------主题----------------------");
        System.out.println(info.getSubject());
    }

    /**
     * Prints category and company from the given document summary information.
     *
     * @param info the document summary information; when {@code null} only a
     *             notice is printed after the section banner
     */
    private void printInfo(DocumentSummaryInformation info) {
        System.out.println("===================从getDocSummaryInformation中获取信息===============");
        if (info == null) {
            // Guard against a missing property set instead of failing with a
            // NullPointerException.
            System.out.println("未读取到DocumentSummaryInformation");
            return;
        }
        // Category.
        System.out.println("---------------------分类----------------------");
        System.out.println(info.getCategory());
        // Company.
        System.out.println("---------------------公司----------------------");
        System.out.println(info.getCompany());
    }
}
4.读取docx型文件
import org.apache.poi.hpsf.DocumentSummaryInformation;
import org.apache.poi.hpsf.SummaryInformation;
import org.apache.poi.hwpf.HWPFDocument;
import org.apache.poi.hwpf.extractor.WordExtractor;
import org.apache.poi.hwpf.usermodel.*;
import org.apache.poi.xwpf.extractor.XWPFWordExtractor;
import org.apache.poi.xwpf.usermodel.XWPFDocument;
import org.apache.poi.xwpf.usermodel.XWPFPictureData;
import org.junit.Test;
import java.io.*;
import java.util.List;
/**
 * Demonstrates reading an OOXML Word (.docx) file through {@code XWPFDocument}
 * and {@code XWPFWordExtractor}.
 */
public class WordTest {

    /**
     * Opens the sample .docx file, prints its extracted text and the number of
     * pictures it contains. Any failure is reported by printing the stack
     * trace rather than by failing the test, as in the original snippet.
     */
    @Test
    public void poiReadDocxTest() {
        File file = new File("C:\\Users\\阿劼\\Desktop\\0.docx");
        // try-with-resources closes the document and the stream on every path;
        // the original skipped fis.close() whenever an earlier line threw and
        // never closed the document.
        try (FileInputStream fis = new FileInputStream(file);
             XWPFDocument xdoc = new XWPFDocument(fis)) {
            XWPFWordExtractor extractor = new XWPFWordExtractor(xdoc);
            String doc1 = extractor.getText();
            System.out.println(doc1);
            // Typed list instead of the raw List; the result was previously
            // fetched but never used, so report how many pictures were found.
            List<XWPFPictureData> allPictures = xdoc.getAllPictures();
            System.out.println("图片数量:" + allPictures.size());
        } catch (Exception e) {
            e.printStackTrace();
        }
    }
}
感谢https://www.jb51.net/article/101910.htm
和https://www.cnblogs.com/Renyi-Fan/p/8147650.html两篇文章