使用Java的POI进行Word文档的解析并生成XML格式文档

    如下代码可以实现使用Java的POI解析Word文档并生成XML格式文档的功能。此代码可以编译通过,但运行时存在问题,读者可以亲自试一试,看看能否修复其中的bug:


import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import java.io.OutputStreamWriter;
import java.io.Writer;
import java.nio.charset.Charset;
import java.util.ArrayDeque;
import java.util.Deque;

import org.apache.poi.hwpf.HWPFDocument;
import org.apache.poi.hwpf.model.StyleDescription;
import org.apache.poi.hwpf.model.StyleSheet;
import org.apache.poi.hwpf.usermodel.CharacterRun;
import org.apache.poi.hwpf.usermodel.Paragraph;
import org.apache.poi.hwpf.usermodel.Range;

public final class Word2Forrest {
	Writer _out;
	HWPFDocument _doc;

	@SuppressWarnings("unused")
	public Word2Forrest(HWPFDocument doc, OutputStream stream) throws IOException {
		OutputStreamWriter out = new OutputStreamWriter(stream, Charset.forName("UTF-8"));
		_out = out;
		_doc = doc;

		init();
		openDocument();
		openBody();

		Range r = doc.getRange();
		StyleSheet styleSheet = doc.getStyleSheet();

		int sectionLevel = 0;
		int lenParagraph = r.numParagraphs();
		boolean inCode = false;
		for (int x = 0; x < lenParagraph; x++) {
			Paragraph p = r.getParagraph(x);

			String text = p.text();
			if (text.trim().length() == 0) {
				continue;
			}
			StyleDescription paragraphStyle = styleSheet.getStyleDescription(p.getStyleIndex());
			String styleName = paragraphStyle.getName();
			if (styleName.startsWith("Heading")) {
				if (inCode) {
					closeSource();
					inCode = false;
				}

				int headerLevel = Integer.parseInt(styleName.substring(8));
				if (headerLevel > sectionLevel) {
					openSection();
				} else {
					for (int y = 0; y < (sectionLevel - headerLevel) + 1; y++) {
						closeSection();
					}
					openSection();
				}
				sectionLevel = headerLevel;
				openTitle();
				System.out.println("++++++" + p.text());
				writePlainText(text);
				closeTitle();
			} else {
				int cruns = p.numCharacterRuns();
				CharacterRun run = p.getCharacterRun(0);
				String fontName = run.getFontName();
				if (fontName.startsWith("Courier")) {
					if (!inCode) {
						openSource();
						inCode = true;
					}
					System.out.println("------" + p.text());
					writePlainText(p.text());
				} else {
					if (inCode) {
						inCode = false;
						closeSource();
					}
					openParagraph();
					System.out.println("******" + p.text());
					writePlainText(p.text());
					closeParagraph();
				}
			}
		}
		for (int x = 0; x < sectionLevel; x++) {
			closeSection();
		}
		closeBody();
		closeDocument();
		_out.flush();

	}

	public void init() throws IOException {
		_out.write("\r\n");
		_out.write(
				"\r\n");
	}

	public void openDocument() throws IOException {
		_out.write("\r\n");
	}

	public void closeDocument() throws IOException {
		_out.write("\r\n");
	}

	public void openBody() throws IOException {
		_out.write("\r\n");
	}

	public void closeBody() throws IOException {
		_out.write("\r\n");
	}

	public void openSection() throws IOException {
		_out.write("
"); } public void closeSection() throws IOException { _out.write("
"); } public void openTitle() throws IOException { _out.write(""); } public void closeTitle() throws IOException { _out.write(""); } public void writePlainText(String text) throws IOException { _out.write(text); } public void openParagraph() throws IOException { _out.write("

"); } public void closeParagraph() throws IOException { _out.write("

"); } public void openSource() throws IOException { _out.write(""); } public static void main(String[] args) throws IOException { InputStream is = new FileInputStream("D:/QMDownload/hwpftest.doc"); OutputStream out = new FileOutputStream("D:/QMDownload/test.xml"); try { new Word2Forrest(new HWPFDocument(is), out); } finally { out.close(); is.close(); } } }


你可能感兴趣的:(Java技术总结,程序路上辨辨辨,软件架构及技术纲要)