使用 POI 读写 MS Word 文档的代码

关键字: poi

read word:
public class WordExtractor {
	public WordExtractor() {

	public String extractText(InputStream in) throws IOException {
		ArrayList text = new ArrayList();
		POIFSFileSystem fsys = new POIFSFileSystem(in);

		DocumentEntry headerProps = (DocumentEntry) fsys.getRoot().getEntry("WordDocument");
		DocumentInputStream din = fsys.createDocumentInputStream("WordDocument");
		byte[] header = new byte[headerProps.getSize()];

		// Prende le informazioni dall'header del documento
		int info = LittleEndian.getShort(header, 0xa);

		boolean useTable1 = (info & 0x200) != 0;

		//boolean useTable1 = true;
		// Prende informazioni dalla piece table
		int complexOffset = LittleEndian.getInt(header, 0x1a2);
		//int complexOffset = LittleEndian.getInt(header);
		String tableName = null;
		if (useTable1) {
			tableName = "1Table";
		} else {
			tableName = "0Table";

		DocumentEntry table = (DocumentEntry) fsys.getRoot().getEntry(tableName);
		byte[] tableStream = new byte[table.getSize()];

		din = fsys.createDocumentInputStream(tableName);


		din = null;
		fsys = null;
		table = null;
		headerProps = null;

		int multiple = findText(tableStream, complexOffset, text);

		StringBuffer sb = new StringBuffer();
		int size = text.size();
		tableStream = null;

		for (int x = 0; x < size; x++) {
			WordTextPiece nextPiece = (WordTextPiece) text.get(x);
			int start = nextPiece.getStart();
			int length = nextPiece.getLength();

			boolean unicode = nextPiece.usesUnicode();
			String toStr = null;
			if (unicode) {
				toStr = new String(header, start, length * multiple, "UTF-16LE");
			} else {
				toStr = new String(header, start, length, "ISO-8859-1");
			sb.append(toStr).append(" ");

		return sb.toString();

	private static int findText(byte[] tableStream, int complexOffset, ArrayList text)
		throws IOException {
		//actual text
		int pos = complexOffset;
		int multiple = 2;
		//skips through the prms before we reach the piece table. These contain	data
		//for actual fast saved files
		while (tableStream[pos] == 1) {
			int skip = LittleEndian.getShort(tableStream, pos);
			pos += 2 + skip;
		if (tableStream[pos] != 2) {
			throw new IOException("corrupted Word file");
		} else {
			//parse out the text pieces
			int pieceTableSize = LittleEndian.getInt(tableStream, ++pos);
			pos += 4;
			int pieces = (pieceTableSize - 4) / 12;
			for (int x = 0; x < pieces; x++) {
				int filePos =
					LittleEndian.getInt(tableStream, pos + ((pieces + 1) * 4) + (x * 8) + 2);
				boolean unicode = false;
				if ((filePos & 0x40000000) == 0) {
					unicode = true;
				} else {
					unicode = false;
					multiple = 1;
					filePos &= ~(0x40000000); //gives me FC in doc stream
					filePos /= 2;
				int totLength =
					LittleEndian.getInt(tableStream, pos + (x + 1) * 4)
						- LittleEndian.getInt(tableStream, pos + (x * 4));

				WordTextPiece piece = new WordTextPiece(filePos, totLength, unicode);


		return multiple;
	public static void main(String[] args){
		WordExtractor w  = new WordExtractor();
		POIFSFileSystem ps = new POIFSFileSystem();
			File file = new File("C:\\test.doc");
			InputStream in = new FileInputStream(file);
			String s = w.extractText(in);
		}catch(Exception e){

/**
 * Immutable description of one text piece: where it starts, how long it is, and
 * whether it is flagged as Unicode.
 */
class WordTextPiece {
	private final int fcStart;
	private final boolean usesUnicode;
	private final int length;

	/**
	 * Creates a piece description.
	 *
	 * @param start value later returned by {@link #getStart()}
	 * @param length value later returned by {@link #getLength()}
	 * @param unicode value later returned by {@link #usesUnicode()}
	 */
	public WordTextPiece(int start, int length, boolean unicode) {
		this.usesUnicode = unicode;
		this.length = length;
		this.fcStart = start;
	}

	/** @return the Unicode flag given at construction */
	public boolean usesUnicode() {
		return usesUnicode;
	}

	/** @return the start position given at construction */
	public int getStart() {
		return fcStart;
	}

	/** @return the length given at construction */
	public int getLength() {
		return length;
	}
}


write word

	public boolean writeWordFile(String path, String content) {
		boolean w = false;
		try {
		//	byte b[] = content.getBytes("ISO-8859-1");
			byte b[] = content.getBytes();
			ByteArrayInputStream bais = new ByteArrayInputStream(b);

			POIFSFileSystem fs = new POIFSFileSystem();
			DirectoryEntry directory = fs.getRoot();

			DocumentEntry de = directory.createDocument("WordDocument", bais);

			FileOutputStream ostream = new FileOutputStream(path);


		} catch (IOException e) {

		return w;

写操作的代码还是有些问题:打开 WORD时提示要选择字符类型

开发 poi word 的那个哥们老早就不干了,好像就职于商业的公司,apache还招募人参与呢,2006的事, poi处理 word 太弱,还得用vs6.0 c++, java的开源的那个 word有更详细的文档,不过没研究过,处理excel还算凑合,
问个问题:用 Java 写 Word,哪个开源包比较好?
我看 POIword支持不太够啊
http://jakarta.apache.org/poi/index.html
感谢楼主,这个 word extractor能够比较好的支持中文。
原先我是使用 nutch 的 word 文本提取,但是相当大部分的中文 word 文档无法正确提取。到官方网站查看他们的解决方案,是这么说的:“Document with 2-byte characters (that's how Chinese characters are probably stored) are not correctly handled by HWPF. One more thing you need to consider: HWPF cannot handle "fast saved" Word files. If the documents you need to parse are "fast saved" this adds an extra level of complexity.”

希望楼主把表情符号关掉 重新编辑一下 让大家能欣赏到正确的代码
