读取大数据量excel

package excel;


import java.io.IOException;
import java.io.InputStream;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;


import org.apache.poi.openxml4j.exceptions.OpenXML4JException;
import org.apache.poi.openxml4j.opc.OPCPackage;
import org.apache.poi.ss.usermodel.BuiltinFormats;
import org.apache.poi.ss.usermodel.DataFormatter;
import org.apache.poi.xssf.eventusermodel.XSSFReader;
import org.apache.poi.xssf.model.SharedStringsTable;
import org.apache.poi.xssf.model.StylesTable;
import org.apache.poi.xssf.usermodel.XSSFCellStyle;
import org.apache.poi.xssf.usermodel.XSSFRichTextString;
import org.xml.sax.Attributes;
import org.xml.sax.InputSource;
import org.xml.sax.SAXException;
import org.xml.sax.XMLReader;
import org.xml.sax.helpers.DefaultHandler;
import org.xml.sax.helpers.XMLReaderFactory;


public class Excel2007Reader extends DefaultHandler {
	/**
	* 共享字符串表
	*/
	private SharedStringsTable sst;


	/**
	* 上一次的内容
	*/
	private String lastContents;


	/**
	* 字符串标识
	*/
	private boolean nextIsString;


	/**
	* 工作表索引
	*/
	private int sheetIndex = -1;


	/**
	* 行集合
	*/
	private List<String> rowlist = new ArrayList<String>();


	/**
	* 当前行
	*/
	private int curRow = 0;


	/**
	* 当前列
	*/
	private int curCol = 0;


	/**
	* T元素标识
	*/
	private boolean isTElement;


	/**
	* 异常信息,如果为空则表示没有异常
	*/
	private String exceptionMessage;


	/**
	* 单元格数据类型,默认为字符串类型
	*/
	private CellDataType nextDataType = CellDataType.SSTINDEX;


	private final DataFormatter formatter = new DataFormatter();


	private short formatIndex;


	private String formatString;
	/** 封装第一行的标题信息,防止空单元格 */
	private List<String> title = new ArrayList<String>();


	/**
	* 单元格
	*/
	private StylesTable stylesTable;


	/**
	* 取第一个sheet里内容
	* 
	* @param filename
	* @param sheetId
	* @throws Exception
	*/
	public void processOneSheet(String filename, int sheetId) throws Exception {
		OPCPackage pkg = OPCPackage.open(filename);
		XSSFReader xssfReader = new XSSFReader(pkg);
		stylesTable = xssfReader.getStylesTable();
		SharedStringsTable sst = xssfReader.getSharedStringsTable();
		XMLReader parser = this.fetchSheetParser(sst);
		InputStream sheet = xssfReader.getSheet("rId" + sheetId);
		sheetIndex++;
		InputSource sheetSource = new InputSource(sheet);
		parser.parse(sheetSource);
		sheet.close();
	}


	/**
	* 遍历工作簿中所有的电子表格
	* 
	* @param filename
	* @throws IOException
	* @throws OpenXML4JException
	* @throws SAXException
	* @throws Exception
	*/
	public void process(String filename) throws IOException,OpenXML4JException, SAXException {
		OPCPackage pkg = OPCPackage.open(filename);
		XSSFReader xssfReader = new XSSFReader(pkg);
		stylesTable = xssfReader.getStylesTable();
		SharedStringsTable sst = xssfReader.getSharedStringsTable();
		XMLReader parser = this.fetchSheetParser(sst);
		Iterator<InputStream> sheets = xssfReader.getSheetsData();
		while (sheets.hasNext()) {
			curRow = 0;
			sheetIndex++;
			InputStream sheet = sheets.next();
			InputSource sheetSource = new InputSource(sheet);
			parser.parse(sheetSource);
			sheet.close();
		}
	}


	public XMLReader fetchSheetParser(SharedStringsTable sst)
			throws SAXException {
		XMLReader parser = XMLReaderFactory.createXMLReader();
		// XMLReader parser = XMLReaderFactory.createXMLReader("org.apache.xerces.parsers.SAXParser");
		this.sst = sst;
		parser.setContentHandler(this);
		return parser;
	}


	/**
	* 模板中有多少列就设置多少列
	* 
	* @author Administrator
	* 
	*/
	enum TitleItem {
		A("A"), B("B"), C("C"), D("D"), E("E"), F("F"), G("G"), H("H"), I("I"), J("J");
		private String key;


		TitleItem(String key) {
			this.key = key;
		}


		public static TitleItem getObj(String key) {
			for (TitleItem obj : values()) {
				if (obj.getKey().equals(key)) {
					return obj;
				}
			}
			return null;
		}


		public String getKey() {
			return key;
		}


		public void setKey(String key) {
			this.key = key;
		}


	}


	@Override
	public void startElement(String uri, String localName, String name,	Attributes attributes) throws SAXException {
		// 封装title
		String type = attributes.getValue("r");
		if (type != null && !type.equals("")) {
			type = type.substring(0, 1);
			if (TitleItem.getObj(type) != null) {
				title.add(type);
			}
		}


		// c => 单元格
		if ("c".equals(name)) {
			// 设定单元格类型
			this.setNextDataType(attributes);
		}


		// 当元素为t时
		if ("t".equals(name)) {
			isTElement = true;
		} else {
			isTElement = false;
		}
		// 置空
		lastContents = "";
	}


	/**
	* 单元格中的数据可能的数据类型
	*/
	enum CellDataType {
		BOOL, ERROR, FORMULA, INLINESTR, SSTINDEX, NUMBER, DATE, NULL
	}


	/**
	* 处理数据类型
	* 
	* @param attributes
	*/
	public void setNextDataType(Attributes attributes) {
		nextDataType = CellDataType.NUMBER;
		formatIndex = -1;
		formatString = null;
		String cellType = attributes.getValue("t");
		String cellStyleStr = attributes.getValue("s");


		if ("b".equals(cellType)) {
			nextDataType = CellDataType.BOOL;
		} else if ("e".equals(cellType)) {
			nextDataType = CellDataType.ERROR;
		} else if ("inlineStr".equals(cellType)) {
			nextDataType = CellDataType.INLINESTR;
		} else if ("s".equals(cellType)) {
			nextDataType = CellDataType.SSTINDEX;
		} else if ("str".equals(cellType)) {
			nextDataType = CellDataType.FORMULA;
		}


		if (cellStyleStr != null) {
			int styleIndex = Integer.parseInt(cellStyleStr);
			XSSFCellStyle style = stylesTable.getStyleAt(styleIndex);
			formatIndex = style.getDataFormat();
			formatString = style.getDataFormatString();


			if ("m/d/yy" == formatString) {
				nextDataType = CellDataType.DATE;
				formatString = "yyyy-MM-dd";
				// formatString = "yyyy-MM-dd hh:mm:ss.SSS";
			}


			if (formatString == null) {
				nextDataType = CellDataType.NULL;
				formatString = BuiltinFormats.getBuiltinFormat(formatIndex);
			}
		}
	}


	/**
	* 对解析出来的数据进行类型处理
	* 
	* @param value
	*            单元格的值(这时候是一串数字)
	* @param thisStr
	*            一个空字符串
	* @return
	*/
	public String getDataValue(String value, String thisStr) {
		switch (nextDataType) {
		// 这几个的顺序不能随便交换,交换了很可能会导致数据错误
		case BOOL:
			char first = value.charAt(0);
			thisStr = first == '0' ? "FALSE" : "TRUE";
			break;
		case ERROR:
			thisStr = "\"ERROR:" + value.toString() + '"';
			break;
		case FORMULA:
			thisStr = '"' + value.toString() + '"';
			break;
		case INLINESTR:
			XSSFRichTextString rtsi = new XSSFRichTextString(value.toString());


			thisStr = rtsi.toString();
			rtsi = null;
			break;
		case SSTINDEX:
			String sstIndex = value.toString();
			try {
				int idx = Integer.parseInt(sstIndex);
				XSSFRichTextString rtss = new XSSFRichTextString(sst.getEntryAt(idx));
				thisStr = rtss.toString();
				rtss = null;
			} catch (NumberFormatException ex) {
				thisStr = value.toString();
			}
			break;
		case NUMBER:
			if (formatString != null) {
				thisStr = formatter.formatRawCellContents(Double.parseDouble(value), formatIndex, formatString).trim();
			} else {
				thisStr = value;
			}


			thisStr = thisStr.replace("_", "").trim();
			break;
		case DATE:
			thisStr = formatter.formatRawCellContents(Double.parseDouble(value), formatIndex, formatString);


			// 对日期字符串作特殊处理
			thisStr = thisStr.replace(" ", "T");
			break;
		default:
			thisStr = " ";


			break;
		}


		return thisStr;
	}


	@Override
	public void endElement(String uri, String localName, String name) throws SAXException {


		// 根据SST的索引值的到单元格的真正要存储的字符串
		// 这时characters()方法可能会被调用多次
		if (nextIsString) {
			int idx = Integer.parseInt(lastContents);
			lastContents = new XSSFRichTextString(sst.getEntryAt(idx)).toString();
		}


		// t元素也包含字符串
		if (isTElement) {
			// 将单元格内容加入rowlist中,在这之前先去掉字符串前后的空白符
			String value = lastContents.trim();
			rowlist.add(curCol, value);
			curCol++;
			isTElement = false;
		} else if ("v".equals(name)) {
			// v => 单元格的值,如果单元格是字符串则v标签的值为该字符串在SST中的索引
			String value = this.getDataValue(lastContents.trim(), "");


			rowlist.add(curCol, value);
			curCol++;
		} else {
			// 如果标签名称为 row ,这说明已到行尾,调用 optRows() 方法
			if (name.equals("row")) {
				try {
					// 一行信息结束时进行业务逻辑处理
					optRow(sheetIndex, curRow, rowlist, title);
					// 重置当前行标题信息
					title = new ArrayList<String>();
				} catch (Exception e) {
					e.printStackTrace();
				}
				rowlist.clear();
				curRow++;
				curCol = 0;
			}
		}
	}


	/**
	* 处理每一行数据,可以根据业务需要封装成业务实体(每一行都保证有全部单元格的内容,空单元格内容为“”)
	* 
	* @param sheetIndex
	* @param curRow
	* @param rowList
	* @param title
	*/
	public void optRow(int sheetIndex, int curRow, List<String> rowList,List<String> title) {
		System.out.println("第 " + (curRow + 1) + " 行数据如下:");
		List<String> list = changeList(rowList, title);
		for (String string : list) {
			System.out.println(string);
		}
		/*
		* System.out.println("---------------------------------------"); for
		* (String key : title) { System.out.println(key); }
		* System.out.println("第 "+(curRow+1)+" 行数据如下:"); for (int i = 0; i <
		* rowlist.size(); i++) { System.out.print("'" + rowlist.get(i) + "',");
		* } System.out.println("\n---------------------------------------");
		*/
	}


	/**
	* 将为空的单元内容读取出来(上面方法无法取到空单元格,所以人工处理一下)
	* 
	* @param rowList
	* @param title
	*/
	public List<String> changeList(List<String> rowList, List<String> title) {
		TitleItem[] titles = TitleItem.values();
		// 共有多少个字段
		int count = titles.length;
		// 实际有多少个字段
		int realCount = title.size();
		List<String> result = rowList;
		// 有空单元格
		if (count != realCount) {
			result = new ArrayList<String>();
			for (int i = 0; i < count; i++) {
				boolean flag = false;
				String value = "";
				for (int j = 0; j < realCount; j++) {
					// 和单元格标题匹配,不存在内容为“”
					flag = title.get(j).equals(titles[i].getKey()) ? true : false;
					if (flag) {
						value = rowlist.get(j);
						break;
					}
				}
				result.add(value);
			}
		}
		return result;
	}


	@Override
	public void characters(char[] ch, int start, int length) throws SAXException {
		// 得到单元格内容的值
		lastContents += new String(ch, start, length);
	}


	/**
	* @return the exceptionMessage
	*/
	public String getExceptionMessage() {
		return exceptionMessage;
	}
}

你可能感兴趣的:(Excel)