目前处理Excel的开源javaAPI主要有两种,一是Jxl(Java Excel API),Jxl只支持Excel2003以下的版本。另外一种是Apache的Jakarta
POI,相比于Jxl,POI对微软办公文档的支持更加强大,但是它使用复杂,上手慢。POI可支持更高的Excel版本2007。对Excel的读取,POI有两种模式,
一是用户模式,这种方式同Jxl的使用很类似,使用简单,都是将文件一次性读到内存,文件小的时候,没有什么问题,当文件大的时候,就会出现
OutOfMemory的内存溢出问题。第二种是事件驱动模式,拿Excel2007来说,其内容采用XML的格式来存储,所以处理excel就是解析XML,而目前使用事
件驱动模式解析XML的API是SAX(Simple API for XML),这种模型在读取XML文档时,并没有将整个文档读入内存,而是按顺序将整个文档解析完,在
解析过程中,会主动产生事件交给程序中相应的处理函数来处理当前内容。因此这种方式对系统资源要求不高,可以处理海量数据。笔者曾经做过测
试,这种方法处理一千万条,每条五列的数据花费大约11分钟。可见处理海量数据的文件事件驱动是一个很好的方式。而本文中用到的
AbstractExcel2003Reader、AbstractExcel2007Reader对Excel的读取都是采用这种POI的事件驱动模式。至于Excel的写操作,对较高版本的
Excel2007,POI提供了很好的支持,主要流程是第一步构建工作薄和电子表格对象,第二步在一个流中构建文本文件,第三步使用流中产生的数据替
换模板中的电子表格。这种方式也可以处理海量数据文件。AbstractExcel2007Writer就是使用这种方式进行写操作。对于写入较低版本的Excel2003
,POI使用了用户模式来处理,就是将整个文档加载进内存,如果数据量大的话就会出现内存溢出的问题,Excel2003Writer就是使用这种方式。据笔
者的测试,如果数据量大于3万条,每条8列的话,就会报OutOfMemory的错误。Excel2003中每个电子表格的记录数必须在65536以下,否则就会发生异
常。目前还没有好的解决方案,建议对于海量数据写入操作,尽量使用Excel2007。
[java] view plaincopy
/**
* 抽象Excel2003读取器,通过实现HSSFListener监听器,采用事件驱动模式解析excel2003
* 中的内容,遇到特定事件才会触发,大大减少了内存的使用。
*
*/
public class Excel2003Reader implements HSSFListener{
private int minColumns = -1;
private POIFSFileSystem fs;
private int lastRowNumber;
private int lastColumnNumber;
/** Should we output the formula, or the value it has? */
private boolean outputFormulaValues = true;
/** For parsing Formulas */
private SheetRecordCollectingListener workbookBuildingListener;
//excel2003工作薄
private HSSFWorkbook stubWorkbook;
// Records we pick up as we process
private SSTRecord sstRecord;
private FormatTrackingHSSFListener formatListener;
//表索引
private int sheetIndex = -1;
private BoundSheetRecord[] orderedBSRs;
@SuppressWarnings("unchecked")
private ArrayList boundSheetRecords = new ArrayList();
// For handling formulas with string results
private int nextRow;
private int nextColumn;
private boolean outputNextStringRecord;
//当前行
private int curRow = 0;
//存储行记录的容器
private List<String> rowlist = new ArrayList<String>();;
@SuppressWarnings( "unused")
private String sheetName;
private IRowReader rowReader;
public void setRowReader(IRowReader rowReader){
this.rowReader = rowReader;
}
/**
* 遍历excel下所有的sheet
* @throws IOException
*/
public void process(String fileName) throws IOException {
this.fs = new POIFSFileSystem(new FileInputStream(fileName));
MissingRecordAwareHSSFListener listener = new MissingRecordAwareHSSFListener(
this);
formatListener = new FormatTrackingHSSFListener(listener);
HSSFEventFactory factory = new HSSFEventFactory();
HSSFRequest request = new HSSFRequest();
if (outputFormulaValues) {
request.addListenerForAllRecords(formatListener);
} else {
workbookBuildingListener = new SheetRecordCollectingListener(
formatListener);
request.addListenerForAllRecords(workbookBuildingListener);
}
factory.processWorkbookEvents(request, fs);
}
/**
* HSSFListener 监听方法,处理 Record
*/
@SuppressWarnings("unchecked")
public void processRecord(Record record) {
int thisRow = -1;
int thisColumn = -1;
String thisStr = null;
String value = null;
switch (record.getSid()) {
case BoundSheetRecord.sid:
boundSheetRecords.add(record);
break;
case BOFRecord.sid:
BOFRecord br = (BOFRecord) record;
if (br.getType() == BOFRecord.TYPE_WORKSHEET) {
// 如果有需要,则建立子工作薄
if (workbookBuildingListener != null && stubWorkbook == null) {
stubWorkbook = workbookBuildingListener
.getStubHSSFWorkbook();
}
sheetIndex++;
if (orderedBSRs == null) {
orderedBSRs = BoundSheetRecord
.orderByBofPosition(boundSheetRecords);
}
sheetName = orderedBSRs[sheetIndex].getSheetname();
}
break;
case SSTRecord.sid:
sstRecord = (SSTRecord) record;
break;
case BlankRecord.sid:
BlankRecord brec = (BlankRecord) record;
thisRow = brec.getRow();
thisColumn = brec.getColumn();
thisStr = "";
rowlist.add(thisColumn, thisStr);
break;
case BoolErrRecord.sid: //单元格为布尔类型
BoolErrRecord berec = (BoolErrRecord) record;
thisRow = berec.getRow();
thisColumn = berec.getColumn();
thisStr = berec.getBooleanValue()+"";
rowlist.add(thisColumn, thisStr);
break;
case FormulaRecord.sid: //单元格为公式类型
FormulaRecord frec = (FormulaRecord) record;
thisRow = frec.getRow();
thisColumn = frec.getColumn();
if (outputFormulaValues) {
if (Double.isNaN(frec.getValue())) {
// Formula result is a string
// This is stored in the next record
outputNextStringRecord = true;
nextRow = frec.getRow();
nextColumn = frec.getColumn();
} else {
thisStr = formatListener.formatNumberDateCell(frec);
}
} else {
thisStr = '"' + HSSFFormulaParser.toFormulaString(stubWorkbook,
frec.getParsedExpression()) + '"';
}
rowlist.add(thisColumn,thisStr);
break;
case StringRecord.sid://单元格中公式的字符串
if (outputNextStringRecord) {
// String for formula
StringRecord srec = (StringRecord) record;
thisStr = srec.getString();
thisRow = nextRow;
thisColumn = nextColumn;
outputNextStringRecord = false;
}
break;
case LabelRecord.sid:
LabelRecord lrec = (LabelRecord) record;
curRow = thisRow = lrec.getRow();
thisColumn = lrec.getColumn();
value = lrec.getValue().trim();
value = value.equals("")?" ":value;
this.rowlist.add(thisColumn, value);
break;
case LabelSSTRecord.sid: //单元格为字符串类型
LabelSSTRecord lsrec = (LabelSSTRecord) record;
curRow = thisRow = lsrec.getRow();
thisColumn = lsrec.getColumn();
if (sstRecord == null) {
rowlist.add(thisColumn, " ");
} else {
value = sstRecord
.getString(lsrec.getSSTIndex()).toString().trim();
value = value.equals("")?" ":value;
rowlist.add(thisColumn,value);
}
break;
case NumberRecord.sid: //单元格为数字类型
NumberRecord numrec = (NumberRecord) record;
curRow = thisRow = numrec.getRow();
thisColumn = numrec.getColumn();
value = formatListener.formatNumberDateCell(numrec).trim();
value = value.equals("")?" ":value;
// 向容器加入列值
rowlist.add(thisColumn, value);
break;
default:
break;
}
// 遇到新行的操作
if (thisRow != -1 && thisRow != lastRowNumber) {
lastColumnNumber = -1;
}
// 空值的操作
if (record instanceof MissingCellDummyRecord) {
MissingCellDummyRecord mc = (MissingCellDummyRecord) record;
curRow = thisRow = mc.getRow();
thisColumn = mc.getColumn();
rowlist.add(thisColumn," ");
}
// 更新行和列的值
if (thisRow > -1)
lastRowNumber = thisRow;
if (thisColumn > -1)
lastColumnNumber = thisColumn;
// 行结束时的操作
if (record instanceof LastCellOfRowDummyRecord) {
if (minColumns > 0) {
// 列值重新置空
if (lastColumnNumber == -1) {
lastColumnNumber = 0;
}
}
lastColumnNumber = -1;
// 每行结束时, 调用getRows() 方法
rowReader.getRows(sheetIndex,curRow, rowlist);
// 清空容器
rowlist.clear();
}
}
}
[java] view plaincopy
/**
* 抽象Excel2007读取器,excel2007的底层数据结构是xml文件,采用SAX的事件驱动的方法解析
* xml,需要继承DefaultHandler,在遇到文件内容时,事件会触发,这种做法可以大大降低
* 内存的耗费,特别使用于大数据量的文件。
*
*/
public class Excel2007Reader extends DefaultHandler {
//共享字符串表
private SharedStringsTable sst;
//上一次的内容
private String lastContents;
private boolean nextIsString;
private int sheetIndex = -1;
private List<String> rowlist = new ArrayList<String>();
//当前行
private int curRow = 0;
//当前列
private int curCol = 0;
//日期标志
private boolean dateFlag;
//数字标志
private boolean numberFlag;
private boolean isTElement;
private IRowReader rowReader;
public void setRowReader(IRowReader rowReader){
this.rowReader = rowReader;
}
/**只遍历一个电子表格,其中sheetId为要遍历的sheet索引,从1开始,1-3
* @param filename
* @param sheetId
* @throws Exception
*/
public void processOneSheet(String filename,int sheetId) throws Exception {
OPCPackage pkg = OPCPackage.open(filename);
XSSFReader r = new XSSFReader(pkg);
SharedStringsTable sst = r.getSharedStringsTable();
XMLReader parser = fetchSheetParser(sst);
// 根据 rId# 或 rSheet# 查找sheet
InputStream sheet2 = r.getSheet("rId"+sheetId);
sheetIndex++;
InputSource sheetSource = new InputSource(sheet2);
parser.parse(sheetSource);
sheet2.close();
}
/**
* 遍历工作簿中所有的电子表格
* @param filename
* @throws Exception
*/
public void process(String filename) throws Exception {
OPCPackage pkg = OPCPackage.open(filename);
XSSFReader r = new XSSFReader(pkg);
SharedStringsTable sst = r.getSharedStringsTable();
XMLReader parser = fetchSheetParser(sst);
Iterator<InputStream> sheets = r.getSheetsData();
while (sheets.hasNext()) {
curRow = 0;
sheetIndex++;
InputStream sheet = sheets.next();
InputSource sheetSource = new InputSource(sheet);
parser.parse(sheetSource);
sheet.close();
}
}
public XMLReader fetchSheetParser(SharedStringsTable sst)
throws SAXException {
XMLReader parser = XMLReaderFactory
.createXMLReader("org.apache.xerces.parsers.SAXParser");
this.sst = sst;
parser.setContentHandler(this);
return parser;
}
public void startElement(String uri, String localName, String name,
Attributes attributes) throws SAXException {
// c => 单元格
if ("c".equals(name)) {
// 如果下一个元素是 SST 的索引,则将nextIsString标记为true
String cellType = attributes.getValue("t");
if ("s".equals(cellType)) {
nextIsString = true;
} else {
nextIsString = false;
}
//日期格式
String cellDateType = attributes.getValue("s");
if ("1".equals(cellDateType)){
dateFlag = true;
} else {
dateFlag = false;
}
String cellNumberType = attributes.getValue("s");
if("2".equals(cellNumberType)){
numberFlag = true;
} else {
numberFlag = false;
}
}
//当元素为t时
if("t".equals(name)){
isTElement = true;
} else {
isTElement = false;
}
// 置空
lastContents = "";
}
public void endElement(String uri, String localName, String name)
throws SAXException {
// 根据SST的索引值的到单元格的真正要存储的字符串
// 这时characters()方法可能会被调用多次
if (nextIsString) {
try {
int idx = Integer.parseInt(lastContents);
lastContents = new XSSFRichTextString(sst.getEntryAt(idx))
.toString();
} catch (Exception e) {
}
}
//t元素也包含字符串
if(isTElement){
String value = lastContents.trim();
rowlist.add(curCol, value);
curCol++;
isTElement = false;
// v => 单元格的值,如果单元格是字符串则v标签的值为该字符串在SST中的索引
// 将单元格内容加入rowlist中,在这之前先去掉字符串前后的空白符
} else if ("v".equals(name)) {
String value = lastContents.trim();
value = value.equals("")?" ":value;
//日期格式处理
if(dateFlag){
Date date = HSSFDateUtil.getJavaDate(Double.valueOf(value));
SimpleDateFormat dateFormat = new SimpleDateFormat(
"dd/MM/yyyy");
value = dateFormat.format(date);
}
//数字类型处理
if(numberFlag){
BigDecimal bd = new BigDecimal(value);
value = bd.setScale(3,BigDecimal.ROUND_UP).toString();
}
rowlist.add(curCol, value);
curCol++;
}else {
//如果标签名称为 row ,这说明已到行尾,调用 optRows() 方法
if (name.equals("row")) {
rowReader.getRows(sheetIndex,curRow,rowlist);
rowlist.clear();
curRow++;
curCol = 0;
}
}
}
public void characters(char[] ch, int start, int length)
throws SAXException {
//得到单元格内容的值
lastContents += new String(ch, start, length);
}
}
[java] view plaincopy
public class ExcelReaderUtil {
//excel2003扩展名
public static final String EXCEL03_EXTENSION = ".xls";
//excel2007扩展名
public static final String EXCEL07_EXTENSION = ".xlsx";
/**
* 读取Excel文件,可能是03也可能是07版本
* @param excel03
* @param excel07
* @param fileName
* @throws Exception
*/
public static void readExcel(IRowReader reader,String fileName) throws Exception{
// 处理excel2003文件
if (fileName.endsWith(EXCEL03_EXTENSION)){
Excel2003Reader excel03 = new Excel2003Reader();
excel03.setRowReader(reader);
excel03.process(fileName);
// 处理excel2007文件
} else if (fileName.endsWith(EXCEL07_EXTENSION)){
Excel2007Reader excel07 = new Excel2007Reader();
excel07.setRowReader(reader);
excel07.process(fileName);
} else {
throw new Exception("文件格式错误,fileName的扩展名只能是xls或xlsx。");
}
}
}
[java] view plaincopy
public interface IRowReader {
/**业务逻辑实现方法
* @param sheetIndex
* @param curRow
* @param rowlist
*/
public void getRows(int sheetIndex,int curRow, List<String> rowlist);
}
[java] view plaincopy
public class RowReader implements IRowReader{
/* 业务逻辑实现方法
* @see com.eprosun.util.excel.IRowReader#getRows(int, int, java.util.List)
*/
public void getRows(int sheetIndex, int curRow, List<String> rowlist) {
// TODO Auto-generated method stub
System.out.print(curRow+" ");
for (int i = 0; i < rowlist.size(); i++) {
System.out.print(rowlist.get(i) + " ");
}
System.out.println();
}
}
[java] view plaincopy
public class Main {
public static void main(String[] args) throws Exception {
IRowReader reader = new RowReader();
//ExcelReaderUtil.readExcel(reader, "F://te03.xls");
ExcelReaderUtil.readExcel(reader, "F://test07.xlsx");
}
}
[java] view plaincopy
public class Excel2003Writer {
/**
* @param args
*/
public static void main(String[] args) {
try{
System.out.println("开始写入excel2003....");
writeExcel("tes2003.xls");
System.out.println("写完xcel2003");
} catch (IOException e) {
}
}
/**
* 写入excel并填充内容,一个sheet只能写65536行以下,超出会报异常,写入时建议使用AbstractExcel2007Writer
* @param fileName
* @throws IOException
*/
public static void writeExcel(String fileName) throws IOException{
// 创建excel2003对象
Workbook wb = new HSSFWorkbook();
// 设置文件放置路径和文件名
FileOutputStream fileOut = new FileOutputStream(fileName);
// 创建新的表单
Sheet sheet = wb.createSheet("newsheet");
// 创建新行
for(int i=0;i<20000;i++){
Row row = sheet.createRow(i);
// 创建单元格
Cell cell = row.createCell(0);
// 设置单元格值
cell.setCellValue(1);
row.createCell(1).setCellValue(1+i);
row.createCell(2).setCellValue(true);
row.createCell(3).setCellValue(0.43d);
row.createCell(4).setCellValue('d');
row.createCell(5).setCellValue("");
row.createCell(6).setCellValue("第七列"+i);
row.createCell(7).setCellValue("第八列"+i);
}
wb.write(fileOut);
fileOut.close();
}
}
[java] view plaincopy
/**
* 抽象excel2007读入器,先构建.xlsx一张模板,改写模板中的sheet.xml,使用这种方法
* 写入.xlsx文件,不需要太大的内存
*
*/
public abstract class AbstractExcel2007Writer {
private SpreadsheetWriter sw;
/**
* 写入电子表格的主要流程
* @param fileName
* @throws Exception
*/
public void process(String fileName) throws Exception{
// 建立工作簿和电子表格对象
XSSFWorkbook wb = new XSSFWorkbook();
XSSFSheet sheet = wb.createSheet("sheet1");
// 持有电子表格数据的xml文件名 例如 /xl/worksheets/sheet1.xml
String sheetRef = sheet.getPackagePart().getPartName().getName();
// 保存模板
FileOutputStream os = new FileOutputStream("template.xlsx");
wb.write(os);
os.close();
// 生成xml文件
File tmp = File.createTempFile("sheet", ".xml");
Writer fw = new FileWriter(tmp);
sw = new SpreadsheetWriter(fw);
generate();
fw.close();
// 使用产生的数据替换模板
File templateFile = new File("template.xlsx");
FileOutputStream out = new FileOutputStream(fileName);
substitute(templateFile, tmp, sheetRef.substring(1), out);
out.close();
//删除文件之前调用一下垃圾回收器,否则无法删除模板文件
System.gc();
// 删除临时模板文件
if (templateFile.isFile()&&templateFile.exists()){
templateFile.delete();
}
}
/**
* 类使用者应该使用此方法进行写操作
* @throws Exception
*/
public abstract void generate() throws Exception;
public void beginSheet() throws IOException {
sw.beginSheet();
}
public void insertRow(int rowNum) throws IOException {
sw.insertRow(rowNum);
}
public void createCell(int columnIndex, String value) throws IOException {
sw.createCell(columnIndex, value, -1);
}
public void createCell(int columnIndex, double value) throws IOException {
sw.createCell(columnIndex, value, -1);
}
public void endRow() throws IOException {
sw.endRow();
}
public void endSheet() throws IOException {
sw.endSheet();
}
/**
*
* @param zipfile the template file
* @param tmpfile the XML file with the sheet data
* @param entry the name of the sheet entry to substitute, e.g. xl/worksheets/sheet1.xml
* @param out the stream to write the result to
*/
private static void substitute(File zipfile, File tmpfile, String entry,
OutputStream out) throws IOException {
ZipFile zip = new ZipFile(zipfile);
ZipOutputStream zos = new ZipOutputStream(out);
@SuppressWarnings("unchecked")
Enumeration<ZipEntry> en = (Enumeration<ZipEntry>) zip.entries();
while (en.hasMoreElements()) {
ZipEntry ze = en.nextElement();
if (!ze.getName().equals(entry)) {
zos.putNextEntry(new ZipEntry(ze.getName()));
InputStream is = zip.getInputStream(ze);
copyStream(is, zos);
is.close();
}
}
zos.putNextEntry(new ZipEntry(entry));
InputStream is = new FileInputStream(tmpfile);
copyStream(is, zos);
is.close();
zos.close();
}
private static void copyStream(InputStream in, OutputStream out)
throws IOException {
byte[] chunk = new byte[1024];
int count;
while ((count = in.read(chunk)) >= 0) {
out.write(chunk, 0, count);
}
}
/**
* 在写入器中写入电子表格
*
*/
public static class SpreadsheetWriter {
private final Writer _out;
private int _rownum;
private static String LINE_SEPARATOR = System.getProperty("line.separator");
public SpreadsheetWriter(Writer out) {
_out = out;
}
public void beginSheet() throws IOException {
_out.write("<?xml version=\"1.0\" encoding=\"GB2312\"?>"
+ "<worksheet xmlns=\"http://schemas.openxmlformats.org/spreadsheetml/2006/main\">");
_out.write("<sheetData>"+LINE_SEPARATOR);
}
public void endSheet() throws IOException {
_out.write("</sheetData>");
_out.write("</worksheet>");
}
/**
* 插入新行
*
* @param rownum 以0开始
*/
public void insertRow(int rownum) throws IOException {
_out.write("<row r=\"" + (rownum + 1) + "\">"+LINE_SEPARATOR);
this._rownum = rownum;
}
/**
* 插入行结束标志
*/
public void endRow() throws IOException {
_out.write("</row>"+LINE_SEPARATOR);
}
/**
* 插入新列
* @param columnIndex
* @param value
* @param styleIndex
* @throws IOException
*/
public void createCell(int columnIndex, String value, int styleIndex)
throws IOException {
String ref = new CellReference(_rownum, columnIndex)
.formatAsString();
_out.write("<c r=\"" + ref + "\" t=\"inlineStr\"");
if (styleIndex != -1)
_out.write(" s=\"" + styleIndex + "\"");
_out.write(">");
_out.write("<is><t>"+XMLEncoder.encode(value)+"</t></is>");
_out.write("</c>");
}
public void createCell(int columnIndex, String value)
throws IOException {
createCell(columnIndex, value, -1);
}
public void createCell(int columnIndex, double value, int styleIndex)
throws IOException {
String ref = new CellReference(_rownum, columnIndex)
.formatAsString();
_out.write("<c r=\"" + ref + "\" t=\"n\"");
if (styleIndex != -1)
_out.write(" s=\"" + styleIndex + "\"");
_out.write(">");
_out.write("<v>" + value + "</v>");
_out.write("</c>");
}
public void createCell(int columnIndex, double value)
throws IOException {
createCell(columnIndex, value, -1);
}
public void createCell(int columnIndex, Calendar value, int styleIndex)
throws IOException {
createCell(columnIndex, DateUtil.getExcelDate(value, false),
styleIndex);
}
}
}
[java] view plaincopy
public class Excel2007WriterImpl extends AbstractExcel2007Writer{
/**
* @param args
* @throws Exception
*/
public static void main(String[] args) throws Exception {
// TODO Auto-generated method stub
System.out.println("............................");
long start = System.currentTimeMillis();
//构建excel2007写入器
AbstractExcel2007Writer excel07Writer = new Excel2007WriterImpl();
//调用处理方法
excel07Writer.process("F://test07.xlsx");
long end = System.currentTimeMillis();
System.out.println("....................."+(end-start)/1000);
}
/*
* 可根据需求重写此方法,对于单元格的小数或者日期格式,会出现精度问题或者日期格式转化问题,建议使用字符串插入方法
* @see com.excel.ver2.AbstractExcel2007Writer#generate()
*/
@Override
public void generate()throws Exception {
//电子表格开始
beginSheet();
for (int rownum = 0; rownum < 100; rownum++) {
//插入新行
insertRow(rownum);
//建立新单元格,索引值从0开始,表示第一列
createCell(0, "中国<" + rownum + "!");
createCell(1, 34343.123456789);
createCell(2, "23.67%");
createCell(3, "12:12:23");
createCell(4, "2010-10-11 12:12:23");
createCell(5, "true");
createCell(6, "false");
//结束行
endRow();
}
//电子表格结束
endSheet();
}
}
[java] view plaincopy
public class XMLEncoder {
private static final String[] xmlCode = new String[256];
static {
// Special characters
xmlCode['\''] = "'";
xmlCode['\"'] = "\""; // double quote
xmlCode['&'] = "&"; // ampersand
xmlCode['<'] = "<"; // lower than
xmlCode['>'] = ">"; // greater than
}
/**
* <p>
* Encode the given text into xml.
* </p>
*
* @param string the text to encode
* @return the encoded string
*/
public static String encode(String string) {
if (string == null) return "";
int n = string.length();
char character;
String xmlchar;
StringBuffer buffer = new StringBuffer();
// loop over all the characters of the String.
for (int i = 0; i < n; i++) {
character = string.charAt(i);
// the xmlcode of these characters are added to a StringBuffer one by one
try {
xmlchar = xmlCode[character];
if (xmlchar == null) {
buffer.append(character);
} else {
buffer.append(xmlCode[character]);
}
} catch (ArrayIndexOutOfBoundsException aioobe) {
buffer.append(character);
}
}
return buffer.toString();
}
}