Excel大文件时读取内存溢出的解决方案


package com.sundy.parse.util;
 
import java.io.File;
import java.io.IOException;
import java.io.InputStream;
import java.util.ArrayList;
 
import org.apache.poi.openxml4j.exceptions.InvalidFormatException;
import org.apache.poi.openxml4j.exceptions.OpenXML4JException;
import org.apache.poi.openxml4j.opc.OPCPackage;
import org.apache.poi.openxml4j.opc.PackageAccess;
import org.apache.poi.xssf.eventusermodel.XSSFReader;
import org.apache.poi.xssf.model.SharedStringsTable;
import org.apache.poi.xssf.usermodel.XSSFRichTextString;
import org.xml.sax.Attributes;
import org.xml.sax.InputSource;
import org.xml.sax.SAXException;
import org.xml.sax.XMLReader;
import org.xml.sax.helpers.DefaultHandler;
import org.xml.sax.helpers.XMLReaderFactory;
 
public class ParseXlsxExcel extends DefaultHandler {
	private OPCPackage opcPackage = null;
	private XSSFReader xssfReader = null;
	private boolean nextIsStr;
	private String cellContent; //单元格的内容
	private ArrayList<String> sheetList; // sheet的所有内容
	private ArrayList<String> excelList; // Excel的所有内容
	private SharedStringsTable sst;
	private int currentColumn = 0;
	private XMLReader reader;
	
	/**
	 * @param path .xlxs后缀的Excel文件
	 * @param sheetId 
	 * @throws IOException
	 * @throws OpenXML4JException
	 * @throws SAXException
	 */
	public ParseXlsxExcel(String path,int sheetId) throws IOException, OpenXML4JException, SAXException{
		init(new File(path));
		parseSheet(sheetId);
	}
	
	public ParseXlsxExcel(File file,int sheetId) throws IOException, OpenXML4JException, SAXException{
		init(file);
		parseSheet(sheetId);
	}
	
	/**
	 * @param file	.xlxs 后缀的Excel文件
	 * @param flag	true 表示查询所有 ,false 表示只获取sheetId为1的内容
	 * @throws IOException
	 * @throws OpenXML4JException
	 * @throws SAXException
	 */
	public ParseXlsxExcel(File file,boolean flag) throws IOException, OpenXML4JException, SAXException{
		initAll(file, flag);
	}
	
	public ParseXlsxExcel(String path,boolean flag) throws IOException, OpenXML4JException, SAXException{
		initAll(new File(path), flag);
	}
 
	private void initAll(File file, boolean flag)throws InvalidFormatException, IOException, OpenXML4JException,SAXException {
		init(file);
		if(flag){
			parseAllSheet();
		}else {
			parseSheet(1);
		}
	}
	
	private void parseAllSheet() {
		int x = 0;
		int y = 0;
		while(x==y){
			try {
				parseSheet(++x);
				excelList.addAll(sheetList);
				y=x;
				currentColumn=0;
			} catch (Exception e) {
				// e.printStackTrace();
			}
		}
	}
 
	private void init(File file) throws IOException, OpenXML4JException, SAXException{
		opcPackage = OPCPackage.open((file), PackageAccess.READ);
		sheetList = new ArrayList<String>();
		excelList = new ArrayList<String>();
		xssfReader = new XSSFReader(opcPackage);
		sst = xssfReader.getSharedStringsTable();
		reader = XMLReaderFactory.createXMLReader("org.apache.xerces.parsers.SAXParser");
		reader.setContentHandler(this);
	}
	
	public void close() throws IOException{
		if(opcPackage!=null) opcPackage.close();
	}
	
	private void parseSheet(int sheetId) throws IOException, OpenXML4JException, SAXException{
		sheetList.clear();
		InputStream inStream = xssfReader.getSheet("rId"+sheetId);
		InputSource inputSource = new InputSource(inStream);
		reader.parse(inputSource);
	}
 
	@Override
	public void startElement(String uri, String localName, String qName,Attributes attributes) throws SAXException {
		if(qName.equals("c")){ // c为单元格,v为值,row为行。
			String type = attributes.getValue("t");
			if(type!=null && type.equals("s")){
				nextIsStr = true;
			}else {
				nextIsStr = false;
			} 
		}
	}
	
	@Override
	public void characters(char[] ch, int start, int length) throws SAXException {
		cellContent = new String(ch, start, length);
	}
	
	@Override
	public void endElement(String uri, String localName, String qName) throws SAXException{
		if(nextIsStr&&cellContent!=""&&cellContent!=null){
			try {
				int idx = Integer.parseInt(cellContent);
				cellContent = new XSSFRichTextString(sst.getEntryAt(idx)).toString();
			} catch (Exception e) {
//				 e.printStackTrace();
			}
		}
		if(qName.equals("v")){
			sheetList.add(currentColumn++, cellContent);
			cellContent="";
		}
		
	}
	
	/** sheet的所有内容 */
	public ArrayList<String> getSheetList() {
		return sheetList;
	}
	
	/** Excel的所有内容 */
	public ArrayList<String> getExcelList() {
		return excelList;
	}
	
	public static void main(String[] args) throws Exception{
		long start = System.currentTimeMillis();
		ParseXlsxExcel excel = new ParseXlsxExcel("C:/Users/admin/Desktop/大师傅家的感觉.xlsx",true);
		excel.close();
		ArrayList<String> list = excel.getExcelList();
		long end = System.currentTimeMillis();
		
		System.out.println(list.size());//19677984
		System.out.println(end-start); //16389ms
	}
}

package com.sundy.parse.util;
 
import java.io.FileInputStream;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import java.util.regex.Pattern;
 
import org.apache.poi.hssf.eventusermodel.EventWorkbookBuilder.SheetRecordCollectingListener;
import org.apache.poi.hssf.eventusermodel.FormatTrackingHSSFListener;
import org.apache.poi.hssf.eventusermodel.HSSFEventFactory;
import org.apache.poi.hssf.eventusermodel.HSSFListener;
import org.apache.poi.hssf.eventusermodel.HSSFRequest;
import org.apache.poi.hssf.eventusermodel.MissingRecordAwareHSSFListener;
import org.apache.poi.hssf.eventusermodel.dummyrecord.LastCellOfRowDummyRecord;
import org.apache.poi.hssf.eventusermodel.dummyrecord.MissingCellDummyRecord;
import org.apache.poi.hssf.model.HSSFFormulaParser;
import org.apache.poi.hssf.record.BOFRecord;
import org.apache.poi.hssf.record.BlankRecord;
import org.apache.poi.hssf.record.BoolErrRecord;
import org.apache.poi.hssf.record.BoundSheetRecord;
import org.apache.poi.hssf.record.FormulaRecord;
import org.apache.poi.hssf.record.LabelRecord;
import org.apache.poi.hssf.record.LabelSSTRecord;
import org.apache.poi.hssf.record.NoteRecord;
import org.apache.poi.hssf.record.NumberRecord;
import org.apache.poi.hssf.record.RKRecord;
import org.apache.poi.hssf.record.Record;
import org.apache.poi.hssf.record.SSTRecord;
import org.apache.poi.hssf.record.StringRecord;
import org.apache.poi.hssf.usermodel.HSSFWorkbook;
import org.apache.poi.poifs.filesystem.POIFSFileSystem;
 
/**
 * 用于解析大于5M的后缀为.xls的Excel文件,防止内存溢出
 * @author sundy
 * 参考:http://gaosheng08.iteye.com/blog/624604
 */
public class ParseXlsExcel implements HSSFListener{
    private int minColumns;  
    private POIFSFileSystem fs;  
  
    private int lastRowNumber;  
    private int lastColumnNumber;  
  
    /** Should we output the formula, or the value it has? */  
    private boolean outputFormulaValues = true;  
  
    /** For parsing Formulas */  
    private SheetRecordCollectingListener workbookBuildingListener;  
    private HSSFWorkbook stubWorkbook;  
  
    // Records we pick up as we process  
    private SSTRecord sstRecord;  
    private FormatTrackingHSSFListener formatListener;  
  
    /** So we known which sheet we're on */  
    private int sheetIndex = -1;  
    private BoundSheetRecord[] orderedBSRs;  
    @SuppressWarnings("unchecked")  
    private ArrayList boundSheetRecords = new ArrayList();  
  
    // For handling formulas with string results  
    private int nextRow;  
    private int nextColumn;  
    private boolean outputNextStringRecord;  
  
    private int curRow;  
    private List<String> rowlist;  
    @SuppressWarnings( "unused")  
    private String sheetName;  
    
	private  List<String> contents = new ArrayList<String>();
    public ParseXlsExcel(POIFSFileSystem fs) throws Exception {  
        this.fs = fs;  
        this.minColumns = -1;  
        this.curRow = 0;  
        this.rowlist = new ArrayList<String>();  
    }  
  
    public ParseXlsExcel(String filename) throws Exception {  
        this(new POIFSFileSystem(new FileInputStream(filename)));  
    }  
    
    //excel记录行操作方法,以sheet索引,行索引和行元素列表为参数,对sheet的一行元素进行操作,元素为String类型  
    public void optRows(int sheetIndex,int curRow, List<String> rowlist){
     	StringBuffer sb = new StringBuffer();
        for (int i = 0 ;i< rowlist.size();i++){  
            String str = rowlist.get(i);
           if(null!=str&&!"".equals(str)) sb.append(str.trim()+" ");
        }  
        contents.add(sb.toString());
    }
    /**读取到的Excel文档,返回文档的content*/  
    public List<String> getList() {
		return contents;
	}
    
    /** 
     * 遍历 excel 文件 
     */  
    public void process() throws IOException {  
        MissingRecordAwareHSSFListener listener = new MissingRecordAwareHSSFListener(this);  
        formatListener = new FormatTrackingHSSFListener(listener);  
        HSSFEventFactory factory = new HSSFEventFactory();  
        HSSFRequest request = new HSSFRequest();  
        if (outputFormulaValues) {  
            request.addListenerForAllRecords(formatListener);  
        } else {  
            workbookBuildingListener = new SheetRecordCollectingListener(formatListener);  
            request.addListenerForAllRecords(workbookBuildingListener);  
        }  
        factory.processWorkbookEvents(request, fs);  
    }  
      
    /** 
     * HSSFListener 监听方法,处理 Record 
     */  
    @SuppressWarnings("unchecked")  
    public void processRecord(Record record) {  
        int thisRow = -1;  
        int thisColumn = -1;  
        String thisStr = null;  
        String value = null;  
          
        switch (record.getSid()) {  
        case BoundSheetRecord.sid:  
            boundSheetRecords.add(record);  
            break;  
        case BOFRecord.sid:  
            BOFRecord br = (BOFRecord) record;  
            if (br.getType() == BOFRecord.TYPE_WORKSHEET) {  
                // Create sub workbook if required  
                if (workbookBuildingListener != null && stubWorkbook == null) {  
                    stubWorkbook = workbookBuildingListener.getStubHSSFWorkbook();  
                }  
                sheetIndex++;  
                if (orderedBSRs == null) {  
                    orderedBSRs = BoundSheetRecord.orderByBofPosition(boundSheetRecords);  
                }  
                sheetName = orderedBSRs[sheetIndex].getSheetname();  
            }  
            break;  
  
        case SSTRecord.sid:  
            sstRecord = (SSTRecord) record;  
            break;  
  
        case BlankRecord.sid:  
            BlankRecord brec = (BlankRecord) record;  
  
            thisRow = brec.getRow();  
            thisColumn = brec.getColumn();  
            thisStr = "";  
            break;  
        case BoolErrRecord.sid:  
            BoolErrRecord berec = (BoolErrRecord) record;  
  
            thisRow = berec.getRow();  
            thisColumn = berec.getColumn();  
            thisStr = "";  
            break;  
  
        case FormulaRecord.sid:  
            FormulaRecord frec = (FormulaRecord) record;  
  
            thisRow = frec.getRow();  
            thisColumn = frec.getColumn();  
  
            if (outputFormulaValues) {  
                if (Double.isNaN(frec.getValue())) {  
                    outputNextStringRecord = true;  
                    nextRow = frec.getRow();  
                    nextColumn = frec.getColumn();  
                } else {  
                    thisStr = formatListener.formatNumberDateCell(frec);  
                }  
            } else {  
                thisStr = '"' + HSSFFormulaParser.toFormulaString(stubWorkbook, frec.getParsedExpression()) + '"';  
            }  
            break;  
        case StringRecord.sid:  
            if (outputNextStringRecord) {  
                // String for formula  
                StringRecord srec = (StringRecord) record;  
                thisStr = srec.getString();  
                thisRow = nextRow;  
                thisColumn = nextColumn;  
                outputNextStringRecord = false;  
            }  
            break;  
  
        case LabelRecord.sid:  
            LabelRecord lrec = (LabelRecord) record;  
  
            curRow = thisRow = lrec.getRow();  
            thisColumn = lrec.getColumn();  
            value = lrec.getValue().trim();  
            value = value.equals("")?"":value;  
//            this.rowlist.add(thisColumn, value);  
            rowlist.add(value);
            break;  
        case LabelSSTRecord.sid:  
            LabelSSTRecord lsrec = (LabelSSTRecord) record;  
  
            curRow = thisRow = lsrec.getRow();  
            thisColumn = lsrec.getColumn();  
            if (sstRecord != null){  
                value =  sstRecord  
                .getString(lsrec.getSSTIndex()).toString().trim();  
                value = value.equals("")?"":value;  
                rowlist.add(value);
            }  
            break;  
        case NoteRecord.sid:  
            NoteRecord nrec = (NoteRecord) record;  
  
            thisRow = nrec.getRow();  
            thisColumn = nrec.getColumn();  
            thisStr = '"' + "(TODO)" + '"';  
            break;  
        case NumberRecord.sid:  
            NumberRecord numrec = (NumberRecord) record;  
  
            curRow = thisRow = numrec.getRow();  
            thisColumn = numrec.getColumn();  
            value = formatListener.formatNumberDateCell(numrec).trim();  
            value = value.equals("")?"":value;  
            rowlist.add(value);
            break;  
        case RKRecord.sid:  
            RKRecord rkrec = (RKRecord) record;  
  
            thisRow = rkrec.getRow();  
            thisColumn = rkrec.getColumn();  
            thisStr = '"' + "(TODO)" + '"';  
            break;  
        default:  
            break;  
        }  
  
        // 遇到新行的操作  
        if (thisRow != -1 && thisRow != lastRowNumber) {  
            lastColumnNumber = -1;  
        }  
  
        // 空值的操作  
        if (record instanceof MissingCellDummyRecord) {  
            MissingCellDummyRecord mc = (MissingCellDummyRecord) record;  
            curRow = thisRow = mc.getRow();  
            thisColumn = mc.getColumn();  
//            rowlist.add(thisColumn," ");  
        }  
  
        // 更新行和列的值  
        if (thisRow > -1)  
            lastRowNumber = thisRow;  
        if (thisColumn > -1)  
            lastColumnNumber = thisColumn;  
  
        // 行结束时的操作  
        if (record instanceof LastCellOfRowDummyRecord) {  
            if (minColumns > 0) {  
                // 列值重新置空  
                if (lastColumnNumber == -1) {  
                    lastColumnNumber = 0;  
                }  
            }  
            // 行结束时, 调用 optRows() 方法  
            lastColumnNumber = -1;  
            try {  
                optRows(sheetIndex,curRow, rowlist);  
            } catch (Exception e) {  
                e.printStackTrace();  
            }  
            rowlist.clear();  
        }  
    }  
    public static void main(String[] args) throws Exception {
    	
		ParseXlsExcel excel = new ParseXlsExcel("F:\\srv\\tiptop\\crawl\\www.anxiang.gov.cn\\Excel\\3fbc7e659d1f42311ee89c6d47fac414.xls");
		excel.process();  
        int size = excel.getList().size();
        List<String> list2 = excel.getList();
        String reg = "楚源收储点";
        Pattern pattern = Pattern.compile(reg);
        int x = 0;
        for (int i = 0; i < size; i++) {
        	String replace = list2.get(i).trim().replace("","");
        	if(null!=replace&&!"".equals(replace)){
        		if(pattern.matcher(replace).find()) x+=1;	
        			System.out.println(replace);
        	} 
		}
        System.err.println(" 存在关键字【"+reg+"】的个数: "+x);
    }
}

你可能感兴趣的:(Excel大文件时读取内存溢出的解决方案)