package com.sundy.parse.util;
import java.io.File;
import java.io.IOException;
import java.io.InputStream;
import java.util.ArrayList;
import org.apache.poi.openxml4j.exceptions.InvalidFormatException;
import org.apache.poi.openxml4j.exceptions.OpenXML4JException;
import org.apache.poi.openxml4j.opc.OPCPackage;
import org.apache.poi.openxml4j.opc.PackageAccess;
import org.apache.poi.xssf.eventusermodel.XSSFReader;
import org.apache.poi.xssf.model.SharedStringsTable;
import org.apache.poi.xssf.usermodel.XSSFRichTextString;
import org.xml.sax.Attributes;
import org.xml.sax.InputSource;
import org.xml.sax.SAXException;
import org.xml.sax.XMLReader;
import org.xml.sax.helpers.DefaultHandler;
import org.xml.sax.helpers.XMLReaderFactory;
/**
 * Streaming (SAX based) reader that collects the cell values of an .xlsx workbook.
 *
 * <p>Every {@code <v>} element found in a sheet contributes one entry. Cells whose
 * {@code t} attribute is {@code "s"} hold an index into the shared strings table and
 * are resolved to the referenced text; all other values are kept as written in the file.
 *
 * <p>Parsing happens inside the constructors. The underlying package stays open until
 * {@link #close()} is called; if a constructor fails, the package is closed before the
 * exception propagates.
 */
public class ParseXlsxExcel extends DefaultHandler {

    private OPCPackage opcPackage = null;
    private XSSFReader xssfReader = null;

    /** True while the cell being read stores a shared-string index ({@code t="s"}). */
    private boolean nextIsStr;

    /** True between the start and the end of a {@code <v>} element. */
    private boolean insideValue;

    /** Text of the current {@code <v>} element; SAX may deliver it in several chunks. */
    private final StringBuilder valueBuffer = new StringBuilder();

    /** Values of the sheet parsed most recently. */
    private ArrayList<String> sheetList;

    /** Values of all sheets, filled only when the whole workbook is parsed. */
    private ArrayList<String> excelList;

    private SharedStringsTable sst;
    private XMLReader reader;

    /**
     * Opens the workbook at {@code path} and parses one sheet.
     *
     * @param path    location of the .xlsx file
     * @param sheetId number appended to {@code "rId"} to form the relationship id of the sheet
     * @throws IOException         if the file or sheet cannot be read
     * @throws OpenXML4JException  if the package cannot be opened or interpreted
     * @throws SAXException        if the sheet XML cannot be parsed
     */
    public ParseXlsxExcel(String path, int sheetId) throws IOException, OpenXML4JException, SAXException {
        this(new File(path), sheetId);
    }

    /**
     * Opens {@code file} and parses one sheet.
     *
     * @param file    the .xlsx file
     * @param sheetId number appended to {@code "rId"} to form the relationship id of the sheet
     * @throws IOException         if the file or sheet cannot be read
     * @throws OpenXML4JException  if the package cannot be opened or interpreted
     * @throws SAXException        if the sheet XML cannot be parsed
     */
    public ParseXlsxExcel(File file, int sheetId) throws IOException, OpenXML4JException, SAXException {
        boolean loaded = false;
        try {
            init(file);
            parseSheet(sheetId);
            loaded = true;
        } finally {
            // Without this the package would stay open with no object left to close it.
            if (!loaded) {
                closeQuietly();
            }
        }
    }

    /**
     * Opens {@code file} and parses either every sheet or only the sheet with id {@code rId1}.
     *
     * @param file the .xlsx file
     * @param flag {@code true} to parse all sheets into {@link #getExcelList()},
     *             {@code false} to parse only {@code rId1} into {@link #getSheetList()}
     * @throws IOException         if the file or a sheet cannot be read
     * @throws OpenXML4JException  if the package cannot be opened or interpreted
     * @throws SAXException        if a sheet's XML cannot be parsed
     */
    public ParseXlsxExcel(File file, boolean flag) throws IOException, OpenXML4JException, SAXException {
        initAll(file, flag);
    }

    /**
     * Opens the workbook at {@code path}; see {@link #ParseXlsxExcel(File, boolean)}.
     *
     * @param path location of the .xlsx file
     * @param flag {@code true} to parse all sheets, {@code false} to parse only {@code rId1}
     * @throws IOException         if the file or a sheet cannot be read
     * @throws OpenXML4JException  if the package cannot be opened or interpreted
     * @throws SAXException        if a sheet's XML cannot be parsed
     */
    public ParseXlsxExcel(String path, boolean flag) throws IOException, OpenXML4JException, SAXException {
        this(new File(path), flag);
    }

    /** Opens the package and parses it, closing the package again if anything fails. */
    private void initAll(File file, boolean flag)
            throws InvalidFormatException, IOException, OpenXML4JException, SAXException {
        boolean loaded = false;
        try {
            init(file);
            if (flag) {
                parseAllSheet();
            } else {
                parseSheet(1);
            }
            loaded = true;
        } finally {
            if (!loaded) {
                closeQuietly();
            }
        }
    }

    /**
     * Parses every sheet of the workbook and appends its values to {@code excelList}.
     *
     * <p>The sheets are taken from {@code XSSFReader.getSheetsData()} instead of probing
     * {@code rId1, rId2, ...} until a lookup fails: relationship ids are not guaranteed to
     * be sequential or to refer to sheets only, and a failing probe used to hide genuine
     * parse errors. Errors now propagate to the caller.
     */
    private void parseAllSheet() throws IOException, OpenXML4JException, SAXException {
        java.util.Iterator<InputStream> sheets = xssfReader.getSheetsData();
        while (sheets.hasNext()) {
            parseStream(sheets.next());
            excelList.addAll(sheetList);
        }
    }

    /** Opens the package read-only and prepares the shared strings table and the SAX reader. */
    private void init(File file) throws IOException, OpenXML4JException, SAXException {
        opcPackage = OPCPackage.open((file), PackageAccess.READ);
        sheetList = new ArrayList<String>();
        excelList = new ArrayList<String>();
        xssfReader = new XSSFReader(opcPackage);
        sst = xssfReader.getSharedStringsTable();
        // NOTE(review): names the Xerces parser class explicitly, so that class presumably
        // has to be on the classpath - confirm before changing.
        reader = XMLReaderFactory.createXMLReader("org.apache.xerces.parsers.SAXParser");
        reader.setContentHandler(this);
    }

    /**
     * Closes the underlying package. The lists already collected remain usable.
     *
     * @throws IOException if closing the package fails
     */
    public void close() throws IOException {
        if (opcPackage != null) {
            opcPackage.close();
        }
    }

    /** Closes the package on a failure path without masking the exception that caused it. */
    private void closeQuietly() {
        try {
            close();
        } catch (IOException ignored) {
            // The original failure is the one the caller needs to see.
        }
    }

    /** Parses the sheet whose relationship id is {@code "rId" + sheetId} into {@code sheetList}. */
    private void parseSheet(int sheetId) throws IOException, OpenXML4JException, SAXException {
        parseStream(xssfReader.getSheet("rId" + sheetId));
    }

    /** Replaces {@code sheetList} with the values read from one sheet stream and closes the stream. */
    private void parseStream(InputStream inStream) throws IOException, SAXException {
        sheetList.clear();
        try {
            reader.parse(new InputSource(inStream));
        } finally {
            inStream.close();
        }
    }

    /**
     * Tracks the type of the current cell and starts a fresh buffer for each {@code <v>} element.
     */
    @Override
    public void startElement(String uri, String localName, String qName, Attributes attributes)
            throws SAXException {
        if ("c".equals(qName)) {
            nextIsStr = "s".equals(attributes.getValue("t"));
        } else if ("v".equals(qName)) {
            valueBuffer.setLength(0);
            insideValue = true;
        }
    }

    /**
     * Accumulates the text of the current {@code <v>} element. Text of other elements is
     * ignored so that, for example, formula text cannot be mistaken for a cell value.
     */
    @Override
    public void characters(char[] ch, int start, int length) throws SAXException {
        if (insideValue) {
            valueBuffer.append(ch, start, length);
        }
    }

    /** Stores the finished value of a {@code <v>} element, resolving shared strings. */
    @Override
    public void endElement(String uri, String localName, String qName) throws SAXException {
        if (!"v".equals(qName)) {
            return;
        }
        insideValue = false;
        String text = valueBuffer.toString();
        if (nextIsStr && text.length() > 0) {
            text = resolveSharedString(text);
        }
        sheetList.add(text);
    }

    /**
     * Looks up a shared-string index. If the text is not a valid index into the table the
     * raw text is returned unchanged, so one bad cell does not abort the whole sheet.
     */
    private String resolveSharedString(String rawIndex) {
        try {
            int idx = Integer.parseInt(rawIndex);
            return new XSSFRichTextString(sst.getEntryAt(idx)).toString();
        } catch (RuntimeException ignored) {
            return rawIndex;
        }
    }

    /**
     * @return the values of the sheet parsed most recently
     */
    public ArrayList<String> getSheetList() {
        return sheetList;
    }

    /**
     * @return the values of all sheets; empty unless the workbook was opened with {@code flag == true}
     */
    public ArrayList<String> getExcelList() {
        return excelList;
    }

    /** Manual smoke test: prints the number of collected values and the elapsed milliseconds. */
    public static void main(String[] args) throws Exception {
        long start = System.currentTimeMillis();
        ParseXlsxExcel excel = new ParseXlsxExcel("C:/Users/admin/Desktop/大师傅家的感觉.xlsx", true);
        ArrayList<String> list;
        try {
            list = excel.getExcelList();
        } finally {
            excel.close();
        }
        long end = System.currentTimeMillis();
        System.out.println(list.size());
        System.out.println(end - start);
    }
}
package com.sundy.parse.util;
import java.io.FileInputStream;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import java.util.regex.Pattern;
import org.apache.poi.hssf.eventusermodel.EventWorkbookBuilder.SheetRecordCollectingListener;
import org.apache.poi.hssf.eventusermodel.FormatTrackingHSSFListener;
import org.apache.poi.hssf.eventusermodel.HSSFEventFactory;
import org.apache.poi.hssf.eventusermodel.HSSFListener;
import org.apache.poi.hssf.eventusermodel.HSSFRequest;
import org.apache.poi.hssf.eventusermodel.MissingRecordAwareHSSFListener;
import org.apache.poi.hssf.eventusermodel.dummyrecord.LastCellOfRowDummyRecord;
import org.apache.poi.hssf.eventusermodel.dummyrecord.MissingCellDummyRecord;
import org.apache.poi.hssf.model.HSSFFormulaParser;
import org.apache.poi.hssf.record.BOFRecord;
import org.apache.poi.hssf.record.BlankRecord;
import org.apache.poi.hssf.record.BoolErrRecord;
import org.apache.poi.hssf.record.BoundSheetRecord;
import org.apache.poi.hssf.record.FormulaRecord;
import org.apache.poi.hssf.record.LabelRecord;
import org.apache.poi.hssf.record.LabelSSTRecord;
import org.apache.poi.hssf.record.NoteRecord;
import org.apache.poi.hssf.record.NumberRecord;
import org.apache.poi.hssf.record.RKRecord;
import org.apache.poi.hssf.record.Record;
import org.apache.poi.hssf.record.SSTRecord;
import org.apache.poi.hssf.record.StringRecord;
import org.apache.poi.hssf.usermodel.HSSFWorkbook;
import org.apache.poi.poifs.filesystem.POIFSFileSystem;
public class ParseXlsExcel implements HSSFListener{
private int minColumns;
private POIFSFileSystem fs;
private int lastRowNumber;
private int lastColumnNumber;
private boolean outputFormulaValues = true;
private SheetRecordCollectingListener workbookBuildingListener;
private HSSFWorkbook stubWorkbook;
private SSTRecord sstRecord;
private FormatTrackingHSSFListener formatListener;
private int sheetIndex = -1;
private BoundSheetRecord[] orderedBSRs;
@SuppressWarnings("unchecked")
private ArrayList boundSheetRecords = new ArrayList();
private int nextRow;
private int nextColumn;
private boolean outputNextStringRecord;
private int curRow;
private List<String> rowlist;
@SuppressWarnings( "unused")
private String sheetName;
private List<String> contents = new ArrayList<String>();
public ParseXlsExcel(POIFSFileSystem fs) throws Exception {
this.fs = fs;
this.minColumns = -1;
this.curRow = 0;
this.rowlist = new ArrayList<String>();
}
public ParseXlsExcel(String filename) throws Exception {
this(new POIFSFileSystem(new FileInputStream(filename)));
}
public void optRows(int sheetIndex,int curRow, List<String> rowlist){
StringBuffer sb = new StringBuffer();
for (int i = 0 ;i< rowlist.size();i++){
String str = rowlist.get(i);
if(null!=str&&!"".equals(str)) sb.append(str.trim()+" ");
}
contents.add(sb.toString());
}
public List<String> getList() {
return contents;
}
public void process() throws IOException {
MissingRecordAwareHSSFListener listener = new MissingRecordAwareHSSFListener(this);
formatListener = new FormatTrackingHSSFListener(listener);
HSSFEventFactory factory = new HSSFEventFactory();
HSSFRequest request = new HSSFRequest();
if (outputFormulaValues) {
request.addListenerForAllRecords(formatListener);
} else {
workbookBuildingListener = new SheetRecordCollectingListener(formatListener);
request.addListenerForAllRecords(workbookBuildingListener);
}
factory.processWorkbookEvents(request, fs);
}
@SuppressWarnings("unchecked")
public void processRecord(Record record) {
int thisRow = -1;
int thisColumn = -1;
String thisStr = null;
String value = null;
switch (record.getSid()) {
case BoundSheetRecord.sid:
boundSheetRecords.add(record);
break;
case BOFRecord.sid:
BOFRecord br = (BOFRecord) record;
if (br.getType() == BOFRecord.TYPE_WORKSHEET) {
if (workbookBuildingListener != null && stubWorkbook == null) {
stubWorkbook = workbookBuildingListener.getStubHSSFWorkbook();
}
sheetIndex++;
if (orderedBSRs == null) {
orderedBSRs = BoundSheetRecord.orderByBofPosition(boundSheetRecords);
}
sheetName = orderedBSRs[sheetIndex].getSheetname();
}
break;
case SSTRecord.sid:
sstRecord = (SSTRecord) record;
break;
case BlankRecord.sid:
BlankRecord brec = (BlankRecord) record;
thisRow = brec.getRow();
thisColumn = brec.getColumn();
thisStr = "";
break;
case BoolErrRecord.sid:
BoolErrRecord berec = (BoolErrRecord) record;
thisRow = berec.getRow();
thisColumn = berec.getColumn();
thisStr = "";
break;
case FormulaRecord.sid:
FormulaRecord frec = (FormulaRecord) record;
thisRow = frec.getRow();
thisColumn = frec.getColumn();
if (outputFormulaValues) {
if (Double.isNaN(frec.getValue())) {
outputNextStringRecord = true;
nextRow = frec.getRow();
nextColumn = frec.getColumn();
} else {
thisStr = formatListener.formatNumberDateCell(frec);
}
} else {
thisStr = '"' + HSSFFormulaParser.toFormulaString(stubWorkbook, frec.getParsedExpression()) + '"';
}
break;
case StringRecord.sid:
if (outputNextStringRecord) {
StringRecord srec = (StringRecord) record;
thisStr = srec.getString();
thisRow = nextRow;
thisColumn = nextColumn;
outputNextStringRecord = false;
}
break;
case LabelRecord.sid:
LabelRecord lrec = (LabelRecord) record;
curRow = thisRow = lrec.getRow();
thisColumn = lrec.getColumn();
value = lrec.getValue().trim();
value = value.equals("")?"":value;
rowlist.add(value);
break;
case LabelSSTRecord.sid:
LabelSSTRecord lsrec = (LabelSSTRecord) record;
curRow = thisRow = lsrec.getRow();
thisColumn = lsrec.getColumn();
if (sstRecord != null){
value = sstRecord
.getString(lsrec.getSSTIndex()).toString().trim();
value = value.equals("")?"":value;
rowlist.add(value);
}
break;
case NoteRecord.sid:
NoteRecord nrec = (NoteRecord) record;
thisRow = nrec.getRow();
thisColumn = nrec.getColumn();
thisStr = '"' + "(TODO)" + '"';
break;
case NumberRecord.sid:
NumberRecord numrec = (NumberRecord) record;
curRow = thisRow = numrec.getRow();
thisColumn = numrec.getColumn();
value = formatListener.formatNumberDateCell(numrec).trim();
value = value.equals("")?"":value;
rowlist.add(value);
break;
case RKRecord.sid:
RKRecord rkrec = (RKRecord) record;
thisRow = rkrec.getRow();
thisColumn = rkrec.getColumn();
thisStr = '"' + "(TODO)" + '"';
break;
default:
break;
}
if (thisRow != -1 && thisRow != lastRowNumber) {
lastColumnNumber = -1;
}
if (record instanceof MissingCellDummyRecord) {
MissingCellDummyRecord mc = (MissingCellDummyRecord) record;
curRow = thisRow = mc.getRow();
thisColumn = mc.getColumn();
}
if (thisRow > -1)
lastRowNumber = thisRow;
if (thisColumn > -1)
lastColumnNumber = thisColumn;
if (record instanceof LastCellOfRowDummyRecord) {
if (minColumns > 0) {
if (lastColumnNumber == -1) {
lastColumnNumber = 0;
}
}
lastColumnNumber = -1;
try {
optRows(sheetIndex,curRow, rowlist);
} catch (Exception e) {
e.printStackTrace();
}
rowlist.clear();
}
}
public static void main(String[] args) throws Exception {
ParseXlsExcel excel = new ParseXlsExcel("F:\\srv\\tiptop\\crawl\\www.anxiang.gov.cn\\Excel\\3fbc7e659d1f42311ee89c6d47fac414.xls");
excel.process();
int size = excel.getList().size();
List<String> list2 = excel.getList();
String reg = "楚源收储点";
Pattern pattern = Pattern.compile(reg);
int x = 0;
for (int i = 0; i < size; i++) {
String replace = list2.get(i).trim().replace("","");
if(null!=replace&&!"".equals(replace)){
if(pattern.matcher(replace).find()) x+=1;
System.out.println(replace);
}
}
System.err.println(" 存在关键字【"+reg+"】的个数: "+x);
}
}