使用SAX处理大XML文件 

之前一直都是使用dom4j来处理XML文件,比较方便,就好像使用htmlcleaner来处理html文件一样。

但dom4j解析XML文件有一个缺点:它需要先加载整个文件才能处理。对于大的XML文件,全部加载到内存并生成相应的结构时开销很大,例如40M的文件载入内存需要消耗300多M的内存,性能极差。

后来改用SAX来解析大的XML文件,性能得到很大的提升。主要的实现代码如下:

 

/**
 * SAX handler that streams through an XML document and processes it one
 * {@code <doc>} element at a time, so the whole file never has to be held in
 * memory.
 *
 * <p>For every {@code <doc>} element the text of the child elements listed in
 * {@link #keys} ({@code id}, {@code title}, {@code tags}) is collected into a
 * map. When the {@code <doc>} element ends, the map is passed to the configured
 * {@link CheatCheck} (together with the {@link Analyzer}) and then to the
 * {@link WriteXml} writer, after which it is cleared for the next record.
 *
 * <p>Collaborators that were not injected through the setters are created with
 * defaults in {@link #startDocument()}.
 *
 * <p>This handler keeps per-parse state in instance fields; use one instance
 * per parse.
 */
public class ReadXml extends DefaultHandler {
    private Analyzer analyzer;
    private CheatCheck cheatCheck;
    /** Name of the key element whose text is currently being collected. */
    private String currentKey;
    private final String inputFileName;
    /** True while the parser is inside one of the elements listed in {@link #keys}. */
    private boolean isKeys = false;
    Map<String, Integer> keyNumMap = new HashMap<String, Integer>();
    /** Element names whose text content is captured for each {@code <doc>}. */
    private final String keys[] = { "id", "title", "tags" };
    /** Values captured for the {@code <doc>} element currently being parsed. */
    private final Map<String, Object> video = new HashMap<String, Object>();
    private WriteXml write;

    /**
     * Accumulates the character data of the current key element. A SAX parser
     * is allowed to report one element's text through several
     * {@link #characters} callbacks, so the value must be assembled here and
     * only stored once the element is complete.
     */
    private final StringBuilder textBuffer = new StringBuilder();
    /** True once at least one {@link #characters} callback arrived for the current key. */
    private boolean sawText = false;

    /**
     * Creates a handler for the given input file.
     *
     * @param inputFileName path of the XML file that is going to be parsed
     * @throws FileNotFoundException if no file exists at {@code inputFileName}
     */
    public ReadXml(String inputFileName) throws FileNotFoundException {
        this.inputFileName = inputFileName;
        File tempFile = new File(inputFileName);
        if (!tempFile.exists()) {
            throw new FileNotFoundException("文件不存在");
        }
    }

    /**
     * Buffers character data while inside a key element.
     *
     * <p>The text is appended rather than stored directly because the parser
     * may split contiguous text into multiple chunks; storing only the first
     * chunk would silently truncate values.
     */
    @Override
    public void characters(char[] ch, int start, int length) throws SAXException {
        super.characters(ch, start, length);
        if (isKeys) {
            textBuffer.append(ch, start, length);
            sawText = true;
        }
    }

    /** Signals the writer that the document is finished. */
    @Override
    public void endDocument() throws SAXException {
        super.endDocument();
        write.end();
    }

    /**
     * Commits the buffered text of a key element and, at the end of a
     * {@code <doc>} element, hands the collected record to the cheat check and
     * the writer before clearing it.
     */
    @Override
    public void endElement(String uri, String localName, String qName) throws SAXException {
        super.endElement(uri, localName, qName);
        // If a key element is still open, the element ending here must be that
        // key: any nested start tag would already have flushed and closed it.
        flushCurrentKey();
        if (qName.equals("doc")) {
            keyNumMap.clear();
            cheatCheck.handle(video, analyzer);
            write.write(video, "doc");
            video.clear();
        }
    }

    public void setAnalyzer(Analyzer analyzer) {
        this.analyzer = analyzer;
    }

    public void setCheatCheck(CheatCheck cheatCheck) {
        this.cheatCheck = cheatCheck;
    }

    public void setWrite(WriteXml write) {
        this.write = write;
    }

    /**
     * Fills in default collaborators for any that were not injected: a writer
     * targeting {@code <currentTimeMillis>.out} with root name {@code docs}, a
     * {@link SimpleAnalyzer} and a {@link SimpleCheatCheck}.
     */
    @Override
    public void startDocument() throws SAXException {
        super.startDocument();
        if (write == null) {
            write = new WriteXml(System.currentTimeMillis() + ".out", "docs");
        }
        if (analyzer == null) {
            analyzer = new SimpleAnalyzer();
        }
        if (cheatCheck == null) {
            cheatCheck = new SimpleCheatCheck();
        }
    }

    /**
     * Resets the record at the start of a {@code <doc>} element and begins
     * text capture when one of the key elements starts.
     */
    @Override
    public void startElement(String uri, String localName, String qName, Attributes attributes)
            throws SAXException {
        super.startElement(uri, localName, qName, attributes);
        // A new element ends capture for any key that is still open; keep the
        // text gathered so far instead of dropping it.
        flushCurrentKey();
        if (qName.equals("doc")) {
            video.clear();
            return;
        }
        for (String key : keys) {
            if (key.equals(qName)) {
                currentKey = qName;
                isKeys = true;
                return;
            }
        }
    }

    /**
     * Stores the trimmed buffered text under {@link #currentKey} if a key
     * element is open and produced character data, then resets capture state.
     * An element that produced no character data leaves the map untouched.
     */
    private void flushCurrentKey() {
        if (isKeys && sawText) {
            video.put(currentKey, textBuffer.toString().trim());
        }
        isKeys = false;
        sawText = false;
        textBuffer.setLength(0);
    }
}

你可能感兴趣的:(使用SAX处理大XML文件 )