今天一直在看两个纯js的parser,天哪我还是水平不够,继续努力。
第一个是 John Resig 写的比较完善的版本
/*
 * HTML Parser By John Resig (ejohn.org)
 * Original code by Erik Arvidsson, Mozilla Public License
 * http://erik.eae.net/simplehtmlparser/simplehtmlparser.js
 *
 * // Use like so:
 * HTMLParser(htmlString, {
 *     start: function(tag, attrs, unary) {},
 *     end: function(tag) {},
 *     chars: function(text) {},
 *     comment: function(text) {}
 * });
 *
 * // or to get an XML string:
 * HTMLtoXML(htmlString);
 *
 * // or to get an XML DOM Document
 * HTMLtoDOM(htmlString);
 *
 * // or to inject into an existing document/DOM node
 * HTMLtoDOM(htmlString, document);
 * HTMLtoDOM(htmlString, document.body);
 *
 */

(function(){

	// Object the public functions are attached to. In sloppy-mode scripts
	// `this` is the global object (the original behaviour); in strict mode
	// or ES modules `this` is undefined, so fall back to the global object.
	var root = this ||
		( typeof globalThis != "undefined" ? globalThis :
		  typeof window != "undefined" ? window :
		  typeof self != "undefined" ? self :
		  typeof global != "undefined" ? global : {} );

	// Regular Expressions for parsing tags and attributes.
	// Attribute names accept the same characters in startTag and attr, so
	// hyphenated names such as data-x are recognised by both.
	var startTag = /^<([-A-Za-z0-9_]+)((?:\s+[-A-Za-z0-9_]+(?:\s*=\s*(?:(?:"[^"]*")|(?:'[^']*')|[^>\s]+))?)*)\s*(\/?)>/,
		endTag = /^<\/([-A-Za-z0-9_]+)[^>]*>/,
		attr = /([-A-Za-z0-9_]+)(?:\s*=\s*(?:(?:"((?:\\.|[^"])*)")|(?:'((?:\\.|[^'])*)')|([^>\s]+)))?/g;

	// Empty Elements - HTML 4.01
	var empty = makeMap("area,base,basefont,br,col,frame,hr,img,input,isindex,link,meta,param,embed");

	// Block Elements - HTML 4.01
	var block = makeMap("address,applet,blockquote,button,center,dd,del,dir,div,dl,dt,fieldset,form,frameset,hr,iframe,ins,isindex,li,map,menu,noframes,noscript,object,ol,p,pre,script,table,tbody,td,tfoot,th,thead,tr,ul");

	// Inline Elements - HTML 4.01
	var inline = makeMap("a,abbr,acronym,applet,b,basefont,bdo,big,br,button,cite,code,del,dfn,em,font,i,iframe,img,input,ins,kbd,label,map,object,q,s,samp,script,select,small,span,strike,strong,sub,sup,textarea,tt,u,var");

	// Elements that you can, intentionally, leave open
	// (and which close themselves)
	var closeSelf = makeMap("colgroup,dd,dt,li,option,p,td,tfoot,th,thead,tr");

	// Attributes that have their values filled in disabled="disabled"
	var fillAttrs = makeMap("checked,compact,declare,defer,disabled,ismap,multiple,nohref,noresize,noshade,nowrap,readonly,selected");

	// Special Elements (can contain anything)
	var special = makeMap("script,style");

	/**
	 * Tokenises an HTML string and reports what it finds to `handler`.
	 *
	 * @param {string} html     Markup to parse.
	 * @param {Object} handler  Optional callbacks:
	 *     start(tagName, attrs, unary), end(tagName), chars(text), comment(text).
	 *     Tag names are reported lower-cased; attrs is an array of
	 *     {name, value, escaped} objects.
	 * @throws {string} "Parse Error: ..." if an iteration consumes no input.
	 *     (A string is thrown, as in the original, to stay compatible.)
	 */
	var HTMLParser = root.HTMLParser = function( html, handler ) {
		var index, chars, match, text, specialName, stack = [], last = html;
		stack.last = function(){
			return this[ this.length - 1 ];
		};

		while ( html ) {
			chars = true;

			// Make sure we're not in a script or style element
			if ( !stack.last() || !special[ stack.last() ] ) {

				// Comment
				if ( html.indexOf("<!--") == 0 ) {
					index = html.indexOf("-->");

					if ( index >= 0 ) {
						if ( handler.comment )
							handler.comment( html.substring( 4, index ) );
						html = html.substring( index + 3 );
						chars = false;
					}

				// end tag
				} else if ( html.indexOf("</") == 0 ) {
					match = html.match( endTag );

					if ( match ) {
						html = html.substring( match[0].length );
						parseEndTag( match[0], match[1] );
						chars = false;
					}

				// start tag
				} else if ( html.indexOf("<") == 0 ) {
					match = html.match( startTag );

					if ( match ) {
						html = html.substring( match[0].length );
						parseStartTag( match[0], match[1], match[2], match[3] );
						chars = false;
					}
				}

				if ( chars ) {
					// A leading "<" here did not start valid markup; search
					// from the next character so it is emitted as text and
					// the loop always makes progress.
					index = html.indexOf( "<", html.charAt(0) == "<" ? 1 : 0 );

					text = index < 0 ? html : html.substring( 0, index );
					html = index < 0 ? "" : html.substring( index );

					if ( handler.chars )
						handler.chars( text );
				}

			} else {
				// Inside <script>/<style>: everything up to the first matching
				// close tag is raw text. [\s\S] lets the content span lines;
				// the lazy quantifier stops at the first close tag.
				specialName = stack.last();
				match = html.match( new RegExp("^([\\s\\S]*?)</" + specialName + "[^>]*>", "i") );

				// No close tag: treat the remainder of the input as content.
				text = match ? match[1] : html;
				html = match ? html.substring( match[0].length ) : "";

				text = text.replace(/<!--([\s\S]*?)-->/g, "$1")
					.replace(/<!\[CDATA\[([\s\S]*?)]]>/g, "$1");

				if ( handler.chars )
					handler.chars( text );

				parseEndTag( "", specialName );
			}

			// Safety net: an iteration that consumed nothing would loop forever
			if ( html == last )
				throw "Parse Error: " + html;
			last = html;
		}

		// Clean up any remaining tags
		parseEndTag();

		// Handles one start tag: applies the implicit-close rules, pushes the
		// element on the stack (unless unary) and reports it to handler.start.
		function parseStartTag( tag, tagName, rest, unary ) {
			tagName = tagName.toLowerCase();

			// A block element implicitly closes any open inline elements
			if ( block[ tagName ] ) {
				while ( stack.last() && inline[ stack.last() ] ) {
					parseEndTag( "", stack.last() );
				}
			}

			// e.g. a second <li> closes the previous, still open <li>
			if ( closeSelf[ tagName ] && stack.last() == tagName ) {
				parseEndTag( "", tagName );
			}

			unary = empty[ tagName ] || !!unary;

			if ( !unary )
				stack.push( tagName );

			if ( handler.start ) {
				var attrs = [];

				rest.replace(attr, function(match, name, dquoted, squoted, bare) {
					// Valueless boolean attributes get their own name as value
					var value = dquoted ? dquoted :
						squoted ? squoted :
						bare ? bare :
						fillAttrs[name] ? name : "";

					attrs.push({
						name: name,
						value: value,
						escaped: value.replace(/(^|[^\\])"/g, '$1\\\"') //"
					});
				});

				handler.start( tagName, attrs, unary );
			}
		}

		// Closes `tagName` and everything opened after it; with no tag name,
		// closes every element that is still open.
		function parseEndTag( tag, tagName ) {
			var pos, i;

			// If no tag name is provided, clean shop
			if ( !tagName ) {
				pos = 0;

			// Find the closest opened tag of the same type
			} else {
				// The stack holds lower-cased names, so compare in lower case
				tagName = tagName.toLowerCase();
				for ( pos = stack.length - 1; pos >= 0; pos-- )
					if ( stack[ pos ] == tagName )
						break;
			}

			if ( pos >= 0 ) {
				// Close all the open elements, up the stack
				for ( i = stack.length - 1; i >= pos; i-- )
					if ( handler.end )
						handler.end( stack[ i ] );

				// Remove the open elements from the stack
				stack.length = pos;
			}
		}
	};

	/**
	 * Re-serialises HTML as a well-formed tag stream: every element is
	 * closed, attribute values are double-quoted, empty elements end in "/>".
	 *
	 * @param {string} html
	 * @returns {string}
	 */
	root.HTMLtoXML = function( html ) {
		var results = "";

		HTMLParser(html, {
			start: function( tag, attrs, unary ) {
				results += "<" + tag;

				for ( var i = 0; i < attrs.length; i++ )
					results += " " + attrs[i].name + '="' + attrs[i].escaped + '"';

				results += (unary ? "/" : "") + ">";
			},
			end: function( tag ) {
				results += "</" + tag + ">";
			},
			chars: function( text ) {
				results += text;
			},
			comment: function( text ) {
				results += "<!--" + text + "-->";
			}
		});

		return results;
	};

	/**
	 * Parses HTML into DOM nodes.
	 *
	 * @param {string} html
	 * @param {Object} [doc]  A document, or a node whose owner document is
	 *     used. When omitted, a new document is created with whichever DOM
	 *     implementation is available.
	 * @returns {Object} The document the nodes were created in.
	 */
	root.HTMLtoDOM = function( html, doc ) {
		// There can be only one of these elements
		var one = makeMap("html,head,body,title");

		// Enforce a structure for the document
		var structure = createDict();
		structure.link = "head";
		structure.base = "head";

		if ( !doc ) {
			if ( typeof DOMDocument != "undefined" )
				doc = new DOMDocument();
			else if ( typeof document != "undefined" && document.implementation && document.implementation.createDocument )
				doc = document.implementation.createDocument("", "", null);
			else if ( typeof ActiveXObject != "undefined" )
				doc = new ActiveXObject("Msxml.DOMDocument");

		} else
			doc = doc.ownerDocument ||
				doc.getOwnerDocument && doc.getOwnerDocument() ||
				doc;

		var elems = [],
			documentElement = doc.documentElement ||
				doc.getDocumentElement && doc.getDocumentElement();

		// If we're dealing with an empty document then we
		// need to pre-populate it with the HTML document structure
		if ( !documentElement && doc.createElement ) {
			var htmlElem = doc.createElement("html");
			var headElem = doc.createElement("head");
			headElem.appendChild( doc.createElement("title") );
			htmlElem.appendChild( headElem );
			htmlElem.appendChild( doc.createElement("body") );
			doc.appendChild( htmlElem );
		}

		// Find all the unique elements
		if ( doc.getElementsByTagName )
			for ( var key in one )
				one[ key ] = doc.getElementsByTagName( key )[0];

		// If we're working with a document, inject contents into
		// the body element
		var rootNode = one.body;
		var curParentNode = rootNode;

		HTMLParser( html, {
			start: function( tagName, attrs, unary ) {
				// If it's a pre-built element, then we can ignore
				// its construction
				if ( one[ tagName ] ) {
					curParentNode = one[ tagName ];
					if ( !unary ) {
						elems.push( curParentNode );
					}
					return;
				}

				var elem = doc.createElement( tagName );

				// attrs is an array: iterate by index, not for...in
				for ( var i = 0; i < attrs.length; i++ )
					elem.setAttribute( attrs[ i ].name, attrs[ i ].value );

				var forced = structure[ tagName ] && one[ structure[ tagName ] ];

				if ( forced && forced.appendChild )
					forced.appendChild( elem );

				else if ( curParentNode && curParentNode.appendChild )
					curParentNode.appendChild( elem );

				if ( !unary ) {
					elems.push( elem );
					curParentNode = elem;
				}
			},
			end: function( tag ) {
				elems.pop();

				// Init the new parentNode; once every element is closed,
				// go back to the initial insertion point
				curParentNode = elems.length ? elems[ elems.length - 1 ] : rootNode;
			},
			chars: function( text ) {
				if ( curParentNode && curParentNode.appendChild )
					curParentNode.appendChild( doc.createTextNode( text ) );
			},
			comment: function( text ) {
				// create comment node
			}
		});

		return doc;
	};

	// Returns an object without a prototype when the engine supports it, so
	// lookups such as map["constructor"] cannot hit Object.prototype members.
	function createDict(){
		return Object.create ? Object.create(null) : {};
	}

	// Turns "a,b,c" into a lookup table { a: true, b: true, c: true }.
	function makeMap(str){
		var obj = createDict(), items = str.split(",");
		for ( var i = 0; i < items.length; i++ )
			obj[ items[i] ] = true;
		return obj;
	}
})();
用法:
// Usage example: feed HTMLParser a fragment with an unquoted attribute and
// unclosed tags, and rebuild well-formed markup from the callbacks.
var results = "";

HTMLParser("<p id=test>hello <i>world", {
	start: function( tag, attrs, unary ) {
		results += "<" + tag;

		for ( var i = 0; i < attrs.length; i++ )
			results += " " + attrs[i].name + '="' + attrs[i].escaped + '"';

		results += (unary ? "/" : "") + ">";
	},
	end: function( tag ) {
		results += "</" + tag + ">";
	},
	chars: function( text ) {
		results += text;
	},
	comment: function( text ) {
		results += "<!--" + text + "-->";
	}
});

// The parser quotes the attribute value and closes <i> and <p> itself
results == '<p id="test">hello <i>world</i></p>';
然后John提到他是在 Erik Arvidsson 的基础上做的,
又去看了 Erik 的版本:
// Copyright 2004 Erik Arvidsson. All Rights Reserved.
//
// This code is triple licensed using Apache Software License 2.0,
// Mozilla Public License or GNU Public License
//
///////////////////////////////////////////////////////////////////////////////
//
// Licensed under the Apache License, Version 2.0 (the "License"); you may not
// use this file except in compliance with the License.  You may obtain a copy
// of the License at http://www.apache.org/licenses/LICENSE-2.0
//
///////////////////////////////////////////////////////////////////////////////
//
// The contents of this file are subject to the Mozilla Public License
// Version 1.1 (the "License"); you may not use this file except in
// compliance with the License. You may obtain a copy of the License at
// http://www.mozilla.org/MPL/
//
// Software distributed under the License is distributed on an "AS IS"
// basis, WITHOUT WARRANTY OF ANY KIND, either express or implied. See the
// License for the specific language governing rights and limitations
// under the License.
//
// The Original Code is Simple HTML Parser.
//
// The Initial Developer of the Original Code is Erik Arvidsson.
// Portions created by Erik Arvidssson are Copyright (C) 2004. All Rights
// Reserved.
//
///////////////////////////////////////////////////////////////////////////////
//
// This program is free software; you can redistribute it and/or
// modify it under the terms of the GNU General Public License
// as published by the Free Software Foundation; either version 2
// of the License, or (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU General Public License
// along with this program; if not, write to the Free Software
// Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.
//
///////////////////////////////////////////////////////////////////////////////

/*
var handler ={
	startElement:   function (sTagName, oAttrs) {},
	endElement:     function (sTagName) {},
	characters:     function (s) {},
	comment:        function (s) {}
};
*/

/**
 * Minimal SAX-style HTML tokenizer. It keeps no element stack: tags are
 * reported to the content handler exactly as they appear in the input.
 */
function SimpleHtmlParser()
{
}

SimpleHtmlParser.prototype = {

	// Kept from the original; parse() stores its handler in
	// this.contentHandler, not here.
	handler:	null,

	// regexps
	// No /m flag: "^" must only match at the start of the remaining input,
	// otherwise a tag at the start of a later line would match and the text
	// in front of it would be dropped. Attribute names exclude "/" so the
	// slash of "<br />" is not reported as an attribute.

	startTagRe:	/^<([^>\s\/]+)((\s+[^=>\s\/]+(\s*=\s*((\"[^"]*\")|(\'[^']*\')|[^>\s]+))?)*)\s*\/?\s*>/,
	endTagRe:	/^<\/([^>\s]+)[^>]*>/,
	attrRe:		/([^=\s\/]+)(\s*=\s*((\"([^"]*)\")|(\'([^']*)\')|[^>\s]+))?/g,

	/**
	 * Parses `s` and reports comments, tags and text to the content handler.
	 *
	 * @param {string} s         Markup to parse.
	 * @param {Object} oHandler  Object with startElement(sTagName, oAttrs),
	 *     endElement(sTagName), characters(s) and comment(s) methods.
	 *     When omitted, the handler of a previous parse() call is reused.
	 */
	parse:	function (s, oHandler)
	{
		if (oHandler)
			this.contentHandler = oHandler;

		var match, index, treatAsChars;
		while (s.length > 0)
		{
			// Anything not recognised as markup below is emitted as text
			treatAsChars = true;

			// Comment
			if (s.substring(0, 4) == "<!--")
			{
				index = s.indexOf("-->");
				if (index != -1)
				{
					this.contentHandler.comment(s.substring(4, index));
					s = s.substring(index + 3);
					treatAsChars = false;
				}
			}

			// end tag
			else if (s.substring(0, 2) == "</")
			{
				match = this.endTagRe.exec(s);
				// index check guards against a replaced, unanchored regexp
				if (match && match.index === 0)
				{
					this.parseEndTag.apply(this, match);
					s = s.substring(match[0].length);
					treatAsChars = false;
				}
			}

			// start tag
			else if (s.charAt(0) == "<")
			{
				match = this.startTagRe.exec(s);
				if (match && match.index === 0)
				{
					this.parseStartTag.apply(this, match);
					s = s.substring(match[0].length);
					treatAsChars = false;
				}
			}

			if (treatAsChars)
			{
				// A leading "<" that opened no valid markup is plain text:
				// search from the next character so the loop always
				// consumes input instead of spinning forever.
				index = s.indexOf("<", s.charAt(0) == "<" ? 1 : 0);
				if (index == -1)
				{
					this.contentHandler.characters(s);
					s = "";
				}
				else
				{
					this.contentHandler.characters(s.substring(0, index));
					s = s.substring(index);
				}
			}
		}
	},

	// Reports a start tag; sRest is the raw attribute text of the tag.
	parseStartTag:	function (sTag, sTagName, sRest)
	{
		var attrs = this.parseAttributes(sTagName, sRest);
		this.contentHandler.startElement(sTagName, attrs);
	},

	// Reports an end tag.
	parseEndTag:	function (sTag, sTagName)
	{
		this.contentHandler.endElement(sTagName);
	},

	// Splits the attribute text `s` into an array of {name, value} objects.
	parseAttributes:	function (sTagName, s)
	{
		var oThis = this;
		var attrs = [];
		s.replace(this.attrRe, function ()
		{
			// Forward every capture group; group 7 (the text inside single
			// quotes) must reach parseAttribute as arguments[8].
			var args = [sTagName].concat(Array.prototype.slice.call(arguments));
			attrs.push(oThis.parseAttribute.apply(oThis, args));
		});
		return attrs;
	},

	// Builds one {name, value} object from the attrRe capture groups, which
	// arrive shifted by one because sTagName is passed first:
	//   arguments[3] "=value" part          arguments[4] raw value
	//   arguments[5] "double quoted" value  arguments[6] its inner text
	//   arguments[7] 'single quoted' value  arguments[8] its inner text
	// value is null for an attribute written without "=".
	parseAttribute: function (sTagName, sAttribute, sName)
	{
		var value = "";
		if (arguments[7])
			value = arguments[8];
		else if (arguments[5])
			value = arguments[6];
		else if (arguments[3])
			value = arguments[4];

		var empty = !value && !arguments[3];
		return {name: sName, value: empty ? null : value};
	}
};
让我再折腾折腾……待续