htmlparser

今天一直在看两个纯js的parser,天哪我还是水平不够,继续努力。

第一个是 John Resig 写的比较完善的版本

 

  1 /*

  2  * HTML Parser By John Resig (ejohn.org)

  3  * Original code by Erik Arvidsson, Mozilla Public License

  4  * http://erik.eae.net/simplehtmlparser/simplehtmlparser.js

  5  *

  6  * // Use like so:

  7  * HTMLParser(htmlString, {

  8  *     start: function(tag, attrs, unary) {},

  9  *     end: function(tag) {},

 10  *     chars: function(text) {},

 11  *     comment: function(text) {}

 12  * });

 13  *

 14  * // or to get an XML string:

 15  * HTMLtoXML(htmlString);

 16  *

 17  * // or to get an XML DOM Document

 18  * HTMLtoDOM(htmlString);

 19  *

 20  * // or to inject into an existing document/DOM node

 21  * HTMLtoDOM(htmlString, document);

 22  * HTMLtoDOM(htmlString, document.body);

 23  *

 24  */

 25 

 26 (function(){

 27 

 28     // Regular Expressions for parsing tags and attributes

 29     var startTag = /^<([-A-Za-z0-9_]+)((?:\s+\w+(?:\s*=\s*(?:(?:"[^"]*")|(?:'[^']*')|[^>\s]+))?)*)\s*(\/?)>/,

 30         endTag = /^<\/([-A-Za-z0-9_]+)[^>]*>/,

 31         attr = /([-A-Za-z0-9_]+)(?:\s*=\s*(?:(?:"((?:\\.|[^"])*)")|(?:'((?:\\.|[^'])*)')|([^>\s]+)))?/g;

 32         

 33     // Empty Elements - HTML 4.01

 34     var empty = makeMap("area,base,basefont,br,col,frame,hr,img,input,isindex,link,meta,param,embed");

 35 

 36     // Block Elements - HTML 4.01

 37     var block = makeMap("address,applet,blockquote,button,center,dd,del,dir,div,dl,dt,fieldset,form,frameset,hr,iframe,ins,isindex,li,map,menu,noframes,noscript,object,ol,p,pre,script,table,tbody,td,tfoot,th,thead,tr,ul");

 38 

 39     // Inline Elements - HTML 4.01

 40     var inline = makeMap("a,abbr,acronym,applet,b,basefont,bdo,big,br,button,cite,code,del,dfn,em,font,i,iframe,img,input,ins,kbd,label,map,object,q,s,samp,script,select,small,span,strike,strong,sub,sup,textarea,tt,u,var");

 41 

 42     // Elements that you can, intentionally, leave open

 43     // (and which close themselves)

 44     var closeSelf = makeMap("colgroup,dd,dt,li,options,p,td,tfoot,th,thead,tr");

 45 

 46     // Attributes that have their values filled in disabled="disabled"

 47     var fillAttrs = makeMap("checked,compact,declare,defer,disabled,ismap,multiple,nohref,noresize,noshade,nowrap,readonly,selected");

 48 

 49     // Special Elements (can contain anything)

 50     var special = makeMap("script,style");

 51 

 52     var HTMLParser = this.HTMLParser = function( html, handler ) {

 53         var index, chars, match, stack = [], last = html;

 54         stack.last = function(){

 55             return this[ this.length - 1 ];

 56         };

 57 

 58         while ( html ) {

 59             chars = true;

 60 

 61             // Make sure we're not in a script or style element

 62             if ( !stack.last() || !special[ stack.last() ] ) {

 63 

 64                 // Comment

 65                 if ( html.indexOf("<!--") == 0 ) {

 66                     index = html.indexOf("-->");

 67     

 68                     if ( index >= 0 ) {

 69                         if ( handler.comment )

 70                             handler.comment( html.substring( 4, index ) );

 71                         html = html.substring( index + 3 );

 72                         chars = false;

 73                     }

 74     

 75                 // end tag

 76                 } else if ( html.indexOf("</") == 0 ) {

 77                     match = html.match( endTag );

 78     

 79                     if ( match ) {

 80                         html = html.substring( match[0].length );

 81                         match[0].replace( endTag, parseEndTag );

 82                         chars = false;

 83                     }

 84     

 85                 // start tag

 86                 } else if ( html.indexOf("<") == 0 ) {

 87                     match = html.match( startTag );

 88     

 89                     if ( match ) {

 90                         html = html.substring( match[0].length );

 91                         match[0].replace( startTag, parseStartTag );

 92                         chars = false;

 93                     }

 94                 }

 95 

 96                 if ( chars ) {

 97                     index = html.indexOf("<");

 98                     

 99                     var text = index < 0 ? html : html.substring( 0, index );

100                     html = index < 0 ? "" : html.substring( index );

101                     

102                     if ( handler.chars )

103                         handler.chars( text );

104                 }

105 

106             } else {

107                 html = html.replace(new RegExp("(.*)<\/" + stack.last() + "[^>]*>"), function(all, text){

108                     text = text.replace(/<!--(.*?)-->/g, "$1")

109                         .replace(/<!\[CDATA\[(.*?)]]>/g, "$1");

110 

111                     if ( handler.chars )

112                         handler.chars( text );

113 

114                     return "";

115                 });

116 

117                 parseEndTag( "", stack.last() );

118             }

119 

120             if ( html == last )

121                 throw "Parse Error: " + html;

122             last = html;

123         }

124         

125         // Clean up any remaining tags

126         parseEndTag();

127 

128         function parseStartTag( tag, tagName, rest, unary ) {

129             tagName = tagName.toLowerCase();

130 

131             if ( block[ tagName ] ) {

132                 while ( stack.last() && inline[ stack.last() ] ) {

133                     parseEndTag( "", stack.last() );

134                 }

135             }

136 

137             if ( closeSelf[ tagName ] && stack.last() == tagName ) {

138                 parseEndTag( "", tagName );

139             }

140 

141             unary = empty[ tagName ] || !!unary;

142 

143             if ( !unary )

144                 stack.push( tagName );

145             

146             if ( handler.start ) {

147                 var attrs = [];

148     

149                 rest.replace(attr, function(match, name) {

150                     var value = arguments[2] ? arguments[2] :

151                         arguments[3] ? arguments[3] :

152                         arguments[4] ? arguments[4] :

153                         fillAttrs[name] ? name : "";

154                     

155                     attrs.push({

156                         name: name,

157                         value: value,

158                         escaped: value.replace(/(^|[^\\])"/g, '$1\\\"') //"

159                     });

160                 });

161     

162                 if ( handler.start )

163                     handler.start( tagName, attrs, unary );

164             }

165         }

166 

167         function parseEndTag( tag, tagName ) {

168             // If no tag name is provided, clean shop

169             if ( !tagName )

170                 var pos = 0;

171                 

172             // Find the closest opened tag of the same type

173             else

174                 for ( var pos = stack.length - 1; pos >= 0; pos-- )

175                     if ( stack[ pos ] == tagName )

176                         break;

177             

178             if ( pos >= 0 ) {

179                 // Close all the open elements, up the stack

180                 for ( var i = stack.length - 1; i >= pos; i-- )

181                     if ( handler.end )

182                         handler.end( stack[ i ] );

183                 

184                 // Remove the open elements from the stack

185                 stack.length = pos;

186             }

187         }

188     };

189     

190     this.HTMLtoXML = function( html ) {

191         var results = "";

192         

193         HTMLParser(html, {

194             start: function( tag, attrs, unary ) {

195                 results += "<" + tag;

196         

197                 for ( var i = 0; i < attrs.length; i++ )

198                     results += " " + attrs[i].name + '="' + attrs[i].escaped + '"';

199         

200                 results += (unary ? "/" : "") + ">";

201             },

202             end: function( tag ) {

203                 results += "</" + tag + ">";

204             },

205             chars: function( text ) {

206                 results += text;

207             },

208             comment: function( text ) {

209                 results += "<!--" + text + "-->";

210             }

211         });

212         

213         return results;

214     };

215     

216     this.HTMLtoDOM = function( html, doc ) {

217         // There can be only one of these elements

218         var one = makeMap("html,head,body,title");

219         

220         // Enforce a structure for the document

221         var structure = {

222             link: "head",

223             base: "head"

224         };

225     

226         if ( !doc ) {

227             if ( typeof DOMDocument != "undefined" )

228                 doc = new DOMDocument();

229             else if ( typeof document != "undefined" && document.implementation && document.implementation.createDocument )

230                 doc = document.implementation.createDocument("", "", null);

231             else if ( typeof ActiveX != "undefined" )

232                 doc = new ActiveXObject("Msxml.DOMDocument");

233             

234         } else

235             doc = doc.ownerDocument ||

236                 doc.getOwnerDocument && doc.getOwnerDocument() ||

237                 doc;

238         

239         var elems = [],

240             documentElement = doc.documentElement ||

241                 doc.getDocumentElement && doc.getDocumentElement();

242                 

243         // If we're dealing with an empty document then we

244         // need to pre-populate it with the HTML document structure

245         if ( !documentElement && doc.createElement ) (function(){

246             var html = doc.createElement("html");

247             var head = doc.createElement("head");

248             head.appendChild( doc.createElement("title") );

249             html.appendChild( head );

250             html.appendChild( doc.createElement("body") );

251             doc.appendChild( html );

252         })();

253         

254         // Find all the unique elements

255         if ( doc.getElementsByTagName )

256             for ( var i in one )

257                 one[ i ] = doc.getElementsByTagName( i )[0];

258         

259         // If we're working with a document, inject contents into

260         // the body element

261         var curParentNode = one.body;

262         

263         HTMLParser( html, {

264             start: function( tagName, attrs, unary ) {

265                 // If it's a pre-built element, then we can ignore

266                 // its construction

267                 if ( one[ tagName ] ) {

268                     curParentNode = one[ tagName ];

269                     if ( !unary ) {

270                         elems.push( curParentNode );

271                     }

272                     return;

273                 }

274             

275                 var elem = doc.createElement( tagName );

276                 

277                 for ( var attr in attrs )

278                     elem.setAttribute( attrs[ attr ].name, attrs[ attr ].value );

279                 

280                 if ( structure[ tagName ] && typeof one[ structure[ tagName ] ] != "boolean" )

281                     one[ structure[ tagName ] ].appendChild( elem );

282                 

283                 else if ( curParentNode && curParentNode.appendChild )

284                     curParentNode.appendChild( elem );

285                     

286                 if ( !unary ) {

287                     elems.push( elem );

288                     curParentNode = elem;

289                 }

290             },

291             end: function( tag ) {

292                 elems.length -= 1;

293                 

294                 // Init the new parentNode

295                 curParentNode = elems[ elems.length - 1 ];

296             },

297             chars: function( text ) {

298                 curParentNode.appendChild( doc.createTextNode( text ) );

299             },

300             comment: function( text ) {

301                 // create comment node

302             }

303         });

304         

305         return doc;

306     };

307 

308     function makeMap(str){

309         var obj = {}, items = str.split(",");

310         for ( var i = 0; i < items.length; i++ )

311             obj[ items[i] ] = true;

312         return obj;

313     }

314 })();
htmlparser

 

用法:

// Usage example: re-serialize a fragment whose <p> and <i> are never closed.
var results = "";

HTMLParser("<p id=test>hello <i>world", {
  // Write the start tag with every attribute value quoted.
  start: function( tag, attrs, unary ) {
    results += "<" + tag;

    for ( var i = 0; i < attrs.length; i++ )
      results += " " + attrs[i].name + '="' + attrs[i].escaped + '"';

    results += (unary ? "/" : "") + ">";
  },
  // Called for explicit end tags and for elements the parser closes itself.
  end: function( tag ) {
    results += "</" + tag + ">";
  },
  chars: function( text ) {
    results += text;
  },
  comment: function( text ) {
    results += "<!--" + text + "-->";
  }
});

// The parser closes the dangling <i> and <p> once the input is exhausted:
results == '<p id="test">hello <i>world</i></p>';

然后John提到他是在 Erik Arvidsson 的基础上做的,

又去看了erik的版本

  1 // Copyright 2004 Erik Arvidsson. All Rights Reserved.

  2 //

  3 // This code is triple licensed using Apache Software License 2.0,

  4 // Mozilla Public License or GNU Public License

  5 //

  6 ///////////////////////////////////////////////////////////////////////////////

  7 //

  8 // Licensed under the Apache License, Version 2.0 (the "License"); you may not

  9 // use this file except in compliance with the License.  You may obtain a copy

 10 // of the License at http://www.apache.org/licenses/LICENSE-2.0

 11 //

 12 ///////////////////////////////////////////////////////////////////////////////

 13 //

 14 // The contents of this file are subject to the Mozilla Public License

 15 // Version 1.1 (the "License"); you may not use this file except in

 16 // compliance with the License. You may obtain a copy of the License at

 17 // http://www.mozilla.org/MPL/

 18 //

 19 // Software distributed under the License is distributed on an "AS IS"

 20 // basis, WITHOUT WARRANTY OF ANY KIND, either express or implied. See the

 21 // License for the specific language governing rights and limitations

 22 // under the License.

 23 //

 24 // The Original Code is Simple HTML Parser.

 25 //

 26 // The Initial Developer of the Original Code is Erik Arvidsson.

 27 // Portions created by Erik Arvidssson are Copyright (C) 2004. All Rights

 28 // Reserved.

 29 //

 30 ///////////////////////////////////////////////////////////////////////////////

 31 //

 32 // This program is free software; you can redistribute it and/or

 33 // modify it under the terms of the GNU General Public License

 34 // as published by the Free Software Foundation; either version 2

 35 // of the License, or (at your option) any later version.

 36 //

 37 // This program is distributed in the hope that it will be useful,

 38 // but WITHOUT ANY WARRANTY; without even the implied warranty of

 39 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the

 40 // GNU General Public License for more details.

 41 //

 42 // You should have received a copy of the GNU General Public License

 43 // along with this program; if not, write to the Free Software

 44 // Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.

 45 //

 46 ///////////////////////////////////////////////////////////////////////////////

 47 

 48 /*

 49 var handler ={

 50     startElement:   function (sTagName, oAttrs) {},

 51     endElement:     function (sTagName) {},

 52     characters:        function (s) {},

 53     comment:        function (s) {}

 54 };

 55 */

 56 

 57 function SimpleHtmlParser()

 58 {

 59 }

 60 

 61 SimpleHtmlParser.prototype = {

 62 

 63     handler:    null,

 64 

 65     // regexps

 66 

 67     startTagRe:    /^<([^>\s\/]+)((\s+[^=>\s]+(\s*=\s*((\"[^"]*\")|(\'[^']*\')|[^>\s]+))?)*)\s*\/?\s*>/m,

 68     endTagRe:    /^<\/([^>\s]+)[^>]*>/m,

 69     attrRe:        /([^=\s]+)(\s*=\s*((\"([^"]*)\")|(\'([^']*)\')|[^>\s]+))?/gm,

 70 

 71     parse:    function (s, oHandler)

 72     {

 73         if (oHandler)

 74             this.contentHandler = oHandler;

 75 

 76         var i = 0;

 77         var res, lc, lm, rc, index;

 78         var treatAsChars = false;

 79         var oThis = this;

 80         while (s.length > 0)

 81         {

 82             // Comment

 83             if (s.substring(0, 4) == "<!--")

 84             {

 85                 index = s.indexOf("-->");

 86                 if (index != -1)

 87                 {

 88                     this.contentHandler.comment(s.substring(4, index));

 89                     s = s.substring(index + 3);

 90                     treatAsChars = false;

 91                 }

 92                 else

 93                 {

 94                     treatAsChars = true;

 95                 }

 96             }

 97 

 98             // end tag

 99             else if (s.substring(0, 2) == "</")

100             {

101                 if (this.endTagRe.test(s))

102                 {

103                     lc = RegExp.leftContext;

104                     lm = RegExp.lastMatch;

105                     rc = RegExp.rightContext;

106 

107                     lm.replace(this.endTagRe, function ()

108                     {

109                         return oThis.parseEndTag.apply(oThis, arguments);

110                     });

111 

112                     s = rc;

113                     treatAsChars = false;

114                 }

115                 else

116                 {

117                     treatAsChars = true;

118                 }

119             }

120             // start tag

121             else if (s.charAt(0) == "<")

122             {

123                 if (this.startTagRe.test(s))

124                 {

125                     lc = RegExp.leftContext;

126                     lm = RegExp.lastMatch;

127                     rc = RegExp.rightContext;

128 

129                     lm.replace(this.startTagRe, function ()

130                     {

131                         return oThis.parseStartTag.apply(oThis, arguments);

132                     });

133 

134                     s = rc;

135                     treatAsChars = false;

136                 }

137                 else

138                 {

139                     treatAsChars = true;

140                 }

141             }

142 

143             if (treatAsChars)

144             {

145                 index = s.indexOf("<");

146                 if (index == -1)

147                 {

148                      this.contentHandler.characters(s);

149                     s = "";

150                 }

151                 else

152                 {

153                     this.contentHandler.characters(s.substring(0, index));

154                     s = s.substring(index);

155                 }

156             }

157 

158             treatAsChars = true;

159         }

160     },

161 

162     parseStartTag:    function (sTag, sTagName, sRest)

163     {

164         var attrs = this.parseAttributes(sTagName, sRest);

165         this.contentHandler.startElement(sTagName, attrs);

166     },

167 

168     parseEndTag:    function (sTag, sTagName)

169     {

170         this.contentHandler.endElement(sTagName);

171     },

172 

173     parseAttributes:    function (sTagName, s)

174     {

175         var oThis = this;

176         var attrs = [];

177         s.replace(this.attrRe, function (a0, a1, a2, a3, a4, a5, a6)

178         {

179             attrs.push(oThis.parseAttribute(sTagName, a0, a1, a2, a3, a4, a5, a6));

180         });

181         return attrs;

182     },

183 

184     parseAttribute: function (sTagName, sAttribute, sName)

185     {

186         var value = "";

187         if (arguments[7])

188             value = arguments[8];

189         else if (arguments[5])

190             value = arguments[6];

191         else if (arguments[3])

192             value = arguments[4];

193 

194         var empty = !value && !arguments[3];

195         return {name: sName, value: empty ? null : value};

196     }

197 };
erikparser

让我再折腾折腾……待续

 

你可能感兴趣的:(HtmlParser)