问题描述:使用 htmlparser 的 Lexer 解析器解析页面时,发现类似如下的页面会出问题:
<script> for(i=0;i<a;i++){ } </script> 解析后代码变成了: <script> for(i=0;i<a;i++){ } ></script>
通过lexer代码发现,实际上只要js代码改成:
<script> <!-- for(i=0;i<a;i++){ } --> </script>
就不会有问题了。阅读代码后发现,它的解析逻辑本身其实没有问题,主要是我们平时的页面写得不够规范:它在解析文本时,只要发现 < 后面紧跟一个字母,就认为这是一个 tag 的开始:
/**
 * Scans a run of plain text starting at the current cursor position and
 * returns it as a string node.
 *
 * Scanning stops at end of input, or when a '<' is followed by '/', a
 * letter, '!', '%' or '?'; in that case both characters are pushed back so
 * the next parse step sees them again.
 *
 * @param start       start position handed through to makeString
 * @param quotesmart  when true, single/double quoted literals and
 *                    JavaScript-style comments are tracked so that a '<'
 *                    inside a quoted literal, or anything inside a comment,
 *                    cannot end the text
 * @return the node produced by makeString(start, current cursor position)
 * @throws ParserException declared by the signature; presumably raised by
 *                         the underlying page reads
 */
protected Node parseString(int start, boolean quotesmart) throws ParserException {
    boolean done;
    char ch;
    char quote;
    done = false;
    quote = 0; // 0 means "not inside a quoted literal"
    while (!done) {
        ch = mPage.getCharacter(mCursor);
        if (Page.EOF == ch)
            done = true;
        else if (0x1b == ch) // escape
        {
            ch = mPage.getCharacter(mCursor);
            if (Page.EOF == ch)
                done = true;
            else if ('$' == ch) {
                ch = mPage.getCharacter(mCursor);
                if (Page.EOF == ch)
                    done = true;
                // JIS X 0208-1978 and JIS X 0208-1983
                else if ('@' == ch || 'B' == ch)
                    scanJIS(mCursor);
                /*
                 * Disabled in the original: JIS X 0212-1990 handling
                 * (ESC $ ( D), which would read one more character, call
                 * scanJIS on 'D' and otherwise unget three characters.
                 */
                else {
                    // not a recognised sequence: push back both characters
                    // read after the escape
                    mPage.ungetCharacter(mCursor);
                    mPage.ungetCharacter(mCursor);
                }
            } else
                mPage.ungetCharacter(mCursor);
        } else if (quotesmart && (0 == quote) && (('\'' == ch) || ('"' == ch)))
            quote = ch; // enter quoted state
        // patch from Gernot Fricke to handle escaped closing quote
        else if (quotesmart && (0 != quote) && ('\\' == ch)) {
            ch = mPage.getCharacter(mCursor); // try to consume escape
            if ((Page.EOF != ch) && ('\\' != ch) // escaped backslash
                    && (ch != quote)) // escaped quote character (whichever of " or ' opened the literal)
                mPage.ungetCharacter(mCursor); // not an escape after all: unconsume the character
        } else if (quotesmart && (ch == quote))
            quote = 0; // exit quoted state
        else if (quotesmart && (0 == quote) && (ch == '/')) {
            // Skip "//" and "/* ... */" comments found in script text, so
            // that a quote character inside a comment such as
            //   // I can't handle single quotations.
            // does not switch the scanner into quoted state.
            ch = mPage.getCharacter(mCursor);
            if (Page.EOF == ch)
                done = true;
            else if ('/' == ch) {
                // line comment: consume up to and including the newline
                do
                    ch = mPage.getCharacter(mCursor);
                while ((Page.EOF != ch) && ('\n' != ch));
            } else if ('*' == ch) {
                // block comment: look for the closing "*/"
                do {
                    do
                        ch = mPage.getCharacter(mCursor);
                    while ((Page.EOF != ch) && ('*' != ch));
                    ch = mPage.getCharacter(mCursor);
                    if (ch == '*')
                        mPage.ungetCharacter(mCursor); // "**": re-examine the second '*'
                } while ((Page.EOF != ch) && ('/' != ch));
            } else
                mPage.ungetCharacter(mCursor); // a lone '/': not a comment
        } else if ((0 == quote) && ('<' == ch)) {
            ch = mPage.getCharacter(mCursor);
            if (Page.EOF == ch)
                done = true;
            // the order of these tests might be optimized for speed:
            else if ('/' == ch || Character.isLetter(ch) || '!' == ch || '%' == ch || '?' == ch) {
                // looks like the start of a tag: stop here and push back
                // both the '<' and the character after it
                done = true;
                mPage.ungetCharacter(mCursor);
                mPage.ungetCharacter(mCursor);
            } else {
                // it's not a tag, so keep going, but check for quotes
                mPage.ungetCharacter(mCursor);
            }
        }
    }
    return (makeString(start, mCursor.getPosition()));
}
因此为了解决这个问题,现在要在上面做一个手脚:
首先在类中增加了一个标记字段 script,用来记录当前是否处于 <script> 标签内部;
这个标记通过重写 nextNode 方法来维护:在返回节点之前,判断刚解析出的标签是 <script> 还是 </script>,并相应地设置或清除标记;
然后在parseString中修改其解析方法就可以了,下面是完整的代码:
import java.net.URLConnection; import org.htmlparser.Node; import org.htmlparser.lexer.Lexer; import org.htmlparser.lexer.Page; import org.htmlparser.nodes.TagNode; import org.htmlparser.util.ParserException; import org.slf4j.Logger; import org.slf4j.LoggerFactory; /** * @author edwardpro * */ public class LexerFixed extends Lexer { private static final Logger logger = LoggerFactory .getLogger(LexerFixed.class); /** * */ private static final long serialVersionUID = 8425806017089419815L; //script标签标记,如果发现当前在script里就掠过所有的< > private int script=0; /** * */ public LexerFixed() { super(); } /** * @param page */ public LexerFixed(Page page) { super(page); } /** * @param text */ public LexerFixed(String text) { super(text); } /** * @param connection * @throws ParserException */ public LexerFixed(URLConnection connection) throws ParserException { super(connection); } @Override public Node nextNode(boolean quotesmart) throws ParserException { Node ret = super.nextNode(quotesmart); checkTag(ret); return (ret); } /** * checkTag用于修改tagNode的方法当有入参数时都会进行一次参数修正另外對內容進行一下escape操作並且會進行判斷是否存在已經escape的蹟象 * * @param node */ private void checkTag(Node node) { if (node != null && node instanceof TagNode && !((TagNode) node).isEmptyXmlTag()) { String tagName = ((TagNode) node).getTagName(); if("SCRIPT".equalsIgnoreCase(tagName)){ if (!((TagNode) node).isEndTag() ) { this.script=1; } else{ this.script=0; } } } } @Override protected Node parseString(int start, boolean quotesmart) throws ParserException { boolean done; char ch; char quote; done = false; quote = 0; while (!done) { ch = mPage.getCharacter(mCursor); if (Page.EOF == ch) done = true; else if (0x1b == ch) // escape { ch = mPage.getCharacter(mCursor); if (Page.EOF == ch) done = true; else if ('$' == ch) { ch = mPage.getCharacter(mCursor); if (Page.EOF == ch) done = true; // JIS X 0208-1978 and JIS X 0208-1983 else if ('@' == ch || 'B' == ch) scanJIS(mCursor); /* * // JIS X 0212-1990 else if ('(' == ch) { ch = * mPage.getCharacter 
(mCursor); if (Page.EOF == ch) done = * true; else if ('D' == ch) scanJIS (mCursor); else { * mPage.ungetCharacter (mCursor); mPage.ungetCharacter * (mCursor); mPage.ungetCharacter (mCursor); } } */ else { mPage.ungetCharacter(mCursor); mPage.ungetCharacter(mCursor); } } else mPage.ungetCharacter(mCursor); } else if (quotesmart && (0 == quote) && (('\'' == ch) || ('"' == ch))) quote = ch; // enter quoted state // patch from Gernot Fricke to handle escaped closing quote else if (quotesmart && (0 != quote) && ('\\' == ch)) { ch = mPage.getCharacter(mCursor); // try to consume escape if ((Page.EOF != ch) && ('\\' != ch) // escaped backslash && (ch != quote)) // escaped quote character // ( reflects ["] or ['] whichever opened the quotation) mPage.ungetCharacter(mCursor); // unconsume char if char not // an escape } else if (quotesmart && (ch == quote)) quote = 0; // exit quoted state else if (quotesmart && (0 == quote) && (ch == '/')) { // handle multiline and double slash comments (with a quote) // in script like: // I can't handle single quotations. ch = mPage.getCharacter(mCursor); if (Page.EOF == ch) done = true; else if ('/' == ch) { do ch = mPage.getCharacter(mCursor); while ((Page.EOF != ch) && ('\n' != ch)); } else if ('*' == ch) { do { do ch = mPage.getCharacter(mCursor); while ((Page.EOF != ch) && ('*' != ch)); ch = mPage.getCharacter(mCursor); if (ch == '*') mPage.ungetCharacter(mCursor); } while ((Page.EOF != ch) && ('/' != ch)); } else mPage.ungetCharacter(mCursor); } else if ((0 == quote) && ('<' == ch)) { ch = mPage.getCharacter(mCursor); if (Page.EOF == ch) done = true; // the order of these tests might be optimized for speed: else if ('/' == ch || (Character.isLetter(ch) && this.script==0) || '!' == ch || '%' == ch || '?' 
== ch) { done = true; mPage.ungetCharacter(mCursor); mPage.ungetCharacter(mCursor); } else { // it's not a tag, so keep going, but check for quotes mPage.ungetCharacter(mCursor); } } } return (makeString(start, mCursor.getPosition())); } }