问题描述:使用 htmlparser 的 Lexer 解析器进行页面解析时,发现类似如下的页面会出现问题:
解析后代码变成了:
通过lexer代码发现,实际上只要js代码改成:
就不会有问题了。从代码来看,它的解析本身其实没有问题,主要是我们平时的页面写得不够规范:它在逐字符解析时,只要发现 < 后面紧跟一个字母,就认为这是一个 tag 的开始:
/**
 * Consumes characters from the page, starting at the current cursor, until
 * end of input or until a {@code <} that looks like the start of a tag, and
 * returns the consumed span as a string node.
 *
 * <p>A {@code <} ends the string when it is seen outside a quoted run and is
 * immediately followed by {@code /}, a letter, {@code !}, {@code %} or
 * {@code ?}. In that case both characters are pushed back so the cursor is
 * left on the {@code <}. This letter test is what makes script text such as
 * {@code a<b} look like a tag.
 *
 * @param start      passed through to {@code makeString} as the start of the node
 * @param quotesmart when true, single/double quoted runs, backslash escapes
 *                   inside them, and {@code //} / {@code /* *}{@code /}
 *                   comments are tracked so their contents are skipped over
 * @return the node built by {@code makeString(start, mCursor.getPosition())}
 * @throws ParserException declared by the signature; presumably raised by the
 *                         page/cursor helpers called here (confirm in Page)
 */
protected Node parseString(int start, boolean quotesmart)
throws ParserException {
boolean done;
char ch;
char quote;
done = false;
// 0 means "not inside a quoted run"; otherwise holds the opening quote char
quote = 0;
while (!done) {
ch = mPage.getCharacter(mCursor);
if (Page.EOF == ch)
done = true;
else if (0x1b == ch) // escape
{
// ESC may introduce an "ESC $ @" or "ESC $ B" sequence; look ahead
ch = mPage.getCharacter(mCursor);
if (Page.EOF == ch)
done = true;
else if ('$' == ch) {
ch = mPage.getCharacter(mCursor);
if (Page.EOF == ch)
done = true;
// JIS X 0208-1978 and JIS X 0208-1983
else if ('@' == ch || 'B' == ch)
// presumably skips the JIS-encoded run; confirm in scanJIS
scanJIS(mCursor);
/*
* // JIS X 0212-1990 else if ('(' == ch) { ch =
* mPage.getCharacter (mCursor); if (Page.EOF == ch) done =
* true; else if ('D' == ch) scanJIS (mCursor); else {
* mPage.ungetCharacter (mCursor); mPage.ungetCharacter
* (mCursor); mPage.ungetCharacter (mCursor); } }
*/
else {
// not a recognised sequence: push back the two look-ahead chars
mPage.ungetCharacter(mCursor);
mPage.ungetCharacter(mCursor);
}
} else
// ESC not followed by '$': push back the single look-ahead char
mPage.ungetCharacter(mCursor);
} else if (quotesmart && (0 == quote)
&& (('\'' == ch) || ('"' == ch)))
quote = ch; // enter quoted state
// patch from Gernot Fricke to handle escaped closing quote
else if (quotesmart && (0 != quote) && ('\\' == ch)) {
ch = mPage.getCharacter(mCursor); // try to consume escape
if ((Page.EOF != ch) && ('\\' != ch) // escaped backslash
&& (ch != quote)) // escaped quote character
// ( reflects ["] or ['] whichever opened the quotation)
mPage.ungetCharacter(mCursor); // unconsume char if char not
// an escape
} else if (quotesmart && (ch == quote))
quote = 0; // exit quoted state
else if (quotesmart && (0 == quote) && (ch == '/')) {
// Outside quotes a '/' may start a "//" or "/* */" comment; the
// comment body is consumed here so that quote characters inside it
// do not toggle the quoted state.
ch = mPage.getCharacter(mCursor);
if (Page.EOF == ch)
done = true;
else if ('/' == ch) {
// line comment: consume through end of line or EOF
do
ch = mPage.getCharacter(mCursor);
while ((Page.EOF != ch) && ('\n' != ch));
} else if ('*' == ch) {
// block comment: consume until a '*' directly followed by '/', or EOF
do {
do
ch = mPage.getCharacter(mCursor);
while ((Page.EOF != ch) && ('*' != ch));
ch = mPage.getCharacter(mCursor);
// a second '*' may itself precede the closing '/': re-examine it
if (ch == '*')
mPage.ungetCharacter(mCursor);
} while ((Page.EOF != ch) && ('/' != ch));
} else
// lone '/': push back the look-ahead char
mPage.ungetCharacter(mCursor);
} else if ((0 == quote) && ('<' == ch)) {
// possible tag start: decide from the character after '<'
ch = mPage.getCharacter(mCursor);
if (Page.EOF == ch)
done = true;
// the order of these tests might be optimized for speed:
else if ('/' == ch
|| Character.isLetter(ch)
|| '!' == ch || '%' == ch || '?' == ch) {
done = true;
// push back the look-ahead char and the '<' so the string ends before it
mPage.ungetCharacter(mCursor);
mPage.ungetCharacter(mCursor);
} else {
// it's not a tag, so keep going, but check for quotes
mPage.ungetCharacter(mCursor);
}
}
}
return (makeString(start, mCursor.getPosition()));
}
因此为了解决这个问题,现在要在上面做一个手脚:
首先在类中增加了一个名为 script 的标记字段,用来记录当前是否处于 <script> 标签内部。
这个标记是通过修改 nextNode 方法来维护的:在返回前判断刚解析出的标签是否是 <script> 标签(遇到开始标签时置位,遇到结束标签时清零)。
然后在 parseString 中修改对 < 后面字符的判断逻辑就可以了,下面是完整的代码:
import java.net.URLConnection;
import org.htmlparser.Node;
import org.htmlparser.lexer.Lexer;
import org.htmlparser.lexer.Page;
import org.htmlparser.nodes.TagNode;
import org.htmlparser.util.ParserException;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
/**
 * A {@link Lexer} that does not mistake {@code <} followed by a letter for the
 * start of a tag while it is inside a {@code <script>} element.
 *
 * <p>The stock lexer ends a string node at any {@code <} that is followed by a
 * letter, which splits script text such as {@code if (a<b)} into a string and
 * a bogus tag. This subclass remembers whether the most recently returned
 * {@code SCRIPT} tag was an opening or a closing tag and, while inside the
 * script, skips the letter test. {@code </}, {@code <!}, {@code <%} and
 * {@code <?} still end the string exactly as in the base class.
 *
 * <p>The script state is per-instance and is updated only by
 * {@link #nextNode(boolean)} and {@link #reset()}.
 *
 * @author edwardpro
 */
public class LexerFixed extends Lexer {
    private static final Logger logger = LoggerFactory
            .getLogger(LexerFixed.class);

    private static final long serialVersionUID = 8425806017089419815L;

    /** Tag name that switches the lexer into and out of script mode. */
    private static final String SCRIPT_TAG = "SCRIPT";

    /**
     * True between an opening and a closing {@code SCRIPT} tag; while set,
     * {@code <} followed by a letter is treated as ordinary text.
     */
    private boolean inScript;

    /**
     * Creates a lexer via the no-argument {@link Lexer} constructor.
     */
    public LexerFixed() {
        super();
    }

    /**
     * Creates a lexer over the given page.
     *
     * @param page the page to tokenize
     */
    public LexerFixed(Page page) {
        super(page);
    }

    /**
     * Creates a lexer over the given text.
     *
     * @param text the markup to tokenize
     */
    public LexerFixed(String text) {
        super(text);
    }

    /**
     * Creates a lexer over the content of the given connection.
     *
     * @param connection the connection to read the page from
     * @throws ParserException if the superclass constructor fails
     */
    public LexerFixed(URLConnection connection) throws ParserException {
        super(connection);
    }

    /**
     * Returns the next node from the superclass and updates the script state
     * from it before handing it back.
     *
     * @param quotesmart passed through to the superclass
     * @return the node produced by the superclass, possibly {@code null}
     * @throws ParserException if the superclass fails to produce a node
     */
    @Override
    public Node nextNode(boolean quotesmart) throws ParserException {
        Node ret = super.nextNode(quotesmart);
        checkTag(ret);
        return ret;
    }

    /**
     * Resets the lexer and clears the script state, so that a restart from the
     * beginning of the page is not lexed as if it were still inside a script.
     */
    @Override
    public void reset() {
        super.reset();
        inScript = false;
    }

    /**
     * Updates the script state from a freshly lexed node.
     *
     * <p>Only {@code SCRIPT} tags that are not empty XML tags change the
     * state: an opening tag enters script mode, a closing tag leaves it.
     * Every other node, including {@code null}, leaves the state untouched.
     *
     * @param node the node just returned by the superclass, may be {@code null}
     */
    private void checkTag(Node node) {
        // instanceof is false for null, so no separate null check is needed
        if (!(node instanceof TagNode)) {
            return;
        }
        TagNode tag = (TagNode) node;
        if (tag.isEmptyXmlTag()) {
            return;
        }
        if (SCRIPT_TAG.equalsIgnoreCase(tag.getTagName())) {
            inScript = !tag.isEndTag();
        }
    }

    /**
     * Consumes characters until end of input or until a {@code <} that looks
     * like the start of a tag, and returns the consumed span as a string node.
     *
     * <p>This mirrors the base-class logic with one change: while inside a
     * script, {@code <} followed by a letter does not end the string.
     *
     * @param start      passed through to {@code makeString} as the start of the node
     * @param quotesmart when true, quoted runs, backslash escapes inside them,
     *                   and {@code //} / block comments are tracked so their
     *                   contents are skipped over
     * @return the node built by {@code makeString(start, mCursor.getPosition())}
     * @throws ParserException declared by the overridden method
     */
    @Override
    protected Node parseString(int start, boolean quotesmart)
            throws ParserException {
        boolean done = false;
        // 0 means "not inside a quoted run"; otherwise the opening quote char
        char quote = 0;
        while (!done) {
            char ch = mPage.getCharacter(mCursor);
            if (Page.EOF == ch) {
                done = true;
            } else if (0x1b == ch) {
                // ESC may introduce "ESC $ @" or "ESC $ B" (JIS X 0208)
                ch = mPage.getCharacter(mCursor);
                if (Page.EOF == ch) {
                    done = true;
                } else if ('$' == ch) {
                    ch = mPage.getCharacter(mCursor);
                    if (Page.EOF == ch) {
                        done = true;
                    } else if ('@' == ch || 'B' == ch) {
                        // JIS X 0208-1978 and JIS X 0208-1983
                        scanJIS(mCursor);
                    } else {
                        // not a recognised sequence: push back both look-ahead chars
                        mPage.ungetCharacter(mCursor);
                        mPage.ungetCharacter(mCursor);
                    }
                } else {
                    mPage.ungetCharacter(mCursor);
                }
            } else if (quotesmart && (0 == quote)
                    && (('\'' == ch) || ('"' == ch))) {
                quote = ch; // enter quoted state
            } else if (quotesmart && (0 != quote) && ('\\' == ch)) {
                // Inside quotes a backslash may escape a backslash or the
                // opening quote; anything else is pushed back and re-read.
                ch = mPage.getCharacter(mCursor);
                if ((Page.EOF != ch) && ('\\' != ch) && (ch != quote)) {
                    mPage.ungetCharacter(mCursor);
                }
            } else if (quotesmart && (ch == quote)) {
                quote = 0; // exit quoted state
            } else if (quotesmart && (0 == quote) && (ch == '/')) {
                // Outside quotes a '/' may start a comment; its body is
                // consumed so quote characters inside it are ignored.
                ch = mPage.getCharacter(mCursor);
                if (Page.EOF == ch) {
                    done = true;
                } else if ('/' == ch) {
                    // line comment: consume through end of line or EOF
                    do {
                        ch = mPage.getCharacter(mCursor);
                    } while ((Page.EOF != ch) && ('\n' != ch));
                } else if ('*' == ch) {
                    // block comment: consume until '*' directly followed by '/', or EOF
                    do {
                        do {
                            ch = mPage.getCharacter(mCursor);
                        } while ((Page.EOF != ch) && ('*' != ch));
                        ch = mPage.getCharacter(mCursor);
                        if (ch == '*') {
                            // a second '*' may itself precede the closing '/'
                            mPage.ungetCharacter(mCursor);
                        }
                    } while ((Page.EOF != ch) && ('/' != ch));
                } else {
                    mPage.ungetCharacter(mCursor);
                }
            } else if ((0 == quote) && ('<' == ch)) {
                ch = mPage.getCharacter(mCursor);
                if (Page.EOF == ch) {
                    done = true;
                } else if ('/' == ch
                        || (Character.isLetter(ch) && !inScript)
                        || '!' == ch || '%' == ch || '?' == ch) {
                    // looks like a tag: stop, leaving the cursor on the '<'
                    done = true;
                    mPage.ungetCharacter(mCursor);
                    mPage.ungetCharacter(mCursor);
                } else {
                    // it's not a tag, so keep going, but check for quotes
                    mPage.ungetCharacter(mCursor);
                }
            }
        }
        return makeString(start, mCursor.getPosition());
    }
}