A brief overview of HTML Parser: it is one of the more active projects on SourceForge.net, and the latest release is 1.6 (1.6 is also what I run on my own NBA site). It is a fast, real-time parser for analyzing existing HTML, and in practice you will be impressed by how many thoughtful touches it provides. Its main uses are:
text extraction
link extraction, for example automatically wrapping the link text on a page in anchor tags
resource extraction, for example handling resources such as images and audio
link checking, to verify that the links in an HTML page are valid
monitoring page content
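To give a feel for the basic 1.6 API before my own class, here is a minimal link-extraction sketch; the URL and the GBK encoding are placeholders, so adjust them to whatever page you actually want to parse:

import org.htmlparser.Parser;
import org.htmlparser.filters.NodeClassFilter;
import org.htmlparser.tags.LinkTag;
import org.htmlparser.util.NodeList;
import org.htmlparser.util.ParserException;

public class LinkDemo {
    public static void main(String[] args) throws ParserException {
        // the Parser constructor accepts a URL or a local file path (placeholder here)
        Parser parser = new Parser("http://www.example.com/");
        parser.setEncoding("GBK"); // placeholder encoding, match it to the target page
        // keep only <a> tags, then print each href with its anchor text
        NodeList links = parser.parse(new NodeClassFilter(LinkTag.class));
        for (int i = 0; i < links.size(); i++) {
            LinkTag link = (LinkTag) links.elementAt(i);
            System.out.println(link.getLink() + " -> " + link.getLinkText());
        }
    }
}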
Heh, enough talk :) On to the code: the BaseAction class I use on my site.
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.apache.log4j.Logger;

import org.htmlparser.Node;
import org.htmlparser.NodeFilter;
import org.htmlparser.Parser;
import org.htmlparser.nodes.TextNode;
import org.htmlparser.tags.LinkTag;
import org.htmlparser.util.NodeList;
import org.htmlparser.util.ParserException;

// Constent and HttpParserUtil below are my own helper classes (an encoding
// constant and a trimmed-empty check); they are not shown in this listing.
public class BaseAction {
public static final Logger logger = Logger.getLogger(BaseAction.class);
public String keyWords = "姚明|姚明NBA";
public static NodeList getAllNodeList(String urlOrfile, NodeFilter filter) {
if (logger.isDebugEnabled())
logger.debug("BaseAction getAllNodeList(" + urlOrfile + ")");
Parser parser;
try {
parser = new Parser(urlOrfile);
parser.setEncoding(Constent.Encode);
NodeList list = parser.parse(filter);
return list;
} catch (ParserException e) {
// TODO Auto-generated catch block
e.printStackTrace();
return null;
}
}
/**
* Extracts LinkTag and TextNode entries; the returned list holds the href values and any non-empty text
*
* @param file
* @param filter
* @return
*/
public List<String> parseLink(String file, NodeFilter filter) {
if (logger.isDebugEnabled())
logger.debug("BaseAction parseLink(" + file + ")");
List<String> hrefList = new ArrayList<String>();
try {
NodeList nodelist = getAllNodeList(file, filter);
if(nodelist==null)
return null;
Node[] nodes = nodelist.toNodeArray();
String line = "";
for (int i = 0; i < nodes.length; i++) {
Node node = nodes[i];
if (node instanceof TextNode) {
TextNode textnode = (TextNode) node;
line = textnode.getText();
logger.debug("textnode=" + line);
} else if (node instanceof LinkTag) {
LinkTag link = (LinkTag) node;
line = link.getLink();
logger.debug("link=" + line);
}
if (HttpParserUtil.isTrimEmpty(line))
continue;
hrefList.add(line);
}
} catch (Exception e) {
// TODO: handle exception
e.printStackTrace();
}
return hrefList;
}
/**
* Extracts LinkTag and TextNode entries and pairs each href that matches one of the URL patterns with the text entry that follows it
*
* @param file
* @param filter
* @param pHtml
* @param pPhp
* @return
*/
public Map<String, String> parseLinkWithText(String file,
NodeFilter filter, Pattern pHtml, Pattern pPhp) {
if (logger.isDebugEnabled())
logger.debug("SinaAction parseLinkWithText(" + file + ")");
Map<String, String> map = new HashMap<String, String>();
List<String> list = new ArrayList<String>();
try {
NodeList nodelist = getAllNodeList(file, filter);
if(nodelist==null)
return null;
Node[] nodes = nodelist.toNodeArray();
String line = "";
for (int i = 0; i < nodes.length; i++) {
Node node = nodes[i];
if (node instanceof TextNode) {
TextNode textnode = (TextNode) node;
line = textnode.getText();
if (HttpParserUtil.isTrimEmpty(line))
continue;
if (logger.isDebugEnabled())
logger.debug("textnode=" + line);
list.add(line);
} else if (node instanceof LinkTag) {
LinkTag link = (LinkTag) node;
line = link.getLink();
if (HttpParserUtil.isTrimEmpty(line))
continue;
if (logger.isDebugEnabled())
logger.debug("link=" + line);
list.add(line);
}
}
int endPosition = list.size();
for (int i = 0; i < endPosition; i++) {
String curr = list.get(i);
Matcher mHtml = pHtml.matcher(curr);
Matcher mPhp = pPhp.matcher(curr);
// an entry matching one of the URL patterns, followed by an entry
// that does not match, is treated as a link/title pair
if ((mHtml.matches() || mPhp.matches()) && i < (endPosition - 1)) {
String next = list.get(i + 1);
Matcher mHtmlNext = pHtml.matcher(next);
Matcher mPhpNext = pPhp.matcher(next);
if ((mHtml.matches() && !mHtmlNext.matches())
|| (mPhp.matches() && !mPhpNext.matches())) {
map.put(curr, next);
i++; // skip the entry just consumed as the title
}
}
}
} catch (Exception e) {
// TODO: handle exception
e.printStackTrace();
}
return map;
}
/**
* Parses the content of the node list
*
* @param list
* @return
*/
public String parserContent(NodeList list) {
return parserContent(list, false);
}
public String parserContent(NodeList list, boolean isCreateFile) {
return parserContent(list,isCreateFile,list.size()+1);
}
public String parserContent(NodeList list, int listIndex) {
return parserContent(list,false,listIndex);
}
public String parserContent(NodeList list, boolean isCreateFile,int listIndex) {
if (logger.isDebugEnabled())
logger.debug("BaseAction parserContent()");
StringBuffer content = new StringBuffer();
if (list.size() < listIndex) { // listIndex is past the end, so the whole list is processed
for (int i = 0; i < list.size(); i++) {
Node node = list.elementAt(i);
NodeList sublist = node.getChildren();
if (sublist == null)
continue;
Node[] listNode = sublist.toNodeArray();
for (Node inNode : listNode) {
if (HttpParserUtil.isTrimEmpty(inNode.getText()))
continue;
logger.debug(inNode.toHtml());
content.append(inNode.toHtml());
if (isCreateFile)
content.append("\n");
}
}
}else{
Node node = list.elementAt(listIndex);
if (node == null){
logger.warn("the listIndex may is wrong! please do it");
return null;
}
NodeList sublist = node.getChildren();
if (sublist == null){
logger.warn("the listIndex may is wrong! please do it");
return null;
}
Node[] listNode = sublist.toNodeArray();
for (Node inNode : listNode) {
if (HttpParserUtil.isTrimEmpty(inNode.getText()))
continue;
logger.debug(inNode.toHtml());
content.append(inNode.toHtml());
if (isCreateFile)
content.append("\n");
}
}
return content.toString();
}
}
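Finally, a rough sketch of how the class above might be called. The URL, the filters, and the two patterns are placeholders, and it assumes the Constent and HttpParserUtil helpers referenced in the listing are on the classpath:

import java.util.List;
import java.util.Map;
import java.util.regex.Pattern;

import org.htmlparser.filters.NodeClassFilter;
import org.htmlparser.filters.OrFilter;
import org.htmlparser.nodes.TextNode;
import org.htmlparser.tags.LinkTag;

public class BaseActionDemo {
    public static void main(String[] args) {
        // placeholder URL; any URL or local file path accepted by Parser works here
        String url = "http://www.example.com/news/index.html";
        BaseAction action = new BaseAction();

        // 1) plain link extraction: keep only <a> tags
        List<String> links = action.parseLink(url, new NodeClassFilter(LinkTag.class));
        if (links != null) {
            for (String href : links) {
                System.out.println(href);
            }
        }

        // 2) link/title pairs: the filter has to let both links and text nodes through,
        //    and the two patterns (placeholders) decide which entries count as article links
        OrFilter linkOrText = new OrFilter(
                new NodeClassFilter(LinkTag.class),
                new NodeClassFilter(TextNode.class));
        Pattern pHtml = Pattern.compile(".*\\.html");
        Pattern pPhp = Pattern.compile(".*\\.php");
        Map<String, String> titledLinks = action.parseLinkWithText(url, linkOrText, pHtml, pPhp);
        if (titledLinks != null) {
            System.out.println(titledLinks);
        }
    }
}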