需要 htmlparser.jar(HTML Parser 库)。代码如下:
/**
 * Scrapes link entries from a listing page using the HTML Parser library (htmlparser.jar).
 *
 * <p>Every node whose raw text contains {@code div class="box"} is re-parsed on its own, and each
 * anchor inside it whose link text is non-empty and does not contain {@code '['} is turned into a
 * {@code UrlEntity} of type {@code "Yellow"}.
 */
public class GetData {

    /** Character encoding applied to the fetched page and to the re-parsed fragments. */
    private static final String PAGE_ENCODING = "gb2312";

    /**
     * Marks the container nodes of interest. Similar patterns such as {@code div id="box"} or
     * {@code ul class="box"} could be substituted here. Compiled once instead of once per node.
     */
    private static final Pattern BOX_PATTERN = Pattern.compile("div class=\"box\"");

    /** Value stored in {@code UrlEntity.type} for every collected entry. */
    private static final String ENTRY_TYPE = "Yellow";

    /**
     * Demo entry point: runs the scraper against a fixed listing URL and discards the result.
     *
     * @param args unused
     * @throws ParserException declared for backward compatibility; {@link #getYellowList(String)}
     *     handles parser failures itself
     */
    public static void main(String[] args) throws ParserException {
        getYellowList("http://www.71hy.com/hy-34-p1.html");
    }

    /**
     * Tells whether {@code regex} matches anywhere inside {@code txt}.
     *
     * @param txt the text to search; {@code null} is treated as "no match"
     * @param regex the regular expression to look for
     * @return {@code true} if at least one match is found, {@code false} otherwise
     */
    public static boolean getStringsByRegex(String txt, String regex) {
        // Compile first so that an invalid regex is still reported even when txt is null.
        Pattern pattern = Pattern.compile(regex);
        if (txt == null) {
            return false;
        }
        Matcher matcher = pattern.matcher(txt);
        return matcher.find();
    }

    /**
     * Parses the page at {@code url} and collects the links found inside matching box nodes.
     *
     * <p>Parser failures are reported on {@code System.err} rather than thrown; whatever was
     * collected before the failure is still returned.
     *
     * @param url address of the page to parse
     * @return the collected entries, possibly empty, never {@code null}
     */
    public static ArrayList<UrlEntity> getYellowList(String url) {
        final ArrayList<UrlEntity> result = new ArrayList<UrlEntity>();
        try {
            Parser parser = new Parser(url);
            parser.setEncoding(PAGE_ENCODING); // set the page encoding
            // The filter fills `result` as a side effect; the NodeList returned by parse()
            // is not needed and is therefore ignored.
            parser.parse(new NodeFilter() {
                @Override
                public boolean accept(Node node) {
                    String text = node.getText();
                    if (text == null || !BOX_PATTERN.matcher(text).find()) {
                        return false;
                    }
                    try {
                        collectLinks(node.toHtml(), result);
                    } catch (ParserException e) {
                        // Best effort: one unparsable fragment must not abort the whole page.
                        System.err.println("GetData: failed to extract links from a matched node: " + e);
                    }
                    return true;
                }
            });
        } catch (ParserException e) {
            System.err.println("GetData: failed to parse " + url + ": " + e);
        }
        return result;
    }

    /**
     * Re-parses an HTML fragment and appends one entry per usable anchor to {@code result}.
     *
     * @param html markup of the matched container node
     * @param result list that receives the new entries
     * @throws ParserException if the fragment cannot be parsed
     */
    private static void collectLinks(String html, ArrayList<UrlEntity> result) throws ParserException {
        Parser fragmentParser = Parser.createParser(html, PAGE_ENCODING);
        // Keep only the A tags found inside the container.
        NodeList anchors = fragmentParser.extractAllNodesThatMatch(new TagNameFilter("A"));
        for (int i = 0; i < anchors.size(); i++) {
            Node candidate = anchors.elementAt(i);
            if (!(candidate instanceof LinkTag)) {
                // Guard the cast so an unexpected node type cannot abort the traversal.
                continue;
            }
            LinkTag link = (LinkTag) candidate;
            String description = link.getLinkText(); // text content of the anchor
            if (description == null || description.isEmpty() || description.indexOf('[') != -1) {
                continue;
            }
            UrlEntity entity = new UrlEntity();
            entity.url = link.getLink(); // href of the anchor
            entity.title = link.getAttribute("title"); // attribute lookup by name
            entity.description = description;
            entity.type = ENTRY_TYPE;
            result.add(entity);
        }
    }
}