The previous article covered how to use HttpClient to simulate a login and fetch the content of a page at a given URL. Once we have the page content, we can use HtmlParser to pull out the data we actually care about.
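All of the snippets below are built on the org.htmlparser library; the robot (HttpClient wrapper), service (DAO), and logger objects they reference are fields of the surrounding class and are not shown here. As a rough orientation, a minimal, self-contained sketch of the basic pattern looks like this (the HTML fragment is made up; in practice it would be the page fetched with HttpClient):

import org.htmlparser.Parser;
import org.htmlparser.filters.NodeClassFilter;
import org.htmlparser.tags.TableColumn;
import org.htmlparser.tags.TableRow;
import org.htmlparser.tags.TableTag;
import org.htmlparser.util.NodeList;

public class TableDump {
    public static void main(String[] args) throws Exception {
        // Hypothetical fragment standing in for the page fetched with HttpClient
        String htmlStr = "<table><tr><td>a.example.com</td><td>1.2.3.4</td></tr></table>";
        Parser parser = Parser.createParser(htmlStr, "UTF-8");
        // Collect every <table> in the document
        NodeList tables = parser.parse(new NodeClassFilter(TableTag.class));
        for (int i = 0; i < tables.size(); i++) {
            TableTag table = (TableTag) tables.elementAt(i);
            for (TableRow row : table.getRows()) {
                for (TableColumn cell : row.getColumns()) {
                    System.out.println(cell.toPlainTextString().trim());
                }
            }
        }
    }
}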
1. Parsing table data out of the HTML and saving it to the database
// htmlStr is the page content fetched with HttpClient; querySql is the statement that
// checks for duplicate data (by hashcode); saveSql is the insert statement
public void parseData(String htmlStr, String querySql, String saveSql) {
    HashMap<String, Object> query = new HashMap<String, Object>();
    // String htmlStr = robot.readTextFile(path, "UTF-8");
    Parser myParser;
    NodeList nodeList = null;
    myParser = Parser.createParser(htmlStr, "UTF-8");
    // Filter for <table> tags
    NodeFilter tableFilter = new NodeClassFilter(TableTag.class);
    OrFilter lastFilter = new OrFilter();
    lastFilter.setPredicates(new NodeFilter[] { tableFilter });
    try {
        // All tables in the HTML
        nodeList = myParser.parse(lastFilter);
        StringBuffer hashStr = new StringBuffer();
        for (int i = 0; i < nodeList.size(); i++) {
            if (nodeList.elementAt(i) instanceof TableTag) {
                // The full content of this table
                TableTag tag = (TableTag) nodeList.elementAt(i);
                if (tag.getAttribute("style") != null) {
                    if (tag.getAttribute("style").equalsIgnoreCase("table-layout: fixed;")) {
                        // All rows of the table
                        TableRow[] rows = tag.getRows();
                        // j=0 is the first row, j=1 the second. Start from the second row because
                        // the first row is the header (normally you would start from the first row).
                        for (int j = 1; j < rows.length; j++) {
                            TableRow tr = (TableRow) rows[j];
                            // The cells of this row
                            TableColumn[] td = tr.getColumns();
                            for (int k = 0; k < td.length; k++) {
                                // The cell order matches the order on the page, e.g. content_url, content_ip, content_time
                                // logger.info(td[k].toPlainTextString().trim() + "\n");
                                hashStr.append(td[k].toPlainTextString().trim());
                                query.put("content_" + k, td[k].toPlainTextString().trim());
                                // If the cell contains a hyperlink, extract the relative path
                                if (td[k].getChildrenHTML().indexOf("<a href") != -1) {
                                    Pattern p = Pattern.compile("<a[^<>]*?\\shref=['\"]?(.*?)['\"]?\\s.*?>");
                                    Matcher m = p.matcher(td[k].getChildrenHTML().trim());
                                    if (m.find()) {
                                        query.put("child_url", m.group(1));
                                        // Save the virus detail record
                                        if (m.group(1).indexOf("vss_virus_report.action") != -1) {
                                            HashMap<String, Object> childquery = new HashMap<String, Object>();
                                            String virus_name = m.group(1).split("=")[1];
                                            // Skip the detail record if it already exists (dedup by hashcode)
                                            if (!repeatData(virus_name.hashCode(), "alarm.ipmanage.virusdetails.repeatdata")) {
                                                childquery.put("tid", GenerateSerial.getUUID());
                                                childquery.put("virus_name", virus_name);
                                                childquery.put("hashcode", virus_name.hashCode());
                                                String url = "http://124.238.214.79/platform/" + m.group(1).split("/")[1].replaceAll(" ", "%20");
                                                childquery.put("content", robot.get(url));
                                                service.saveObjects("alarm.ipmanage.savevirusdetails", childquery);
                                            }
                                        }
                                        // Detail page for a malware-hosting domain
                                        if (m.group(1).indexOf("websiteSecurity.action") != -1) {
                                            String url = "http://124.238.214.79/platform/pages/" + m.group(1);
                                            String htmlStrDetails = robot.get(url);
                                            if (htmlStrDetails != null && !htmlStrDetails.isEmpty()) {
                                                this.parseDomainData(htmlStrDetails, "alarm.ipmanage.domaindetails.repeatdata", "alarm.ipmanage.savedomaindetails");
                                            }
                                        }
                                    }
                                }
                            }
                            // Check whether this row already exists in the database
                            if (repeatData(hashStr.toString().hashCode(), querySql)) {
                                // Duplicate row: do not insert it.
                                // Clear the buffer and move on to the next row.
                                hashStr.delete(0, hashStr.length());
                                continue;
                            } else {
                                query.put("tid", GenerateSerial.getUUID());
                                query.put("hashcode", hashStr.toString().hashCode());
                                // Save the collected row to the database
                                service.saveObjects(saveSql, query);
                                // Clear the buffer and move on to the next row
                                hashStr.delete(0, hashStr.length());
                            }
                        }
                    }
                }
            }
        }
    } catch (Exception e) {
        logger.info("-------------> Failed to parse the HTML content!");
    }
}
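Both the row dedup check and the virus-detail branch above call a repeatData(hashcode, statementId) helper that is not shown in the original post. A minimal sketch of what it might look like, assuming the service DAO exposes a getObjects(statementId, params) lookup (that method name and signature are assumptions, not part of the original code):

// Hypothetical dedup helper: returns true if a row with this hashcode already exists.
// service.getObjects(...) is an assumed DAO lookup; adapt it to your persistence layer.
private boolean repeatData(int hashcode, String querySql) {
    HashMap<String, Object> params = new HashMap<String, Object>();
    params.put("hashcode", hashcode);
    List<?> rows = service.getObjects(querySql, params);
    return rows != null && !rows.isEmpty();
}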
2. Sometimes the results span several pages, so we first need the total page count, then loop over the pages, parsing and saving each one
public int getTotalPages(String htmlStr) {
    // String htmlStr = robot.readTextFile(path, "UTF-8");
    Parser myParser;
    NodeList nodeList = null;
    int totalPages = 0;
    myParser = Parser.createParser(htmlStr, "UTF-8");
    // Filter for <input> tags
    NodeFilter inputFilter = new NodeClassFilter(InputTag.class);
    OrFilter lastFilter = new OrFilter();
    lastFilter.setPredicates(new NodeFilter[] { inputFilter });
    try {
        // All input elements in the HTML
        nodeList = myParser.parse(lastFilter);
        for (int i = 0; i < nodeList.size(); i++) {
            if (nodeList.elementAt(i) instanceof InputTag) {
                // Get this input tag
                InputTag inputTag = (InputTag) nodeList.elementAt(i);
                if (inputTag.getAttribute("id") != null) {
                    // The page stores the total page count in <input id="total" value="...">
                    if (inputTag.getAttribute("id").equalsIgnoreCase("total")) {
                        // logger.info("-------------------------->" + inputTag.getAttribute("value"));
                        totalPages = Integer.parseInt(inputTag.getAttribute("value"));
                    }
                }
            }
        }
    } catch (Exception e) {
        logger.info("-------------------> Failed to parse the total page count!");
    }
    return totalPages;
}
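A quick sanity check of the method against a hand-made fragment; the markup below is an assumption about what the real page looks like (a hidden input with id "total" holding the page count):

// Assumed markup: the page exposes its page count as <input id="total" value="...">
String sample = "<body><input type=\"hidden\" id=\"total\" value=\"12\"/></body>";
int pages = getTotalPages(sample);   // expected: 12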
3. Parsing hyperlinks in the HTML
public void parseHref(String path, String querySql, String saveSql) {
    String htmlStr = robot.readTextFile(path, "UTF-8");
    // The fragment cannot be parsed without a <body> element, so wrap it
    htmlStr = "<body>" + htmlStr + "</body>";
    // Create a Parser from the string and the given charset
    Parser parser = Parser.createParser(htmlStr, "UTF-8");
    // Create an HtmlPage visitor: HtmlPage(Parser parser)
    HtmlPage page = new HtmlPage(parser);
    try {
        parser.visitAllNodesWith(page);
    } catch (ParserException e) {
        e.printStackTrace();
    }
    // All nodes in the body
    NodeList nodelist = page.getBody();
    // Build a filter to select the nodes we want
    NodeFilter filter = new TagNameFilter("A");
    // Apply the filter to get the matching nodes
    nodelist = nodelist.extractAllNodesThatMatch(filter, true);
    for (int i = 0; i < nodelist.size(); i++) {
        LinkTag link = (LinkTag) nodelist.elementAt(i);
        // Link URL
        logger.info(link.getAttribute("href") + "\n");
        // Link text
        logger.info(link.getStringText());
    }
}
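If all you need are the link targets, LinkTag also exposes them directly, which avoids the regular expression used in section 1; a minimal sketch of that variant (htmlStr stands for any page fragment that has already been fetched):

try {
    Parser parser = Parser.createParser(htmlStr, "UTF-8");
    NodeList links = parser.parse(new NodeClassFilter(LinkTag.class));
    for (int i = 0; i < links.size(); i++) {
        LinkTag link = (LinkTag) links.elementAt(i);
        // getLink() returns the href value, getLinkText() the anchor text
        logger.info(link.getLink() + " -> " + link.getLinkText());
    }
} catch (ParserException e) {
    e.printStackTrace();
}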
4. Collecting the data: log in, then call the parsing methods
// Log in to the system with HttpClient; data can only be collected after logging in
boolean logging = robot.login();
if (logging) {
    String wssHistoryUrl = "http://124.238.214.79/platform/pages/getWssHistory.action?startDate=" + startDate + "&endDate=" + endDate + "&pageContext.currentpage=1";
    // Fetch the URL list page
    String htmlStr1 = robot.get(wssHistoryUrl);
    // Get the total page count first
    int wss = this.getTotalPages(htmlStr1);
    // Pages are numbered from 1 (the first request above already used currentpage=1)
    for (int i = 1; i <= wss; i++) {
        // Parse the collected data for this page and insert it into the database
        this.parseData(robot.getText2("http://124.238.214.79/platform/pages/getWssHistory.action?startDate=" + startDate + "&endDate=" + endDate + "&pageContext.currentpage=" + i), "alarm.ipmanage.url.repeatdata", "alarm.ipmanage.saveurl");
    }
}