简单的html解析工具类

最近在做一个项目,需要解析 HTML 报告里的数据,而各份报告的格式又不尽相同,于是写了一个简单的工具类。

实现上结合使用了 jsoup 和 sipsoup 这两个库。


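Maven 依赖:

<dependency>
    <groupId>org.jsoup</groupId>
    <artifactId>jsoup</artifactId>
    <version>1.10.2</version>
</dependency>

<dependency>
    <groupId>com.virjar</groupId>
    <artifactId>sipsoup</artifactId>
    <version>RELEASE</version>
</dependency>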

import java.io.File;
import java.io.IOException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.apache.log4j.Logger;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;

import com.virjar.sipsoup.exception.XpathSyntaxErrorException;
import com.virjar.sipsoup.parse.XpathParser;

/**   
* @Title: ParseHtmlUtil.java 
* @Package com.bluedon.track.util 
* @Description: 解析html漏洞数据工具类
* @author   
* @date 2017年7月12日 下午3:32:56 
* @version V1.0   
*/
public class ParseHtmlUtil {

	private final static Logger log = Logger.getLogger(ParseHtmlUtil.class);

	/**
	 * @Title: paserHtml
	 * @Description: 解析html漏洞数据
	 * @param filePath
	 *            解析的html地址
	 * @param rootName
	 *            解析的根节点名称
	 * @param map
	 *            key->名称,value->解析地址规则
	 * @return List>
	 * @throws IOException 
	 */
	public static List> paserHtml(File file, String rootName, Map map) throws IOException {
		List> list = new ArrayList>();
	  	Document doc = Jsoup.parse(file, "UTF-8");
			String rootPath = map.get(rootName).toString();
			map.remove(rootName);
			try {
				List eles=XpathParser.compile(rootPath).evaluateToElement(doc);
				if(null!=eles&&eles.size()>0){
				for (Element node  : eles) {
						if (null != map || map.size() >0) {
							Set keySet = map.keySet();
							Map maps=new HashMap();
							/*此情况是ips有多条数据的*/
							boolean manyIps=false;
							Set manyNames=new HashSet();
			 				for (String key : keySet) {
			 					String path=map.get(key).toString();
			 					/*过滤 路径为空的,仅仅是字母的  */
			 					if(path.equals("")||checkRegex(path,"^[a-z]*$")){
			 						maps.put(key,map.get(key).toString());
			 					}else if(checkRegex(path, "^previousElementSibling.*$")) {/*绿盟的特殊处理 同级的前一个节点 */
			 						maps = changeForGree(maps,key, node,path);
			 					}else if(checkRegex(path, "^nextElementSibling.*$")){
									maps = changeForGree2(maps,key, node,path);
								}else{
			 						if(getNodes(node,map.get(key).toString()).size()>1){
				 						manyIps=true;
				 						manyNames.add(key);
				 					}else{
				 						maps.put(key,replaceHtml(getNode(node,map.get(key).toString()).html()));
				 					}
			 					}
			 				}
			 				/*此情况是ips有多条数据的*/
			 				if(manyIps){
			 					for (String key : manyNames) {
			 						List nodelist=getNodes(node,map.get(key).toString());
			 						for(Element n : nodelist){
			 								maps.put(key,replaceHtml(n.html()));
			 								list.add(maps);
			 						}
			 					}
			 				}else{
			 					list.add(maps);
			 				}
						}
					}
				}
			} catch (XpathSyntaxErrorException e) {
				log.info("parse error !");
				e.printStackTrace();
			}
		return list;
	}

	public static Map changeForGree(Map map,String key, Element node,String path) {
		Element node1 = node.previousElementSibling();
		map.put(key, getNode(node1, path.replace("previousElementSibling", "")).html().toString());
		return map;
	}

	public static Map changeForGree2(Map map,String key, Element node,String path) {
		Element node1 = getNode(node, path.replace("nextElementSibling", ""));
		map.put(key,node1.nextElementSibling().html() );
		return map;
	}
	/**
	 * 替换掉HTML标签方法
	 */
	public static String replaceHtml(String html) {
		if ("".equals(html)){
			return "";
		}
		String regEx = "<.+?>";
		Pattern p = Pattern.compile(regEx);
		Matcher m = p.matcher(html);
		String s = m.replaceAll("");
		return s;
	}
	
	public static boolean checkRegex(String value,String regex){
		Pattern p=Pattern.compile(regex); 
		Matcher matcher = p.matcher(value);	
		boolean rs = matcher.matches();
		return rs;
	}
	
	public static Element getNode(Element node,String xpath){
		Element ele=new Element("null");
			List list;
			try {
				list = XpathParser.compile(xpath).evaluateToElement(node);
				for (Element jxNode: list) {
		        	return jxNode;
		        }
			} catch (XpathSyntaxErrorException e) {
				e.printStackTrace();
			}
		return ele;
	}

	public static List getNodes(Element node,String xpath){
		List jxNodeList=new ArrayList();
			try {
				jxNodeList = XpathParser.compile(xpath).evaluateToElement(node);
			} catch (com.virjar.sipsoup.exception.XpathSyntaxErrorException e) {
				e.printStackTrace();
			}
		return jxNodeList;
	}
public static void main(String[] args) throws Exception {
		//
		File file = new File("F:/OneKeyDownLoads/index.html");
		Document doc = Jsoup.parse(file, "UTF-8");

		Map map = new HashMap();
                /*根节点  用来循环的*/
		map.put("rootpath",
				"//table[@id='vulDataTable']/tbody/tr[@class='even vh_ip']|//tr[@class='even vm_ip']|//tr[@class='even vl_ip']");
		// map.put("desc", "//td//table/tbody/tr[2]/td[2]");
		// map.put("solution","//td//table/tbody/tr[3]/td[2]");
		// map.put("cve","//td//table/tbody/tr[7]/td[2]/a");
		// map.put("cnnvd","//td//table/tbody/tr/td[text()*='CNNVD-']");
		// map.put("cnvd","//td//table/tbody/tr/td[text()*='CNVD-']");
		// map.put("type","host");
//		 map.put("name", "previousElementSibling//td[1]/a");
		 map.put("risk", "nextElementSibling//td//table/tbody/tr/td[text()*='威胁分值']");
//		 map.put("host", "//td//table/tbody/tr[1]/td[2]/a");
		List> list = paserHtml(file, "rootpath", map);
		System.out.println(JSON.toJSON(list));
		System.out.println(list.size());
		//System.out.println(checkRegex("previousElementSibling//td[1]/a", "^previousElementSibling.*$"));
	}}
 
  
 
 

你可能感兴趣的:(技术闲谈)