使用 NekoHTML 解析 HTML 文件

最近做了一个 HTML 解析的小项目，用的是 NekoHTML：解析静态的 HTML 页面，提取需要的信息，转换成 JSON 对象并保存到一个文件中。

思路是：先使用 NekoHTML 快速地把 HTML 中指定标签（如 table）里的内容提取出来，然后写入一个临时生成的 html 文件（如 temp.html），再把它重新解析成 Node 对象，这样就可以根据结构获取指定 Node 下的内容了。


核心代码如下:

/**
 * Parses a static HTML page and collects the DNS records it lists into a
 * {@code CustomerRecord}.
 *
 * <p>The work is done in two passes:
 * <ol>
 *   <li>A NekoHTML filter chain keeps only {@code table}, {@code tr} and {@code td}
 *       elements (and drops {@code title}), serialises the result, and strips every
 *       whitespace character from it.</li>
 *   <li>The stripped markup is written to a uniquely named temporary file, parsed into a
 *       DOM, and every {@code TR} element is inspected. Rows are recognised by their
 *       whitespace-free text content ({@code "SOARecord"}, {@code "ARecords"},
 *       {@code "MXRecords"}, {@code "NSRecords"} prefixes).</li>
 * </ol>
 *
 * <p>The temporary file is always deleted before this method returns, including when
 * parsing fails.
 *
 * @param filePath path of the HTML file to read; also passed to the parser as system id
 * @return a record holding the file name, the domain (when found) and the SOA, A, MX
 *         and NS record lists (each possibly empty)
 * @throws Exception if the file cannot be parsed, the temporary file cannot be written,
 *         or a matched row does not have the expected child structure
 */
public static CustomerRecord convertFileToObj(String filePath) throws Exception {
		CustomerRecord cr = new CustomerRecord();
		List<SOARec> soaList = new ArrayList<SOARec>();
		List<ARec> aList = new ArrayList<ARec>();
		List<MXRec> mxList = new ArrayList<MXRec>();
		List<NSRec> nxList = new ArrayList<NSRec>();

		File file = new File(filePath);
		cr.setFileName(file.getName());

		// Pass 1: reduce the page to its table skeleton.
		ElementRemover remover = new ElementRemover();
		remover.acceptElement("table", null);
		remover.acceptElement("td", null);
		remover.acceptElement("tr", null);
		remover.removeElement("title");
		StringWriter filteredDescription = new StringWriter();
		org.cyberneko.html.filters.Writer writer = new org.cyberneko.html.filters.Writer(filteredDescription, null);
		XMLDocumentFilter[] filters = { remover, writer, };
		XMLParserConfiguration parser = new HTMLConfiguration();
		parser.setProperty("http://cyberneko.org/html/properties/filters", filters);
		XMLInputSource source = new XMLInputSource(null, filePath, null);
		parser.parse(source);

		// Remove ALL whitespace (including spaces inside text); the row markers matched
		// below are therefore written without spaces. "\\s+" removes exactly what the
		// former pattern "\\s*|\t|\r|\n" removed.
		String description = filteredDescription.toString().replaceAll("\\s+", "");

		// Pass 2: round-trip through a temporary file and build a DOM from it.
		// A unique name avoids overwriting (and later deleting) a real "temp.html" next to
		// the input and avoids collisions between concurrent calls. getAbsoluteFile()
		// guards against a null parent when filePath is a bare file name; a null directory
		// makes createTempFile fall back to the default temporary directory.
		// NOTE(review): FileWriter uses the platform charset while the parser picks its
		// own default encoding - non-ASCII text may not round-trip; confirm with real data.
		File temp = File.createTempFile("neko-", ".html", file.getAbsoluteFile().getParentFile());
		Document document;
		try {
			Writer out = new FileWriter(temp, false);
			try {
				out.write(description);
			} finally {
				out.close();
			}
			DOMParser parser2 = new DOMParser();
			parser2.parse(temp.getPath());
			document = parser2.getDocument();
		} finally {
			// The DOM is held in memory, so the file is no longer needed.
			temp.delete();
		}

		// Index of the last row whose text is exactly the section heading.
		// NOTE(review): this starts at 0, so row 2 is also checked for "Domain" before any
		// heading row has been seen - kept as in the original; confirm this is intended.
		int headingRow = 0;

		NodeList rows = XPathAPI.selectNodeList(document, "//TR");
		for (int i = 0; i < rows.getLength(); i++) {
			Node row = rows.item(i);
			String text = row.getTextContent();

			// Domain: taken from the second child of the row two positions after the heading.
			if (text.equals("ZoneundRecordsbearbeiten")) {
				headingRow = i;
			}
			if (i == headingRow + 2 && text.contains("Domain")) {
				cr.setDomain(cellText(row, 1));
			}

			// In every record loop below index 0 is skipped (presumably a header row).

			// SOA
			if (text.startsWith("SOARecord")) {
				NodeList soaRows = recordRowsOf(row);
				for (int j = 1; j < soaRows.getLength(); j++) {
					Node r = soaRows.item(j);
					SOARec soa = new SOARec();
					soa.setDomain(cellText(r, 0));
					soa.setSeriennummer(cellText(r, 1));
					soa.setEmail(cellText(r, 2));
					soa.setPrimaryDNS(cellText(r, 3));
					soaList.add(soa);
				}
			}
			// A
			if (text.startsWith("ARecords")) {
				NodeList aRows = recordRowsOf(row);
				for (int j = 1; j < aRows.getLength(); j++) {
					Node r = aRows.item(j);
					ARec ar = new ARec();
					ar.setHost(cellText(r, 0));
					ar.setIp(cellText(r, 1));
					ar.setTtl(cellText(r, 2));
					aList.add(ar);
				}
			}
			// MX
			if (text.startsWith("MXRecords")) {
				NodeList mxRows = recordRowsOf(row);
				for (int j = 1; j < mxRows.getLength(); j++) {
					Node r = mxRows.item(j);
					MXRec mx = new MXRec();
					mx.setHost(cellText(r, 0));
					mx.setMailExchanger(cellText(r, 1));
					mx.setTtl(cellText(r, 2));
					mx.setPreference(cellText(r, 3));
					mxList.add(mx);
				}
			}
			// NS
			if (text.startsWith("NSRecords")) {
				NodeList nsRows = recordRowsOf(row);
				for (int j = 1; j < nsRows.getLength(); j++) {
					Node r = nsRows.item(j);
					NSRec ns = new NSRec();
					ns.setHost(cellText(r, 0));
					ns.setNameserver(cellText(r, 1));
					ns.setTtl(cellText(r, 2));
					nxList.add(ns);
				}
			}
		}

		cr.setaRecList(aList);
		cr.setMxRecList(mxList);
		cr.setSoaRecList(soaList);
		cr.setNxRecList(nxList);
		return cr;
	}

	/** Returns the children of the first child of the given row's second child. */
	private static NodeList recordRowsOf(Node row) {
		return row.getChildNodes().item(1).getChildNodes().item(0).getChildNodes();
	}

	/** Returns the text content of the child at {@code index} of the given node. */
	private static String cellText(Node parent, int index) {
		return parent.getChildNodes().item(index).getTextContent();
	}

你可能感兴趣的:(html解析)