XPath是专业的XML结构化文档查询语言,语法功能强大,本文不涉及XPath语法教程。
jsoup 是一款Java 的HTML解析器,可直接解析某个URL地址、HTML文本内容。它提供了一套非常省力的API,可通过DOM,CSS以及类似于jQuery的操作方法来取出和操作数据,但是选取某个元素时还是没有xpath那么简单直接,而且xpath带了很多选择库。
然而遗憾的是,jsoup并不支持xpath,于是博主就写了一个让jsoup支持xpath的工具类,希望能帮助到有需要的朋友!
package com.ry.mytools.util;
import com.sun.org.apache.xerces.internal.dom.ElementImpl;
import org.apache.commons.lang3.StringUtils;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Attribute;
import org.jsoup.select.Elements;
import org.jsoup.select.NodeTraversor;
import org.jsoup.select.NodeVisitor;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.w3c.dom.*;
import javax.xml.namespace.QName;
import javax.xml.parsers.DocumentBuilder;
import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.parsers.ParserConfigurationException;
import javax.xml.transform.Transformer;
import javax.xml.transform.TransformerException;
import javax.xml.transform.TransformerFactory;
import javax.xml.transform.dom.DOMSource;
import javax.xml.transform.stream.StreamResult;
import javax.xml.xpath.XPath;
import javax.xml.xpath.XPathConstants;
import javax.xml.xpath.XPathFactory;
import java.io.StringWriter;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
import java.util.concurrent.locks.Lock;
import java.util.concurrent.locks.ReentrantLock;
/**
 * XPath helper for jsoup.
 *
 * <p>Each query copies the supplied jsoup element tree into a fresh W3C DOM document
 * (see {@link #fromJsoup}) and evaluates the XPath expression against that copy with the
 * JDK's {@code javax.xml.xpath} API. Matched W3C elements are turned back into jsoup
 * elements by serialising them and re-parsing the markup with jsoup.
 *
 * <p>The shared JAXP objects (factory, XPath, transformer factory) are only used while
 * holding {@code LOCK}.
 *
 * @author liuhh
 */
@SuppressWarnings("restriction")
public class JsoupParserUtil {
    protected static final DocumentBuilderFactory factory = DocumentBuilderFactory.newInstance();
    private static final Logger log = LoggerFactory.getLogger(JsoupParserUtil.class);
    private static final XPath xPath = XPathFactory.newInstance().newXPath();
    protected static TransformerFactory tf = TransformerFactory.newInstance();
    /** Guards every use of {@code factory}, {@code xPath} and {@code tf}. */
    private static final Lock LOCK = new ReentrantLock();

    /**
     * Counts the nodes selected by an XPath expression.
     *
     * @param ele   jsoup element (or document) to query
     * @param xpath XPath expression that evaluates to a node-set
     * @return number of selected nodes; 0 when nothing matches or evaluation fails
     */
    public static int getEleChildNum(final org.jsoup.nodes.Element ele, final String xpath) {
        try {
            Object res = parse(ele, xpath, XPathConstants.NODESET);
            if (res instanceof NodeList) {
                return ((NodeList) res).getLength();
            }
        } catch (Exception e) {
            log.error("根据xpath:{},获取子节点个数出现错误,错误原因:{}", xpath, e.getMessage(), e);
        }
        return 0;
    }

    /**
     * Evaluates an XPath expression as a boolean.
     *
     * @param ele   jsoup element (or document) to query
     * @param xpath XPath expression
     * @return {@code true} only when the expression evaluates to {@code true};
     *         {@code false} otherwise, including on any failure
     */
    public static boolean exists(final org.jsoup.nodes.Element ele, final String xpath) {
        try {
            Object res = parse(ele, xpath, XPathConstants.BOOLEAN);
            return Boolean.TRUE.equals(res);
        } catch (Exception e) {
            log.error("检查xpath:{},是否存在时出现错误,!{}", xpath, e.getMessage(), e);
        }
        return false;
    }

    /**
     * Returns the first node selected by the expression as a JDK-internal Xerces element.
     *
     * @param ele   jsoup element (or document) to query
     * @param xpath XPath expression selecting a node
     * @return the matched element, or {@code null} when there is no match, the match is
     *         not an {@code ElementImpl}, or evaluation fails
     */
    public static ElementImpl getW3cElementImpl(final org.jsoup.nodes.Element ele, final String xpath) {
        try {
            Object res = parse(ele, xpath, XPathConstants.NODE);
            if (res instanceof ElementImpl) {
                return (ElementImpl) res;
            }
            return null;
        } catch (Exception e) {
            log.error("根据xpath:{},得到w3c的Element对象出现错误,原因:{}", xpath, e.getMessage(), e);
        }
        return null;
    }

    /**
     * Returns the first element selected by the expression as a jsoup element.
     *
     * @param ele   jsoup element (or document) to query
     * @param xpath XPath expression selecting an element
     * @return a jsoup element re-parsed from the match, or {@code null} when there is no
     *         element match or any step fails
     */
    public static org.jsoup.nodes.Element getJsoupElement(final org.jsoup.nodes.Element ele, final String xpath) {
        try {
            Object res = parse(ele, xpath, XPathConstants.NODE);
            // Test against the public DOM interface so the result does not depend on
            // which DOM implementation the JAXP factory produced.
            if (res instanceof Element) {
                return toJsoupElement((Element) res);
            }
            return null;
        } catch (Exception e) {
            log.error("根据xpath:{},得到jsoup的Element对象出现错误,原因:{}", xpath, e.getMessage(), e);
        }
        return null;
    }

    /**
     * Returns every element selected by the expression as jsoup elements.
     *
     * @param ele   jsoup element (or document) to query
     * @param xpath XPath expression selecting a node-set
     * @return the converted elements (non-element nodes and failed conversions are
     *         left out), or {@code null} when the node-set is empty or evaluation fails
     */
    public static Elements getJsoupElements(final org.jsoup.nodes.Element ele, final String xpath) {
        try {
            NodeList nodeList = getNodeList(ele, xpath);
            if (null != nodeList && nodeList.getLength() > 0) {
                int len = nodeList.getLength();
                Elements elements = new Elements();
                for (int i = 0; i < len; i++) {
                    Node node = nodeList.item(i);
                    if (node instanceof Element) {
                        org.jsoup.nodes.Element converted = toJsoupElement(node);
                        // A failed conversion yields null; never hand null entries to callers.
                        if (converted != null) {
                            elements.add(converted);
                        }
                    }
                }
                return elements;
            }
        } catch (Exception e) {
            log.error("根据xpath:{},得到jsoup的Element对象出现错误,原因:{}", xpath, e.getMessage(), e);
        }
        return null;
    }

    /**
     * Evaluates the expression as a node-set against the W3C copy of the jsoup tree.
     *
     * @param ele   jsoup element (or document) to query
     * @param xpath XPath expression selecting a node-set
     * @return the W3C node list, or {@code null} when evaluation fails
     */
    public static NodeList getNodeList(final org.jsoup.nodes.Element ele, final String xpath) {
        try {
            Object res = parse(ele, xpath, XPathConstants.NODESET);
            if (res instanceof NodeList) {
                return (NodeList) res;
            }
        } catch (Exception e) {
            log.error(e.getMessage(), e);
        }
        return null;
    }

    /**
     * Returns the text selected by the expression.
     *
     * <p>When exactly one node matches, the XPath string value of the expression is
     * returned. When several nodes match, their {@code getNodeValue()} results are
     * concatenated (null values skipped, {@code "\r\n"} replaced by {@code "."}).
     *
     * @param ele   jsoup element (or document) to query
     * @param xpath XPath expression selecting a node-set (typically text or attribute nodes)
     * @return the resulting string, or {@code null} when nothing matches or evaluation fails
     */
    public static String getXpathString(final org.jsoup.nodes.Element ele, final String xpath) {
        try {
            // Convert the jsoup tree once and reuse it for both evaluations below.
            Node root = fromJsoup(ele);
            Object matched = parse(root, xpath, XPathConstants.NODESET);
            if (!(matched instanceof NodeList)) {
                return null;
            }
            NodeList nodes = (NodeList) matched;
            if (nodes.getLength() == 1) {
                Object res = parse(root, xpath, XPathConstants.STRING);
                return res == null ? null : res.toString();
            }
            List<String> values = nodeValues(nodes);
            if (values == null) {
                return null;
            }
            StringBuilder joined = new StringBuilder();
            for (String text : values) {
                if (text != null) {
                    joined.append(text.replace("\r\n", "."));
                }
            }
            return joined.toString();
        } catch (Exception e) {
            log.error("根据xpath:{}查询字符串时出现错误:{}", xpath, e.getMessage(), e);
        }
        return null;
    }

    /**
     * Returns the {@code getNodeValue()} of every node selected by the expression.
     *
     * @param ele   jsoup element (or document) to query
     * @param xpath XPath expression selecting a node-set
     * @return one entry per selected node (entries may be {@code null}), or {@code null}
     *         when nothing matches or evaluation fails
     */
    public static List<String> getXpathListString(final org.jsoup.nodes.Element ele, final String xpath) {
        try {
            Object res = parse(ele, xpath, XPathConstants.NODESET);
            if (res instanceof NodeList) {
                return nodeValues((NodeList) res);
            }
            return null;
        } catch (Exception e) {
            log.error("根据xpath:{}查询字符串列表时出现错误:{}", xpath, e.getMessage(), e);
        }
        return null;
    }

    /** Collects the node values of a node list; returns null for an empty list. */
    private static List<String> nodeValues(final NodeList nodes) {
        int length = nodes.getLength();
        if (length <= 0) {
            return null;
        }
        List<String> values = new ArrayList<>(length);
        for (int i = 0; i < length; i++) {
            Node node = nodes.item(i);
            values.add(null == node ? null : node.getNodeValue());
        }
        return values;
    }

    /**
     * Converts the jsoup element to a W3C document and evaluates the expression on it.
     *
     * @param doc      jsoup element (or document) to query
     * @param xPathStr XPath expression
     * @param qName    desired result type, one of the {@link XPathConstants} values
     * @return the evaluation result, or {@code null} when an argument is missing or
     *         evaluation fails
     */
    public static Object parse(final org.jsoup.nodes.Element doc, final String xPathStr, final QName qName) {
        Node node = fromJsoup(doc);
        return parse(node, xPathStr, qName);
    }

    /**
     * Evaluates an XPath expression against a W3C node.
     *
     * @param doc      context node
     * @param xPathStr XPath expression
     * @param qName    desired result type, one of the {@link XPathConstants} values
     * @return the evaluation result, or {@code null} when an argument is missing or
     *         evaluation fails (failures are logged, never thrown)
     */
    public static Object parse(final Node doc, final String xPathStr, final QName qName) {
        if (doc == null) {
            log.warn("解析文档为null!");
            return null;
        }
        if (StringUtils.isBlank(xPathStr)) {
            log.warn("解析的Xpath路径为空!");
            return null;
        }
        if (null == qName) {
            log.warn("解析类型为null!");
            return null;
        }
        // Acquire the lock before entering the try so the finally block only ever
        // releases a lock that is actually held.
        LOCK.lock();
        try {
            return xPath.evaluate(xPathStr, doc, qName);
        } catch (Exception e) {
            log.warn("解析Xpath:{},出现错误,解析类型:{},错误原因:{}!", xPathStr, qName, e.getMessage(), e);
            return null;
        } finally {
            LOCK.unlock();
        }
    }

    /**
     * Converts a Xerces element back to a jsoup element.
     *
     * @param elementImpl W3C element to convert
     * @return the jsoup element, or {@code null} when conversion fails
     */
    public static org.jsoup.nodes.Element getJsoupEle(final ElementImpl elementImpl) {
        return toJsoupElement(elementImpl);
    }

    /**
     * Serialises a W3C node, re-parses the markup with jsoup and returns the first child
     * element of the resulting body.
     *
     * <p>NOTE(review): because the markup is re-parsed as an HTML document, elements that
     * the HTML parser does not keep as the first child of body presumably come back as
     * {@code null} or as a different element - confirm with the callers' inputs.
     */
    private static org.jsoup.nodes.Element toJsoupElement(final Node w3cNode) {
        try {
            String markup = getW3cDocString(w3cNode);
            org.jsoup.nodes.Document reparsed = Jsoup.parse(markup);
            return reparsed.body().child(0);
        } catch (Exception e) {
            log.error("根据ElementImpl得到Jsoup的Element出现错误,错误原因:{}", e.getMessage(), e);
            return null;
        }
    }

    /**
     * Converts a W3C document to a jsoup document by serialising and re-parsing it.
     *
     * @param doc W3C document
     * @return the jsoup document parsed from the serialised markup
     * @throws Exception when serialisation fails
     */
    public static org.jsoup.nodes.Document fromW3C(final Document doc) throws Exception {
        String markup = getW3cDocString(doc);
        return Jsoup.parse(markup);
    }

    /**
     * Copies a jsoup element tree into a new W3C document.
     *
     * <p>For a jsoup document the copy starts at its first child element; for any other
     * element it starts at the element itself.
     *
     * @param in jsoup element or document; may be {@code null}
     * @return the new W3C document (empty when a jsoup document has no child element),
     *         or {@code null} when {@code in} is {@code null} or no document builder
     *         can be created
     */
    public static Node fromJsoup(final org.jsoup.nodes.Element in) {
        if (null == in) {
            return null;
        }
        try {
            Document out = newW3cDocument();
            org.jsoup.nodes.Element root = in;
            if (in instanceof org.jsoup.nodes.Document) {
                // Look at element children (not all child nodes) so a document holding
                // only non-element nodes yields an empty result instead of an
                // IndexOutOfBoundsException.
                Elements children = in.children();
                if (children.isEmpty()) {
                    return out;
                }
                root = children.get(0);
            }
            NodeTraversor traversor = new NodeTraversor(new W3CBuilder(out));
            traversor.traverse(root);
            return out;
        } catch (ParserConfigurationException e) {
            log.error("创建W3C文档失败,错误原因:{}", e.getMessage(), e);
            return null;
        }
    }

    /** Creates an empty W3C document, using the shared factory under the lock. */
    private static Document newW3cDocument() throws ParserConfigurationException {
        LOCK.lock();
        try {
            DocumentBuilder builder = factory.newDocumentBuilder();
            return builder.newDocument();
        } finally {
            LOCK.unlock();
        }
    }

    /**
     * Serialises a W3C node to a string with a default {@link Transformer}.
     *
     * @param doc node to serialise
     * @return the serialised markup
     * @throws IllegalStateException when the transformation fails
     * @throws Exception             declared for backward compatibility
     */
    public static String getW3cDocString(final Node doc) throws Exception {
        try (StringWriter writer = new StringWriter()) {
            DOMSource domSource = new DOMSource(doc);
            StreamResult result = new StreamResult(writer);
            LOCK.lock();
            try {
                Transformer transformer = tf.newTransformer();
                transformer.transform(domSource, result);
                return writer.toString();
            } finally {
                LOCK.unlock();
            }
        } catch (TransformerException e) {
            throw new IllegalStateException(e);
        }
    }

    /**
     * Copies the attributes of a jsoup node onto a W3C element.
     *
     * <p>Attributes whose name is rejected by the DOM as an invalid XML name are skipped,
     * so a single malformed HTML attribute does not abort the whole conversion.
     *
     * @param source jsoup node whose attributes are read
     * @param el     W3C element that receives the attributes
     * @throws DOMException for any DOM failure other than an invalid attribute name
     */
    public static void copyAttributes(final org.jsoup.nodes.Node source, final Element el) {
        for (Attribute attribute : source.attributes()) {
            try {
                el.setAttribute(attribute.getKey(), attribute.getValue());
            } catch (DOMException e) {
                if (e.code != DOMException.INVALID_CHARACTER_ERR) {
                    throw e;
                }
                log.debug("跳过不符合XML命名规则的属性:{}", attribute.getKey());
            }
        }
    }
}
/**
 * jsoup {@link NodeVisitor} that mirrors the visited jsoup nodes into a W3C DOM document.
 *
 * <p>Elements become W3C elements (with their attributes copied), text and data nodes
 * become W3C text nodes, and comments become W3C comments. Other node types are ignored.
 */
class W3CBuilder implements NodeVisitor {
    /** Target document that owns every node created here. */
    private final Document doc;
    /** W3C element currently being filled; {@code null} until the first element is visited. */
    private Element dest;

    public W3CBuilder(Document doc) {
        this.doc = doc;
    }

    /**
     * Called when a jsoup node is first reached: creates the matching W3C node and
     * attaches it to the current W3C element (or to the document for the first element).
     */
    @Override
    public void head(final org.jsoup.nodes.Node source, int depth) {
        if (source instanceof org.jsoup.nodes.Element) {
            org.jsoup.nodes.Element sourceEl = (org.jsoup.nodes.Element) source;
            Element el = doc.createElement(sourceEl.tagName());
            JsoupParserUtil.copyAttributes(sourceEl, el);
            if (dest == null) {
                doc.appendChild(el);
            } else {
                dest.appendChild(el);
            }
            // Descend: children visited next are appended to this element.
            dest = el;
            return;
        }
        if (dest == null) {
            // No element has been visited yet, so there is no parent to attach a
            // text/comment/data node to; skip it instead of failing with an NPE.
            return;
        }
        if (source instanceof org.jsoup.nodes.TextNode) {
            org.jsoup.nodes.TextNode sourceText = (org.jsoup.nodes.TextNode) source;
            Text text = doc.createTextNode(sourceText.getWholeText());
            dest.appendChild(text);
        } else if (source instanceof org.jsoup.nodes.Comment) {
            org.jsoup.nodes.Comment sourceComment = (org.jsoup.nodes.Comment) source;
            Comment comment = doc.createComment(sourceComment.getData());
            dest.appendChild(comment);
        } else if (source instanceof org.jsoup.nodes.DataNode) {
            org.jsoup.nodes.DataNode sourceData = (org.jsoup.nodes.DataNode) source;
            Text node = doc.createTextNode(sourceData.getWholeData());
            dest.appendChild(node);
        }
    }

    /**
     * Called when a jsoup node has been fully visited: after an element, moves the
     * current W3C element back up to its parent element, if it has one.
     */
    @Override
    public void tail(final org.jsoup.nodes.Node source, int depth) {
        if (source instanceof org.jsoup.nodes.Element
                && dest != null
                && dest.getParentNode() instanceof Element) {
            dest = (Element) dest.getParentNode();
        }
    }
}
import java.io.IOException;
import java.net.URL;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
/**
 * Manual demo of {@code JsoupParserUtil}: downloads a news page and prints the results
 * of a few XPath queries. Requires network access.
 */
public class JsoupParserUtilsTest {
    /**
     * Fetches the page and runs the sample queries.
     *
     * @param args unused
     * @throws IOException when the page cannot be downloaded
     */
    public static void main(String[] args) throws IOException {
        String url = "http://mil.news.sina.com.cn/china/2016-09-29/doc-ifxwmamy9955666.shtml";
        Document doc = Jsoup.parse(new URL(url), 10000);
        String titleXpath = "//*[@id='main_title']/text()";
        String timeXpath = "//*[@id='page-tools']/span/span[position() = 1]";
        // position() must be called as a function; a bare "position" would be read as
        // a child element named <position>.
        System.out.println(JsoupParserUtil.exists(doc, "/html/body/div[position()>1000000]"));
        System.out.println(JsoupParserUtil.getXpathString(doc, titleXpath));
        Element element = JsoupParserUtil.getJsoupElement(doc, timeXpath);
        if (element == null) {
            // getJsoupElement returns null when nothing matches or conversion fails.
            System.out.println("No element matched: " + timeXpath);
            return;
        }
        System.out.println(element.text());
        System.out.println(element.attr("class"));
    }
}
————————————————
让你的Jsoup支持Xpath