在 HTML 和 XML 的解析中,有很多库可以使用,例如 dom4j、jsoup 等。归根到底,它们的解析都离不开 DOM 树:都是先把文档转化为一棵 DOM 树(即一个 Document 对象),再在这个对象上进行操作。下面介绍一些常用的解析方法。
首先获取一个document对象
/**
 * Reads the XML resource at the given URL into a document object.
 *
 * @param url location of the XML document to load
 * @return the document produced by {@code SAXReader.read}
 * @throws DocumentException propagated from {@code SAXReader.read}
 */
public Document parse(URL url) throws DocumentException {
    // Alternative shown in the original notes: reader.read("src/Book.xml") loads a local file instead.
    return new SAXReader().read(url);
}
Using Iterators
获取根节点
// Fetch the root element of the document.
Element root = document.getRootElement();

// Walk the child elements of the root.
Iterator children = root.elementIterator();
while (children.hasNext()) {
    Element element = (Element) children.next();
    // do something
}

// Walk only the child elements of the root that are named "foo".
Iterator fooChildren = root.elementIterator("foo");
while (fooChildren.hasNext()) {
    Element foo = (Element) fooChildren.next();
    // do something
}

// Walk the attributes of the root.
Iterator rootAttributes = root.attributeIterator();
while (rootAttributes.hasNext()) {
    Attribute attribute = (Attribute) rootAttributes.next();
    // do something
}
Powerful Navigation with XPath
List list = document.selectNodes( "//foo/bar" );
Node node = document.selectSingleNode( "//foo/bar/author" );
String name = node.valueOf( "@name" )
Creating a new XML document
Document document = DocumentHelper.createDocument();
Element root = document.addElement( "root" );
Element author1 = root.addElement( "author" )
.addAttribute( "name", "James" )
.addAttribute( "location", "UK" )
.addText( "James Strachan" );
Element author2 = root.addElement( "author" )
.addAttribute( "name", "Bob" )
.addAttribute( "location", "US" )
.addText( "Bob McWhirter" );
Writing a document to a file
// Write the document to "output.xml".
// The writer is given an OutputStream rather than a FileWriter so that it chooses the
// character encoding itself instead of inheriting the platform default charset.
// NOTE(review): confirm the default OutputFormat encoding for the dom4j version in use.
XMLWriter writer = new XMLWriter(new java.io.FileOutputStream("output.xml"));
try {
    writer.write(document);
} finally {
    // Close even when write(...) throws, so the file handle is not leaked.
    writer.close();
}

// Pretty print the document to System.out
OutputFormat format = OutputFormat.createPrettyPrint();
writer = new XMLWriter(System.out, format);
writer.write(document);

// Compact format to System.out
format = OutputFormat.createCompactFormat();
writer = new XMLWriter(System.out, format);
writer.write(document);
文档的更多内容在这里:http://www.open-open.com/jsoup/selector-syntax.htm
// Parse an HTML string into a Document.
Document docFromString = Jsoup.parse(html);

// Parse an HTML body fragment.
Document docFromFragment = Jsoup.parseBodyFragment(html);

// Fetch an HTML document from a web site (HTTP GET) and parse it.
// The jsoup connection API also offers a POST variant.
Document docFromUrl = Jsoup.connect("http://example.com/").get();

// Parse with an explicit charset ("UTF-8") and base URI.
// NOTE(review): `input` is presumably a File or InputStream defined elsewhere - confirm.
Document docFromInput = Jsoup.parse(input, "UTF-8", "http://example.com/");
利用jsoup解析进行深度遍历和解析,来抽取特定的内容
创建一个类
/**
 * Immutable holder that pairs an element with the bookkeeping values
 * recorded for it during a tree walk.
 */
public class TagStruct {
    /** The wrapped element. */
    private final Element e;
    /** Level value supplied by the caller; presumably the element's depth in the tree. */
    private final int deep;
    /** XPath-style string supplied by the caller for this element. */
    private final String xpath;

    /**
     * Creates a holder for the given element.
     *
     * @param e     the element to wrap
     * @param deep  level value associated with the element
     * @param xpath XPath-style string associated with the element
     */
    public TagStruct(Element e, int deep, String xpath) {
        this.e = e;
        this.deep = deep;
        this.xpath = xpath;
    }

    /** @return the wrapped element */
    public Element getE() {
        return e;
    }

    /** @return the level value given at construction */
    public int getDeep() {
        return deep;
    }

    /** @return the XPath-style string given at construction */
    public String getXpath() {
        return xpath;
    }
}
解析的过程
//http://tieba.baidu.com/f?kw=c%E8%AF%AD%E8%A8%80&fr=index
Document doc = Jsoup.connect("http://tieba.baidu.com/f?kw=c%E8%AF%AD%E8%A8%80&fr=index").get();
List list = new ArrayList();
Stack sk = new Stack();
Elements allElements = doc.getAllElements();
Element child = doc.child(0);//html标签,跟标签
Element body = doc.body();
doc.siblingElements();
System.out.println(doc.getElementsByTag("a").size());
//深度优先遍历(使用栈,后进先出)
TagStruct t = new TagStruct(body,1,"//body");
sk.push(t);
while (!sk.isEmpty()){
TagStruct pop = sk.pop();
Element e = pop.getE();
Elements elements = e.children();
for(int i=0;i comparator = new Comparator() {
public int compare(TagStruct o1, TagStruct o2) {
if(o1.getDeep()>o2.getDeep()){
return o1.getDeep()-o2.getDeep();
}else {
return o1.getDeep()-o2.getDeep();
}
}
};
Collections.sort(list,comparator);
//list.sort(comparator);
for(int i=0;i