The important point about HTML parsing in Java is to use a parser designed for it. While you can parse HTML using the default XML parser, doing so is brittle because that parser will only accept well-formed, strict XHTML.
Hence, I highly recommend using the TagSoup library, which slots right into the standard parsing framework but also handles crappy, real-world HTML.
import java.net.URL; import org.xml.sax.Attributes; import org.xml.sax.helpers.DefaultHandler; import org.ccil.cowan.tagsoup.jaxp.SAXParserImpl; public class HTMLParseExample { public static void main(String args[]) throws Exception { // print the 'src' attributes of <img> tags // from http://www.yahoo.com/ // using the TagSoup parser SAXParserImpl.newInstance(null).parse( new URL("http://www.yahoo.com/").openConnection().getInputStream(), new DefaultHandler() { public void startElement(String uri, String localName, String name, Attributes a) { if (name.equalsIgnoreCase("img")) System.out.println(a.getValue("src")); } } ); } }
And here's a slightly more complex example (it collects and prints the text inside nested <p> tags),
this time using the standard Java XML parser, Xerces, instead of TagSoup.
import java.io.InputStream;
import java.net.URL;
import java.util.ArrayList;

import org.xml.sax.Attributes;
import org.xml.sax.InputSource;
import org.xml.sax.helpers.DefaultHandler;

/**
 * Prints the text found inside {@code <p> ... </p>} tags on http://www.w3.org/,
 * using the standard Java XML parser (Xerces).
 */
public class XHTMLParseExample {

    /**
     * Downloads the page, parses it with the JAXP default SAX parser and
     * prints each paragraph's text prefixed with {@code "PARA: "}.
     *
     * @param args ignored
     * @throws Exception if the page cannot be fetched or is not well-formed XML
     */
    public static void main(String[] args) throws Exception {
        InputStream page = new URL("http://www.w3.org/").openConnection().getInputStream();
        try {
            javax.xml.parsers.SAXParserFactory.newInstance().newSAXParser().parse(page,
                new DefaultHandler() {
                    // Used as a stack: one buffer per currently open <p>,
                    // with the innermost paragraph at the end of the list.
                    private final ArrayList<StringBuilder> p = new ArrayList<StringBuilder>();

                    @Override
                    public void startElement(String uri, String localName, String name,
                                             Attributes a) {
                        // push a string buffer for every <p> tag
                        if (name.equalsIgnoreCase("p")) {
                            p.add(new StringBuilder());
                        }
                    }

                    @Override
                    public void endElement(String uri, String localName, String name) {
                        // pop and print a string buffer for every </p> tag
                        // (print, not println: paragraphs are emitted back to back)
                        if (name.equalsIgnoreCase("p")) {
                            System.out.print("PARA: " + p.remove(p.size() - 1));
                        }
                    }

                    @Override
                    public void characters(char[] ch, int start, int length) {
                        // append any characters to the innermost open paragraph;
                        // text outside every <p> is ignored
                        if (!p.isEmpty()) {
                            p.get(p.size() - 1).append(ch, start, length).append(' ');
                        }
                    }

                    // if we don't include a fake resolveEntity() method, Xerces
                    // will try to download the entity URI listed in its cached DTD:
                    // http://www.w3.org/TR/xhtml1/DTD/xhtml-lat1.ent
                    @Override
                    public InputSource resolveEntity(String publicId, String systemId)
                            throws org.xml.sax.SAXException, java.io.IOException {
                        final String fake = "<!ENTITY nbsp \" \">";
                        return new InputSource(new java.io.StringReader(fake));
                    }
                });
        } finally {
            // Release the network connection even if parsing fails.
            page.close();
        }
    }
}