使用HTMLParser分析discuz帖子页中所有帖子(主题帖及回帖)

使用HTMLParser分析discuz帖子页中所有帖子(主题帖及回帖)

package com.rupeng.search.discuz;

import java.net.URLConnection;

import org.htmlparser.Parser;
import org.htmlparser.Tag;
import org.htmlparser.tags.Div;
import org.htmlparser.util.NodeList;
import org.htmlparser.util.ParserException;
import org.htmlparser.visitors.HtmlPage;
import org.htmlparser.visitors.NodeVisitor;

public class DiscuzDefaultStyleHTMLParser
{
 private String title;
 private String bodyText;

 public DiscuzDefaultStyleHTMLParser(URLConnection urlConnection) throws ParserException
 {
  Parser parser = new Parser(urlConnection);
  HtmlPage visitor = new HtmlPage(parser);
  parser.visitAllNodesWith(visitor);
  this.title = visitor.getTitle();
  NodeList nodeList = visitor.getBody();
  final StringBuffer sb = new StringBuffer();
  nodeList.visitAllNodesWith(new NodeVisitor() {

   @Override
   public void visitTag(Tag tag)
   {
    //因为主题、回帖都是包含在Div里,而且主题、回帖的divid都是以“postmessage_”开头
    if (tag instanceof Div)
    {
     Div div = (Div) tag;
     String divId = div.getAttribute("id");
     if (divId != null&& divId.startsWith("postmessage_"))
     {
      sb.append(div.getStringText());
     }
    }
   }
  });

  this.bodyText = sb.toString();
 }

 public String getTitle()
 {
  return title;
 }

 public String getThreadText()
 {
  return bodyText;
 }
}

你可能感兴趣的:(使用HTMLParser分析discuz帖子页中所有帖子(主题帖及回帖))