使用jsoup解析html页面内容案例

 

/**
 * Downloads one index page of the regulation library for the given category,
 * extracts one {@code FaGui} entry per table row, reads the total page count
 * from the element with class {@code pager}, and wraps everything with
 * {@code ResultUtil.getResult}.
 *
 * <p>The result code passed to {@code ResultUtil} is {@code 1} with message
 * {@code "SUCCESS"} when the page was fetched and its first table was found;
 * otherwise it is {@code 0} with a message describing the failure. Unexpected
 * markup (rows without data cells, cells without a link, a missing or
 * malformed pager) is skipped instead of raising an unchecked exception.
 *
 * @param type category path segment appended to the library base URL
 * @param page page number; values greater than 0 select {@code index_<page>.htm},
 *             anything else selects {@code index.htm}
 * @return the string built by {@code ResultUtil.getResult(no, msg, list, totalPage, page)}
 */
public String getFaGuiKuTitles(String type, int page) {
    String href = "http://info.qd-n-tax.gov.cn/NewFaGuiKu/" + type + "/";
    String baseUrl = href + "index";
    if (page > 0) {
        baseUrl = baseUrl + "_" + page;
    }
    baseUrl += ".htm";

    int no = 0;
    String msg = "";
    int totalPage = 0;
    List<FaGui> list = new ArrayList<FaGui>();

    try {
        URL url = new URL(baseUrl);
        org.jsoup.nodes.Document doc = Jsoup.parse(url, 10000);

        org.jsoup.select.Elements tables = doc.select("table");
        if (tables.isEmpty()) {
            // Without a table there is nothing to list; report it as a failure
            // instead of letting get(0) throw IndexOutOfBoundsException.
            msg = "获取税收法规库标题内容:页面中未找到数据表格,baseUrl:" + baseUrl;
            log.error(msg, "getFaGuiKuTitles");
            return ResultUtil.getResult(no, msg, list, totalPage, page);
        }

        org.jsoup.nodes.Element table = tables.get(0);
        org.jsoup.select.Elements bodies = table.select("tbody");
        // Fall back to the table itself if no tbody element is present.
        org.jsoup.nodes.Element rowContainer = bodies.isEmpty() ? table : bodies.get(0);

        for (org.jsoup.nodes.Element row : rowContainer.select("tr")) {
            org.jsoup.select.Elements cols = row.select("td");
            if (cols.size() < 2) {
                // Header rows (th only) or spacer rows carry no title/date pair.
                continue;
            }

            FaGui fg = new FaGui();
            fg.setTitle(cols.get(0).text());
            fg.setDate(cols.get(1).text());
            if (cols.size() > 2) {
                fg.setFwzh(cols.get(2).text());
            }

            org.jsoup.select.Elements anchors = cols.get(0).select("a");
            if (!anchors.isEmpty()) {
                fg.setHref(resolveFaGuiHref(anchors.get(0).attr("href"), href));
            }

            list.add(fg);
        }

        // Paging information: the total page count sits between "(" and the following ",".
        org.jsoup.select.Elements pagers = doc.getElementsByClass("pager");
        if (!pagers.isEmpty()) {
            totalPage = parseFaGuiTotalPage(pagers.get(0).html());
        }

        no = 1;
        msg = "SUCCESS";

        log.info("获取税收法规库标题内容", "getFaGuiKuTitles");
    } catch (MalformedURLException ex) {
        Logger.getLogger(LocalServiceImpl.class.getName()).log(Level.SEVERE, null, ex);
        msg = "获取税收法规库标题内容:baseUrl"+baseUrl+"不可用,ex:"+ex;
        log.error(msg, "getFaGuiKuTitles");
    } catch (IOException ex) {
        Logger.getLogger(LocalServiceImpl.class.getName()).log(Level.SEVERE, null, ex);
        msg = "获取税收法规库标题内容:IO异常,ex:"+ex;
        log.error(msg, "getFaGuiKuTitles");
    }

    return ResultUtil.getResult(no, msg, list, totalPage, page);
}

/**
 * Turns a link that starts with the literal prefix {@code "./"} into an
 * absolute one by replacing that prefix with {@code baseHref}. Any other
 * link is returned unchanged.
 *
 * <p>Plain string operations are used on purpose: {@code String.replaceFirst}
 * would treat {@code "./"} as a regular expression (any character followed by
 * a slash) and would interpret {@code $} and {@code \} in the replacement.
 */
private static String resolveFaGuiHref(String rawHref, String baseHref) {
    if (rawHref != null && rawHref.startsWith("./")) {
        return baseHref + rawHref.substring(2);
    }
    return rawHref;
}

/**
 * Extracts the number found between the first {@code "("} and the next
 * {@code ","} in the pager markup.
 *
 * @return the parsed number, or {@code 0} when the delimiters are missing,
 *         the text between them is not purely digits, or it does not fit an int
 */
private static int parseFaGuiTotalPage(String pagerHtml) {
    if (pagerHtml == null) {
        return 0;
    }
    int open = pagerHtml.indexOf('(');
    if (open < 0) {
        return 0;
    }
    int start = open + 1;
    int end = pagerHtml.indexOf(',', start);
    if (end < 0) {
        return 0;
    }
    String digits = pagerHtml.substring(start, end).trim();
    if (!digits.matches("\\d+")) {
        return 0;
    }
    try {
        return Integer.parseInt(digits);
    } catch (NumberFormatException ignored) {
        // All digits but too large for an int; treat as "unknown".
        return 0;
    }
}

 

你可能感兴趣的:(JSoup)