java获取页面html,使用jsoup清除标签里面的标签的方法 例如<p><strong>heihei</strong></p><p><strong>哈哈哈哈哈</strong></p>

做一个党史今天的功能
java获取页面html,使用jsoup清除标签里面的标签的方法 例如<p><strong>heihei</strong></p><p><strong>哈哈哈哈哈</strong></p>_第1张图片

1.我是通过以下方法获取到html

 /**
  * Sends an HTTPS request and returns the response body as a single string.
  *
  * <p>Written to fetch the "Party History Today" page at
  * https://topics.gmw.cn/node_137686.htm.
  *
  * <p>The connection uses a socket factory built from {@code X509TrustManager1}.
  * NOTE(review): that class is not visible here; if it accepts every certificate,
  * server identity is not verified and this must not be used for sensitive
  * traffic - confirm.
  *
  * <p>The response is decoded as UTF-8 and read line by line. Line terminators
  * are not kept, so the result is the response lines concatenated together.
  *
  * @param requestUrl    the URL to request
  * @param requestMethod the HTTP method (GET/POST)
  * @param outputStr     request body, sent UTF-8 encoded; {@code null} to send no body
  * @return the response text; if a failure occurs, the stack trace is printed and
  *         whatever was read up to that point (possibly an empty string) is returned
  */
 public static String httpsRequest1(String requestUrl, String requestMethod, String outputStr) {
     // Purely local accumulator, so the unsynchronized builder is sufficient.
     StringBuilder buffer = new StringBuilder();
     HttpsURLConnection httpUrlConn = null;
     try {
         // Create the SSLContext and initialise it with our own trust manager.
         TrustManager[] tm = {new X509TrustManager1()};
         SSLContext sslContext = SSLContext.getInstance("SSL", "SunJSSE");
         sslContext.init(null, tm, new java.security.SecureRandom());
         // Obtain the socket factory from that context.
         SSLSocketFactory ssf = sslContext.getSocketFactory();

         URL url = new URL(requestUrl);
         httpUrlConn = (HttpsURLConnection) url.openConnection();
         httpUrlConn.setSSLSocketFactory(ssf);

         httpUrlConn.setDoOutput(true);
         httpUrlConn.setDoInput(true);
         httpUrlConn.setUseCaches(false);
         // Set the request method (GET/POST).
         httpUrlConn.setRequestMethod(requestMethod);

         if ("GET".equalsIgnoreCase(requestMethod)) {
             httpUrlConn.connect();
         }

         // Only write a body when there is data to submit.
         if (null != outputStr) {
             // try-with-resources closes the stream even if write() throws.
             try (OutputStream outputStream = httpUrlConn.getOutputStream()) {
                 // Encode explicitly as UTF-8 so non-ASCII text is not garbled.
                 outputStream.write(outputStr.getBytes(java.nio.charset.StandardCharsets.UTF_8));
             }
         }

         // Convert the response stream to a string. Closing the reader also
         // closes the wrapped InputStreamReader and the connection's input stream.
         try (BufferedReader bufferedReader = new BufferedReader(
                 new InputStreamReader(httpUrlConn.getInputStream(),
                         java.nio.charset.StandardCharsets.UTF_8))) {
             String line;
             while ((line = bufferedReader.readLine()) != null) {
                 buffer.append(line);
             }
         }
     } catch (Exception e) {
         // Best-effort contract: report the failure and fall through to return
         // whatever has been collected so far.
         e.printStackTrace();
     } finally {
         // Release the connection on both the success and the failure path.
         if (httpUrlConn != null) {
             httpUrlConn.disconnect();
         }
     }
     return buffer.toString();
 }

2.使用jsoup 解析页面代码

 /**
  * Demo entry point: downloads the page, removes every {@code <strong>} element
  * nested inside the content paragraphs, then prints the remaining paragraph
  * texts as a list of {@code HistoryToday} entries.
  *
  * @param args command-line arguments (unused)
  */
 public static void main(String[] args) {
     String url = "https://topics.gmw.cn/node_137686.htm";
     String result = UrlUtil.httpsRequest1(url, "GET", null);
     Document doc = Jsoup.parse(result); // the whole HTML document

     // Detach the <strong> elements from the tree so their text no longer
     // shows up in the enclosing paragraph's text().
     doc.select(".m_r_main .m_con p strong").remove();

     // Re-select the paragraphs, now without the removed children.
     Elements paragraphs = doc.select(".m_r_main .m_con p ");

     List<HistoryToday> historyTodays = new ArrayList<>();
     for (Element paragraph : paragraphs) {
         String text = paragraph.text();
         // Skip paragraphs that have no text left.
         if (!StringUtil.isEmpty(text)) {
             HistoryToday today = new HistoryToday();
             today.setLink(url);
             today.setTitle(text.trim());
             historyTodays.add(today);
         }
     }
     System.out.println(historyTodays.toString());
 }

你可能感兴趣的:(java)