java解析html

目录

  • 场景描述
  • 一.引入依赖
  • 二.调用接口响应回来的html
  • 三.测试代码

场景描述

我调用外部接口,但是返回来的数据是html的格式,所以我就需要进行处理来获得我想要的数据。我使用的是jsoup

一.引入依赖

<dependency>
    <groupId>org.jsoup</groupId>
    <artifactId>jsoup</artifactId>
    <version>1.11.3</version>
</dependency>

二.调用接口响应回来的html

<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
<html xmlns="http://www.w3.org/1999/xhtml">
<head>
<meta http-equiv="Content-Type" content="text/html; charset=gb2312" />
<title>无标题文档</title>
<style type="text/css">
body, div, dl, dt, dd, ul, ol, li, h1, h2, h3, h4, h5, h6, pre, form, fieldset, input, textarea, p, blockquote, th, td {
	font-family: "微软雅黑"!important;
}
</style>
<script type="text/javascript" src="https://www.ikun.com.cn/statics/js/jquery-1.7.2.js"></script>
</head>
<body>
<link rel="stylesheet" href="https://www.ikun.com.cn/statics/css/list.css" />
<link rel="stylesheet" href="https://www.ikun.com.cn/statics/css/base.css" />

<div class="w870" style="background:#fff">
  <div class="suipin"> <a href="https://www.ikun.com.cn">首页</a>><b class="mowei"><a href="https://www.ikun.com.cn/index.php?m=content&c=index&a=lists&catid=194">智库</a> > <a href="https://www.ikun.com.cn/index.php?m=content&c=index&a=lists&catid=201">港口</a> > <a href="https://www.ikun.com.cn/index.php?m=content&c=index&a=lists&catid=226">ikun日记</a> >  列表</b> </div>
  
  <table width="100%" class="mtzktab" id="mtzktab">
    <thead>
      <tr class="biaoti">
        <th width="70%" align="left">ikun日记</th>
       
        <th width="30%">发布时间</th>
      </tr>
    </thead>
    <tbody>
    
                                             

    <tr>
      <td><div class="dian">.</div><a href="https://www.ikun.com.cn/index.php?m=content&c=index&a=show&catid=226&id=235904" target="_blank">ikun日记(第468期)</a></td>
     
      <td align="center">2023-08-21</td>
    </tr>
                                         

    <tr>
      <td><div class="dian">.</div><a href="https://www.ikun.com.cn/index.php?m=content&c=index&a=show&catid=226&id=235638" target="_blank">ikun日记(第467期)</a></td>
     
      <td align="center">2023-08-14</td>
    </tr>
                                         

    <tr>
      <td><div class="dian">.</div><a href="https://www.ikun.com.cn/index.php?m=content&c=index&a=show&catid=226&id=235402" target="_blank">ikun日记(第466期)</a></td>
     
      <td align="center">2023-08-07</td>
    </tr>
                                         

    <tr>
      <td><div class="dian">.</div><a href="https://www.ikun.com.cn/index.php?m=content&c=index&a=show&catid=226&id=235224" target="_blank">ikun日记(第465期)</a></td>
     
      <td align="center">2023-07-31</td>
    </tr>
                                         

    <tr>
      <td><div class="dian">.</div><a href="https://www.ikun.com.cn/index.php?m=content&c=index&a=show&catid=226&id=235047" target="_blank">ikun日记(第464期)</a></td>
     
      <td align="center">2023-07-24</td>
    </tr>
                                         

    <tr>
      <td><div class="dian">.</div><a href="https://www.ikun.com.cn/index.php?m=content&c=index&a=show&catid=226&id=235043" target="_blank">ikun日记(第463期)</a></td>
     
      <td align="center">2023-07-17</td>
    </tr>
                                         

    <tr>
      <td><div class="dian">.</div><a href="https://www.ikun.com.cn/index.php?m=content&c=index&a=show&catid=226&id=234716" target="_blank">ikun日记(第462期)</a></td>
     
      <td align="center">2023-07-10</td>
    </tr>
                                         

    <tr>
      <td><div class="dian">.</div><a href="https://www.ikun.com.cn/index.php?m=content&c=index&a=show&catid=226&id=234535" target="_blank">ikun日记(第461期)</a></td>
     
      <td align="center">2023-07-03</td>
    </tr>
                                         

    <tr>
      <td><div class="dian">.</div><a href="https://www.ikun.com.cn/index.php?m=content&c=index&a=show&catid=226&id=234531" target="_blank">ikun日记(第460期)</a></td>
     
      <td align="center">2023-06-19</td>
    </tr>
                                         

    <tr>
      <td><div class="dian">.</div><a href="https://www.ikun.com.cn/index.php?m=content&c=index&a=show&catid=226&id=234100" target="_blank">ikun日记(第459期)</a></td>
     
      <td align="center">2023-06-12</td>
    </tr>
                                         

    <tr>
      <td><div class="dian">.</div><a href="https://www.ikun.com.cn/index.php?m=content&c=index&a=show&catid=226&id=233842" target="_blank">ikun日记(第458期)</a></td>
     
      <td align="center">2023-06-05</td>
    </tr>
                                         

    <tr>
      <td><div class="dian">.</div><a href="https://www.ikun.com.cn/index.php?m=content&c=index&a=show&catid=226&id=233838" target="_blank">ikun日记(第457期)</a></td>
     
      <td align="center">2023-05-29</td>
    </tr>
                                         

    <tr>
      <td><div class="dian">.</div><a href="https://www.ikun.com.cn/index.php?m=content&c=index&a=show&catid=226&id=233551" target="_blank">ikun日记(第456期)</a></td>
     
      <td align="center">2023-05-22</td>
    </tr>
                                         

    <tr>
      <td><div class="dian">.</div><a href="https://www.ikun.com.cn/index.php?m=content&c=index&a=show&catid=226&id=233279" target="_blank">ikun日记(第455期)</a></td>
     
      <td align="center">2023-05-15</td>
    </tr>
                                         

    <tr>
      <td><div class="dian">.</div><a href="https://www.ikun.com.cn/index.php?m=content&c=index&a=show&catid=226&id=233087" target="_blank">ikun日记(第454期)</a></td>
     
      <td align="center">2023-04-29</td>
    </tr>
          </tbody>
    
  </table>
  <div id="pages"> <a class="a1">368条</a> <a href="index.php?m=content&c=index&a=lists&catid=226" class="a1">上一页</a> <span>1</span> <a href="index.php?m=content&c=index&a=lists&catid=226&page=2">2</a> <a href="index.php?m=content&c=index&a=lists&catid=226&page=3">3</a> <a href="index.php?m=content&c=index&a=lists&catid=226&page=4">4</a> <a href="index.php?m=content&c=index&a=lists&catid=226&page=5">5</a> <a href="index.php?m=content&c=index&a=lists&catid=226&page=6">6</a> <a href="index.php?m=content&c=index&a=lists&catid=226&page=7">7</a> <a href="index.php?m=content&c=index&a=lists&catid=226&page=8">8</a> <a href="index.php?m=content&c=index&a=lists&catid=226&page=9">9</a> <a href="index.php?m=content&c=index&a=lists&catid=226&page=10">10</a> ..<a href="index.php?m=content&c=index&a=lists&catid=226&page=25">25</a> <a href="index.php?m=content&c=index&a=lists&catid=226&page=2" class="a1">下一页</a> </div>
   
  <script type="text/javascript">
                $().ready(function () {
                    $('#mtzktab tbody tr:odd').css('background', '#eeeeee'); 
                });
  </script> 
</div>
</body>
</html>

java解析html_第1张图片

三.测试代码

package org.jeecg.modules.mt.controller;

import cn.hutool.http.HttpUtil;
import org.jsoup.Jsoup;
import org.springframework.util.StringUtils;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import org.xml.sax.SAXException;
import javax.xml.parsers.ParserConfigurationException;
import java.io.IOException;

public class Test {



    public static void main(String[] args) throws IOException, SAXException, ParserConfigurationException {

        String apiUrl = "https://www.ikun.com.cn/index.php?m=content&c=index&a=lists&catid=226&page=1";

        // 发送get请求
        String body = HttpUtil.createPost(apiUrl).execute().body();
        Document doc = Jsoup.parse(body);

        //获取标签下的
标签,并把
标签的值改为空 Elements headingsDiv = doc.select("tbody").select("div"); for (Element element : headingsDiv) { element.text(""); } //获取标签下的标签 Elements headings = doc.select("tbody").select("td"); //循环获取的标签 for (int i = 0; i < headings.size(); i++) { //获取标签内的值 String text = headings.get(i).text(); //因为包含两个不同的标签,所以需要取余 if (i%2 == 0){ //获取标签下的标签的 href 属性的值 String href = headings.get(i).select("a").attr("href"); System.out.println("文件下载地址:"+href); System.out.println("文件名称:"+text); }else { System.out.println("文件时间:"+text); } } } }

你可能感兴趣的:(解析html,jsoup,java,爬取,java爬取数据)