java jsoup解析HTML

maven

<dependency>
    <!-- jsoup HTML parser library @ https://jsoup.org/ -->
    <groupId>org.jsoup</groupId>
    <artifactId>jsoup</artifactId>
    <version>1.11.3</version>
</dependency>

html

这是网上的一个题库网站,现在需要抓取其中的题目、答案、解析等数据。由于该网站需要登录,所以只能先把网页另存为HTML文本,然后再进行解析,例如另存为html.txt。

<div class="question-bank">
  <dl>
    <dt>在一个软件项目的测试阶段,一些功能的执行与客户的需求文件中规定的不同,客户建议在升级版本中纠正,项目经理同意了。在项目开发阶段,项目经理应更加关注哪个领域?</dt>
    <dd></dd>
    <dd>
      <ul id="questionUl1244285044579631104" lang="1244285044579631104#1">
        <li lang="A">
          <i class="option"></i>
          <span>A.版本控制</span></li>
        <li lang="B">
          <i class="option"></i>
          <span>B.客户满意度</span></li>
        <li lang="C">
          <i class="option type-green">
            <i class="am-icon-check"></i>
          </i>
          <span>C.质量控制</span></li>
        <li lang="D">
          <i class="option"></i>
          <span>D.质量测量指标</span></li>
      </ul>
    </dd>
  </dl>
</div>
<div class="answer answer1244285044579631104">
  <div class="left">
    <i class="questionStatus1244285044579631104  am-icon-times-circle"></i>
    <span>我的答案: &nbsp;
      <i class="Danger userAnswer1244285044579631104">未作答</i></span>
    <span>参考答案: &nbsp;
      <i class="Success answerOption1244285044579631104">C</i></span>
  </div>
  <div class="right">
    <ul>
      <li onclick="dialogNote('1244285044579631104')">
        <i id="questionNote1244285044579631104" class=" am-icon-pencil-square-o"></i>
        <a href="javascript:void(0);">笔记</a></li>
      <li>
        <i class="am-icon-comments-o"></i>
        <a href="https://st.eyescredit.com/comment/list?type=question&otherId=1244285044579631104" target="_blank">评论(0)</a></li>
      <li onclick="dialogError('1244285044579631104')">
        <i class="am-icon-calendar-times-o"></i>
        <a href="javascript:void(0);">纠错</a></li>
    </ul>
  </div>
</div>
<div class="intro intro1244285044579631104 undis">
  <ul>
    <li>
      <i class="am-icon-bar-chart"></i>
      <div class="txt">
        <p style="margin-top: 0px;">本题全员共作答
          <span>124</span>次,正确率
          <span>54.0%</span>:本题我共作答
          <span>0</span>次,正确
          <span>0</span>次,错误
          <span>0</span></p></div>
    </li>
    <li>
      <i class="am-icon-file-text-o"></i>
      <div class="txt">
        <p style="color:#D0021B">
          <b>[考点定位]</b>
        </p>
        <p>/</p>
        <p style="color:#D0021B">
          <b>[题眼解析]</b>
        </p>
        <p>题解:只有质量控制可以出现在项目执行阶段。</p>
        <p style="color:#D0021B">
          <b>[知识拓展]</b>
        </p>
        <p></p>
      </div>
    </li>
  </ul>
</div>

java代码

创建一个题目类

/**
 * Mutable value holder for one multiple-choice question: the question text,
 * its four options (A-D), the reference answer and the explanatory analysis.
 * All properties start out as {@code null} until a setter is called.
 */
class Subject {

    /** Question text. */
    private String title;
    /** Text of option A. */
    private String optionA;
    /** Text of option B. */
    private String optionB;
    /** Text of option C. */
    private String optionC;
    /** Text of option D. */
    private String optionD;
    /** Reference answer. */
    private String answer;
    /** Explanation / analysis of the answer. */
    private String analysis;

    // ---- question text ----

    /** @return the question text, or {@code null} if not set */
    public String getTitle() {
        return this.title;
    }

    /** @param title the question text */
    public void setTitle(String title) {
        this.title = title;
    }

    // ---- options ----

    /** @return the text of option A, or {@code null} if not set */
    public String getOptionA() {
        return this.optionA;
    }

    /** @param optionA the text of option A */
    public void setOptionA(String optionA) {
        this.optionA = optionA;
    }

    /** @return the text of option B, or {@code null} if not set */
    public String getOptionB() {
        return this.optionB;
    }

    /** @param optionB the text of option B */
    public void setOptionB(String optionB) {
        this.optionB = optionB;
    }

    /** @return the text of option C, or {@code null} if not set */
    public String getOptionC() {
        return this.optionC;
    }

    /** @param optionC the text of option C */
    public void setOptionC(String optionC) {
        this.optionC = optionC;
    }

    /** @return the text of option D, or {@code null} if not set */
    public String getOptionD() {
        return this.optionD;
    }

    /** @param optionD the text of option D */
    public void setOptionD(String optionD) {
        this.optionD = optionD;
    }

    // ---- answer and analysis ----

    /** @return the reference answer, or {@code null} if not set */
    public String getAnswer() {
        return this.answer;
    }

    /** @param answer the reference answer */
    public void setAnswer(String answer) {
        this.answer = answer;
    }

    /** @return the analysis text, or {@code null} if not set */
    public String getAnalysis() {
        return this.analysis;
    }

    /** @param analysis the analysis text */
    public void setAnalysis(String analysis) {
        this.analysis = analysis;
    }
}

核心代码

/**
 * Parses a locally saved question-bank page with jsoup, prints every
 * question (title, options, reference answer, analysis) to standard output
 * and collects the results as {@code Subject} objects.
 *
 * <p>Questions, answers and analysis blocks are selected as three separate
 * lists and matched up by position. A question whose answer or analysis
 * node is missing is still emitted, just without that field, instead of
 * aborting the whole run with an {@code IndexOutOfBoundsException}.
 *
 * @param args command-line arguments (unused)
 */
public static void main(String[] args) {
    // Collected questions; store them in a database or process further as needed.
    List<Subject> subjects = new ArrayList<Subject>();
    // Page saved from the browser (the site requires a login, so it is not fetched directly).
    File input = new File("C:/Users/Desktop/html.txt");
    try {
        // Parse the whole file into a DOM.
        Document doc = Jsoup.parse(input, "UTF-8", "http://example.com/");
        // One element per question.
        Elements divs = doc.select(".question-bank");
        // Elements carrying the "Success" class hold the reference answers.
        Elements answers = doc.select(".Success");
        // Inside each div.intro: ul -> second li -> its div child holds the analysis paragraphs.
        Elements analysis = doc.select("div.intro > ul > li:eq(1) > div");

        for (int i = 0; i < divs.size(); i++) {
            Element div = divs.get(i);
            Subject subject = new Subject();

            // The question text lives in the <dt>; tolerate a block without one.
            Elements titles = div.select("dt");
            String titleText = titles.isEmpty() ? "" : titles.first().text();
            subject.setTitle((i + 1) + "." + titleText);

            // Only questions with exactly four option spans are mapped to A-D.
            Elements spans = div.getElementsByTag("span");
            if (spans.size() == 4) {
                subject.setOptionA(spans.get(0).text());
                subject.setOptionB(spans.get(1).text());
                subject.setOptionC(spans.get(2).text());
                subject.setOptionD(spans.get(3).text());
            }
            System.out.println(subject.getTitle());
            System.out.println(subject.getOptionA());
            System.out.println(subject.getOptionB());
            System.out.println(subject.getOptionC());
            System.out.println(subject.getOptionD());

            // Answers are matched to questions by position; skip if the page has fewer answers.
            if (i < answers.size()) {
                String answerText = answers.get(i).text();
                subject.setAnswer(answerText);
                System.out.println("答案:" + answerText);
            }

            System.out.println("\n");
            System.out.println("第" + (i + 1) + "题解析");
            if (i < analysis.size()) {
                Elements ps = analysis.get(i).getElementsByTag("p");
                // In the saved page the <p> elements alternate heading / content:
                // index 1 is the content under the first heading and index 3 the
                // content under the second. Both are kept (joined by a newline)
                // rather than letting the second overwrite the first.
                int[] contentIndexes = {1, 3};
                StringBuilder combined = new StringBuilder();
                for (int idx : contentIndexes) {
                    if (idx >= ps.size()) {
                        continue;
                    }
                    String text = ps.get(idx).text().trim();
                    // "/" is the page's placeholder for "no content".
                    if (text.length() > 0 && !text.equals("/")) {
                        System.out.println(text);
                        if (combined.length() > 0) {
                            combined.append('\n');
                        }
                        combined.append(text);
                    }
                }
                if (combined.length() > 0) {
                    subject.setAnalysis(combined.toString());
                }
            }
            System.out.println("\n");
            subjects.add(subject);
        }

    } catch (IOException e) {
        e.printStackTrace();
    }
}

打印出来的效果
java jsoup解析HTML_第1张图片

说明

其实jsoup还可以解析XML,不过更多还是用于HTML解析,在网络爬虫中用得比较多,抓取数据很方便。

你可能感兴趣的:(java,jsoup,爬虫,HTML解析)