<dependency>
<!-- jsoup HTML parser library @ https://jsoup.org/ -->
<groupId>org.jsoup</groupId>
<artifactId>jsoup</artifactId>
<version>1.11.3</version>
</dependency>
这是网上的一个题库网站,现在需要抓取其中的题目、答案、解析等数据。由于该网站需要登录,所以只能先把网页另存为HTML文本,再进行解析,例如另存为html.txt。
<div class="question-bank">
<dl>
<dt>在一个软件项目的测试阶段,一些功能的执行与客户的需求文件中规定的不同,客户建议在升级版本中纠正,项目经理同意了。在项目开发阶段,项目经理应更加关注哪个领域?</dt>
<dd></dd>
<dd>
<ul id="questionUl1244285044579631104" lang="1244285044579631104#1">
<li lang="A">
<i class="option"></i>
<span>A.版本控制</span></li>
<li lang="B">
<i class="option"></i>
<span>B.客户满意度</span></li>
<li lang="C">
<i class="option type-green">
<i class="am-icon-check"></i>
</i>
<span>C.质量控制</span></li>
<li lang="D">
<i class="option"></i>
<span>D.质量测量指标</span></li>
</ul>
</dd>
</dl>
</div>
<div class="answer answer1244285044579631104">
<div class="left">
<i class="questionStatus1244285044579631104 am-icon-times-circle"></i>
<span>我的答案:
<i class="Danger userAnswer1244285044579631104">未作答</i></span>
<span>参考答案:
<i class="Success answerOption1244285044579631104">C</i></span>
</div>
<div class="right">
<ul>
<li onclick="dialogNote('1244285044579631104')">
<i id="questionNote1244285044579631104" class=" am-icon-pencil-square-o"></i>
<a href="javascript:void(0);">笔记</a></li>
<li>
<i class="am-icon-comments-o"></i>
<a href="https://st.eyescredit.com/comment/list?type=question&otherId=1244285044579631104" target="_blank">评论(0)</a></li>
<li onclick="dialogError('1244285044579631104')">
<i class="am-icon-calendar-times-o"></i>
<a href="javascript:void(0);">纠错</a></li>
</ul>
</div>
</div>
<div class="intro intro1244285044579631104 undis">
<ul>
<li>
<i class="am-icon-bar-chart"></i>
<div class="txt">
<p style="margin-top: 0px;">本题全员共作答
<span>124</span>次,正确率
<span>54.0%</span>:本题我共作答
<span>0</span>次,正确
<span>0</span>次,错误
<span>0</span>次</p></div>
</li>
<li>
<i class="am-icon-file-text-o"></i>
<div class="txt">
<p style="color:#D0021B">
<b>[考点定位]</b>
</p>
<p>/</p>
<p style="color:#D0021B">
<b>[题眼解析]</b>
</p>
<p>题解:只有质量控制可以出现在项目执行阶段。</p>
<p style="color:#D0021B">
<b>[知识拓展]</b>
</p>
<p>无</p>
</div>
</li>
</ul>
</div>
创建一个题目类
/**
 * One multiple-choice question: its stem, the four options A-D, the reference
 * answer and the analysis text.
 *
 * <p>Plain mutable bean. Every property is {@code null} until its setter is
 * called; no validation is performed on the values.
 */
class Subject {

    /** Question stem. */
    private String title;

    /** @return the question stem, or {@code null} if not set */
    public String getTitle() {
        return title;
    }

    /** @param title the question stem */
    public void setTitle(String title) {
        this.title = title;
    }

    /** Text of option A. */
    private String optionA;

    /** @return the text of option A, or {@code null} if not set */
    public String getOptionA() {
        return optionA;
    }

    /** @param optionA the text of option A */
    public void setOptionA(String optionA) {
        this.optionA = optionA;
    }

    /** Text of option B. */
    private String optionB;

    /** @return the text of option B, or {@code null} if not set */
    public String getOptionB() {
        return optionB;
    }

    /** @param optionB the text of option B */
    public void setOptionB(String optionB) {
        this.optionB = optionB;
    }

    /** Text of option C. */
    private String optionC;

    /** @return the text of option C, or {@code null} if not set */
    public String getOptionC() {
        return optionC;
    }

    /** @param optionC the text of option C */
    public void setOptionC(String optionC) {
        this.optionC = optionC;
    }

    /** Text of option D. */
    private String optionD;

    /** @return the text of option D, or {@code null} if not set */
    public String getOptionD() {
        return optionD;
    }

    /** @param optionD the text of option D */
    public void setOptionD(String optionD) {
        this.optionD = optionD;
    }

    /** Reference answer. */
    private String answer;

    /** @return the reference answer, or {@code null} if not set */
    public String getAnswer() {
        return answer;
    }

    /** @param answer the reference answer */
    public void setAnswer(String answer) {
        this.answer = answer;
    }

    /** Analysis / explanation text. */
    private String analysis;

    /** @return the analysis text, or {@code null} if not set */
    public String getAnalysis() {
        return analysis;
    }

    /** @param analysis the analysis text */
    public void setAnalysis(String analysis) {
        this.analysis = analysis;
    }
}
核心代码
/**
 * Parses a locally saved question-bank page with jsoup, prints every question
 * (stem, options, reference answer, analysis) and collects the results as
 * {@link Subject} objects.
 *
 * <p>Questions, answers and analysis blocks are selected independently and
 * matched by position, so a warning is printed to {@code System.err} when their
 * counts differ. A question whose answer or analysis is missing is still
 * emitted, with the missing parts left {@code null}.
 *
 * @param args optional; {@code args[0]} overrides the default path of the saved
 *             HTML file ({@code C:/Users/Desktop/html.txt})
 */
public static void main(String[] args) {
    // Collected questions; store them in a database or post-process as needed.
    List<Subject> subjects = new ArrayList<Subject>();
    // File to read: first command-line argument if given, otherwise the original default.
    String path = (args != null && args.length > 0) ? args[0] : "C:/Users/Desktop/html.txt";
    File input = new File(path);
    try {
        // Parse the whole file into a DOM.
        Document doc = Jsoup.parse(input, "UTF-8", "http://example.com/");
        // Every element with class "question-bank" (one per question).
        Elements divs = doc.select(".question-bank");
        // Every element with class "Success" (holds the reference answer letter).
        Elements answers = doc.select(".Success");
        // For each div.intro: its child ul, then the second li, then that li's child div.
        Elements analysis = doc.select("div.intro > ul > li:eq(1) > div");

        if (answers.size() != divs.size() || analysis.size() != divs.size()) {
            System.err.println("Warning: found " + divs.size() + " questions, "
                    + answers.size() + " answers and " + analysis.size()
                    + " analysis blocks; positional matching may be misaligned.");
        }

        for (int i = 0; i < divs.size(); i++) {
            Element div = divs.get(i);
            Subject subject = new Subject();
            // Expects the layout div.question-bank > dl > dt, as in the sample page.
            subject.setTitle((i + 1) + "." + div.child(0).child(0).text());

            // Options are only taken when exactly four <span> elements are present.
            Elements spans = div.getElementsByTag("span");
            if (spans.size() == 4) {
                subject.setOptionA(spans.get(0).text());
                subject.setOptionB(spans.get(1).text());
                subject.setOptionC(spans.get(2).text());
                subject.setOptionD(spans.get(3).text());
            }
            System.out.println(subject.getTitle());
            System.out.println(subject.getOptionA());
            System.out.println(subject.getOptionB());
            System.out.println(subject.getOptionC());
            System.out.println(subject.getOptionD());

            // Guarded lookup: a missing answer stays null instead of aborting the run.
            if (i < answers.size()) {
                subject.setAnswer(answers.get(i).text());
            }
            System.out.println("答案:" + subject.getAnswer());
            System.out.println("\n");

            System.out.println("第" + (i + 1) + "题解析");
            if (i < analysis.size()) {
                Elements ps = analysis.get(i).getElementsByTag("p");
                // In the sample page, paragraph 1 is the body under the "exam point"
                // heading and paragraph 3 is the body under the "key analysis" heading.
                String firstSection = usableParagraphText(ps, 1);
                String secondSection = usableParagraphText(ps, 3);
                if (firstSection != null) {
                    System.out.println(firstSection);
                }
                if (secondSection != null) {
                    System.out.println(secondSection);
                }
                // Keep both sections; previously the second silently replaced the first.
                subject.setAnalysis(joinNonNull(firstSection, secondSection));
            }
            System.out.println("\n");
            subjects.add(subject);
        }
    } catch (IOException e) {
        System.err.println("Failed to read or parse " + input.getPath());
        e.printStackTrace();
    }
}

/**
 * Returns the text of the paragraph at {@code index}, or {@code null} when the
 * index is out of range, the text is blank, or it is the {@code "/"} placeholder.
 */
private static String usableParagraphText(Elements paragraphs, int index) {
    if (index >= paragraphs.size()) {
        return null;
    }
    String text = paragraphs.get(index).text();
    if (text.trim().length() == 0 || text.equals("/")) {
        return null;
    }
    return text;
}

/**
 * Joins two optional strings with a newline; returns the non-null one when only
 * one is present, and {@code null} when both are {@code null}.
 */
private static String joinNonNull(String first, String second) {
    if (first == null) {
        return second;
    }
    if (second == null) {
        return first;
    }
    return first + "\n" + second;
}
其实jsoup还可以解析XML,不过更多还是用于HTML解析,在网络爬虫中用得比较多,抓取数据很方便。