Jsoup Select
本文主要讲解Jsoup
包中的 select
方法的使用。
getEntity
方法//get html entity from specific url
/**
 * Issues an HTTP GET request for {@code url} and returns the response entity.
 *
 * <p>The client and the response are deliberately not closed here.
 * NOTE(review): per the HttpClient documentation the entity content is streamed
 * from the underlying connection, so closing them in this method would presumably
 * make the returned entity unreadable — confirm before adding try-with-resources.
 *
 * @param url the address to fetch
 * @return the entity of the response, as reported by {@code response.getEntity()}
 * @throws IllegalStateException if executing the request fails with an I/O error;
 *         the original {@link IOException} is attached as the cause
 */
public HttpEntity getEntity(String url) {
    // HttpClients.createDefault() hands back a CloseableHttpClient (an abstract class).
    CloseableHttpClient httpClient = HttpClients.createDefault();
    HttpGet httpGet = new HttpGet(url);
    // Send a desktop Firefox User-Agent instead of the library default.
    httpGet.setHeader("User-Agent", "Mozilla/5.0 (Windows NT 10.0; WOW64; rv:63.0) Gecko/20100101 Firefox/63.0");
    try {
        CloseableHttpResponse response = httpClient.execute(httpGet);
        return response.getEntity();
    } catch (IOException e) {
        // Previously the error was only printed and the method went on to dereference
        // a null response; fail with the real cause instead.
        throw new IllegalStateException("Failed to execute GET request for " + url, e);
    }
}
getImageUrl()
方法 //get picture url from specific topic
/**
 * Downloads the page at {@code topicUrl}, parses it with Jsoup and prints every
 * {@code <img>} element that carries a {@code src} attribute.
 *
 * <p>This version only prints the matched elements; {@code pictureUrl} is never
 * assigned, so the method always returns the empty string.
 *
 * @param topicUrl address of the page to scan for images
 * @return always {@code ""} in this version
 */
public String getImageUrl(String topicUrl) {
    String pictureUrl = "";
    // Pass the method's own parameter; the original referenced an undeclared name `url`.
    HttpEntity entity = getEntity(topicUrl);
    try {
        String content = EntityUtils.toString(entity);
        Document document = Jsoup.parse(content);
        // Alternative lookup by class name:
        // Elements ele1 = document.getElementsByClass("RichText ztext CopyrightRichText-richText");
        // select() takes a CSS-style query: '#' addresses an id, '.' addresses a class.
        // "img[src]" matches every img element that has a src attribute.
        Elements imgElements = document.select("img[src]");
        for (Element img : imgElements) {
            System.out.println("内容是:" + img.toString());
        }
    } catch (IOException e) {
        e.printStackTrace();
    }
    return pictureUrl;
}
执行结果如下:
img/src:<img src="data:image/svg+xml;utf8," data-caption="" data-size="normal" data-rawwidth="1080" data-rawheight="612" class="origin_image zh-lightbox-thumb lazy" width="1080" data-original="https://pic3.zhimg.com/v2-5beda13596dff566781f37abdceeaac6_r.jpg" data-actualsrc="https://pic3.zhimg.com/v2-5beda13596dff566781f37abdceeaac6_b.jpg">
img/src:<img src="https://pic1.zhimg.com/v2-d240f484163a6a278847111b7846431c_b.jpg" data-caption="" data-size="normal" data-rawwidth="684" data-rawheight="201" class="origin_image zh-lightbox-thumb" width="684" data-original="https://pic1.zhimg.com/v2-d240f484163a6a278847111b7846431c_r.jpg">
img/src:<img src="data:image/svg+xml;utf8," data-caption="" data-size="normal" data-rawwidth="684" data-rawheight="201" class="origin_image zh-lightbox-thumb lazy" width="684" data-original="https://pic1.zhimg.com/v2-d240f484163a6a278847111b7846431c_r.jpg" data-actualsrc="https://pic1.zhimg.com/v2-d240f484163a6a278847111b7846431c_b.jpg">
img/src:<img src="https://pic1.zhimg.com/v2-e0af6a02779c61c28dfc2a1fbb635364_b.jpg" data-caption="" data-size="normal" data-rawwidth="983" data-rawheight="188" class="origin_image zh-lightbox-thumb" width="983" data-original="https://pic1.zhimg.com/v2-e0af6a02779c61c28dfc2a1fbb635364_r.jpg">
···
img/src:<img src="data:image/svg+xml;utf8," data-caption="" data-size="normal" data-rawwidth="677" data-rawheight="77" class="origin_image zh-lightbox-thumb lazy" width="677" data-original="https://pic4.zhimg.com/v2-0c1860021e6ec5b83f962da7f82e7e6f_r.jpg" data-actualsrc="https://pic4.zhimg.com/v2-0c1860021e6ec5b83f962da7f82e7e6f_b.jpg">
img/src:<img src="https://pic1.zhimg.com/v2-07a20eb33a0da30e818ed751f63e73ac_b.jpg" data-caption="" data-size="normal" data-rawwidth="642" data-rawheight="129" class="origin_image zh-lightbox-thumb" width="642" data-original="https://pic1.zhimg.com/v2-07a20eb33a0da30e818ed751f63e73ac_r.jpg">
img/src:<img src="data:image/svg+xml;utf8," data-caption="" data-size="normal" data-rawwidth="642" data-rawheight="129" class="origin_image zh-lightbox-thumb lazy" width="642" data-original="https://pic1.zhimg.com/v2-07a20eb33a0da30e818ed751f63e73ac_r.jpg" data-actualsrc="https://pic1.zhimg.com/v2-07a20eb33a0da30e818ed751f63e73ac_b.jpg">
img/src:<img src="https://pic2.zhimg.com/v2-c5a38208c31d8643360a966a40c46745_b.jpg" data-caption="" data-size="normal" data-rawwidth="550" data-rawheight="106" class="origin_image zh-lightbox-thumb" width="550" data-original="https://pic2.zhimg.com/v2-c5a38208c31d8643360a966a40c46745_r.jpg">
···
img/src:<img src="https://pic1.zhimg.com/v2-f7ff27a49b9634353aceb8a092653660_b.jpg" data-caption="" data-size="normal" data-rawwidth="378" data-rawheight="548" class="content_image" width="378">
img/src:<img src="data:image/svg+xml;utf8," data-caption="" data-size="normal" data-rawwidth="378" data-rawheight="548" class="content_image lazy" width="378" data-actualsrc="https://pic1.zhimg.com/v2-f7ff27a49b9634353aceb8a092653660_b.jpg">
img/src:<img src="https://pic3.zhimg.com/v2-54de658c2f0ac298d1afd8818f470bca_b.jpg" data-caption="" data-size="normal" data-rawwidth="382" data-rawheight="524" class="content_image" width="382">
img/src:<img class="LinkCard-image LinkCard-image--square" alt="图标" src="https://pic3.zhimg.com/v2-4ee23300218a52f952eb7d3ae01bcbe2_ipico.jpg">
img/src:<img class="Avatar Avatar--medium Avatar--round" width="40" height="40" src="https://pic3.zhimg.com/v2-2cc46817da1818c3cc055e72d30c34a5_xs.jpg" srcSet="https://pic3.zhimg.com/v2-2cc46817da1818c3cc055e72d30c34a5_l.jpg 2x" alt="ins风格照片">
通过输出结果,我们看到我们输出的是一个标签里面的所有值,如何获取到 img
标签中的src
的值呢?毕竟 jpg
的网址才是我们最需要的(逃~)。修改代码如下:
//get picture url from specific topic
/**
 * Downloads the page at {@code topicUrl}, selects every {@code <img>} element whose
 * {@code src} attribute ends with {@code .jpg}, and prints each {@code src} value.
 *
 * <p>{@code pictureUrl} is overwritten on every iteration, so the returned value is
 * the {@code src} of the last matching element.
 *
 * @param topicUrl address of the page to scan for images
 * @return the {@code src} of the last matching image, or {@code ""} if nothing
 *         matched or reading the entity failed with an {@link IOException}
 */
public String getImageUrl(String topicUrl) {
    String pictureUrl = "";
    // 01. Fetch the entity for the page. Pass the method's own parameter; the
    //     original referenced an undeclared name `url`.
    HttpEntity entity = getEntity(topicUrl);
    try {
        // 02. Read the entity into a string.
        String content = EntityUtils.toString(entity);
        // 03. Parse the string into a Jsoup document.
        Document document = Jsoup.parse(content);
        // 04. Select the target elements: img nodes whose src ends with ".jpg".
        Elements imgElements = document.select("img[src$=.jpg]");
        // 05. Read the wanted attribute (src) from each matched element.
        for (Element img : imgElements) {
            pictureUrl = img.attr("src");
            System.out.println("src:" + pictureUrl);
            // getPicture(pictureUrl);
        }
    } catch (IOException e) {
        e.printStackTrace();
    }
    return pictureUrl;
}
执行结果如下:
src:https://pic3.zhimg.com/v2-2cc46817da1818c3cc055e72d30c34a5_is.jpg
src:https://pic3.zhimg.com/v2-5e94b6e804a69d28fa669854e3189626_1200x500.jpg
src:https://pic4.zhimg.com/v2-44b4e1c2d603feb84dd480705292624f_xs.jpg
src:https://pic3.zhimg.com/v2-5beda13596dff566781f37abdceeaac6_b.jpg
···
src:https://pic1.zhimg.com/v2-d240f484163a6a278847111b7846431c_b.jpg
src:https://pic1.zhimg.com/v2-e0af6a02779c61c28dfc2a1fbb635364_b.jpg