Jsoup抓取图片

准备

选取适合的网站,使用Jsoup获取网页DOM元素。
目标网站:http://www.win4000.com/zt/meinv.html
抓取美女图
使用SpringBoot新建一个工程。添加依赖

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
   

org.springframework.boot
spring-boot-starter-web



org.projectlombok
lombok
true


org.springframework.boot
spring-boot-starter-test
test


org.jsoup
jsoup
1.11.3


org.apache.httpcomponents
httpclient
4.5.7



打开网页

检查元素查看网页的结构,查找图片所在的DOM元素节点。

a 标签的所在的DOM节点为

.Left_bar .tab_tj .tab_box ul li a

使用Jsoup解析网页:

  • 第一步:获取当前页所有图集的href值
  • 第二步:获取图集中的img src属性值
  • 第三步:使用工具类下载图片

代码如下:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
import com.felix.project.util.FileUtils;
import com.felix.project.util.HttpClientUtils;
import lombok.extern.slf4j.Slf4j;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import org.springframework.stereotype.Component;

import java.io.IOException;

@Slf4j
@Component
public class CrawlPage {

    /** 图集列表页地址(第 1 页)。 */
    private static final String HTTP_URL = "http://www.win4000.com/zt/meinv_1.html";
    /** 图片保存根目录。 */
    private static final String FILE_PATH = "E:/jsoup/images/";

    /**
     * Crawls every gallery linked from the list page and downloads each image
     * via {@link FileUtils#downloadImage}.
     *
     * @throws IOException if fetching the list page or a gallery page fails
     */
    public static void crawlImg() throws IOException {
        // 统计当前页抓取的图片
        int count = 0;
        log.info("开始抓取 --> 目标地址:{}", HTTP_URL);
        String html = HttpClientUtils.getHtml(HTTP_URL, "utf-8");
        // 解析网页
        Document doc = Jsoup.parse(html);
        // 获取 a 标签的所有节点
        Elements aImgElement = doc.select(".Left_bar .tab_tj .tab_box ul li a");
        // 第一步:循环遍历出 a 标签中的 href 属性值
        // (counter instead of indexOf(a) — indexOf inside the loop is O(n^2))
        int index = 0;
        for (Element a : aImgElement) {
            index++;
            // href looks like http://www.win4000.com/wallpaper_detail_157015.html
            String href = a.attr("href");
            if (href.isEmpty()) {
                log.warn("第{}个图集缺少href,跳过", index);
                continue;
            }
            // 拆分href获得 http://www.win4000.com/wallpaper_detail_157015
            // split() takes a regex, so the dot must be escaped
            // (unescaped "." matches any character followed by "html")
            String url = href.split("\\.html")[0];
            // 下一张图片的地址结构: http://www.win4000.com/wallpaper_detail_157015_2.html
            log.info("获取第{}个图集", index);
            // 获取该图集共有多少页 — page count is scraped text, so parsing may fail;
            // skip this gallery instead of aborting the whole crawl
            String pageText = Jsoup.parse(HttpClientUtils.getHtml(href, "utf-8"))
                    .select(".Bigimg .ptitle em").text().trim();
            int page;
            try {
                page = Integer.parseInt(pageText);
            } catch (NumberFormatException e) {
                log.warn("第{}个图集页数解析失败({}),跳过", index, pageText);
                continue;
            }
            int total = 0;
            // 第二步:获取图集中每一页的 img src
            for (int i = 1; i <= page; i++) {
                // 根据网页地址规律,第 i 张图片的页面为 <url>_<i>.html
                String imgHtml = HttpClientUtils.getHtml(url + "_" + i + ".html", "utf-8");
                // 解析图片页网页
                Document imgDoc = Jsoup.parse(imgHtml);
                Elements select = imgDoc.select(".pic_main .col-main .main-wrap .pic-meinv a");
                // 获取img src属性值,如 http://pic1.win4000.com/wallpaper/2019-03-29/5c9d7bba69328.jpg
                String src = select.select("img").attr("src");
                // 获取img title属性值(用作子目录名)
                String title = select.select("img").attr("title");
                // 第三步:下载图片工具
                FileUtils.downloadImage(src, FILE_PATH + title, System.currentTimeMillis() + ".jpg");
                log.info("下载第--{}--{}--个图集的第{}张图片", index, title, i);
                count++;
                total++;
            }
            log.info("第-{}-个图集共有{}张图片", index, total);
        }
        log.info("本次获取{}张图片", count);
    }
}

工具类

FileUtils 图片下载

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
import java.io.*;
import java.net.MalformedURLException;
import java.net.URL;
import java.net.URLConnection;

public class FileUtils {

    /**
     * Downloads the resource at {@code url} into {@code filePath/fileName},
     * creating the target directory if it does not exist.
     *
     * <p>Failures are printed and swallowed (best-effort, matching the
     * original contract so one bad image does not abort a crawl).
     *
     * @param url      图片地址 — any URL the JDK can open (http:, file:, ...)
     * @param filePath 保存路径 — target directory, created if absent
     * @param fileName 保存图片名称
     */
    public static void downloadImage(String url, String filePath, String fileName) {
        try {
            URLConnection connection = new URL(url).openConnection();
            connection.setReadTimeout(60000);
            connection.setRequestProperty("User-Agent", "Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:50.0) Gecko/20100101 Firefox/50.0");

            File dir = new File(filePath);
            if (!dir.exists()) {
                dir.mkdirs();
            }

            File file = new File(filePath + File.separator + fileName);

            // try-with-resources closes both streams even when read/write
            // throws mid-copy (the original leaked them on any IOException)
            try (InputStream inputStream = connection.getInputStream();
                 BufferedOutputStream out = new BufferedOutputStream(new FileOutputStream(file))) {
                // 构建缓冲区
                byte[] buf = new byte[1024];
                int size;
                // 写入到文件
                while (-1 != (size = inputStream.read(buf))) {
                    out.write(buf, 0, size);
                }
            }
        } catch (IOException e) {
            // MalformedURLException and FileNotFoundException are IOException
            // subtypes; the three original catch arms did exactly this
            e.printStackTrace();
        }
    }
}

HttpClientUtils 获取网页

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
import org.apache.http.HttpEntity;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.util.EntityUtils;

import java.io.IOException;

public class HttpClientUtils {

    /**
     * Fetches {@code url} with an HTTP GET and returns the response body
     * decoded using {@code charset}.
     *
     * @param url     page address to fetch
     * @param charset charset used to decode the entity (e.g. "utf-8")
     * @return the response body as a string
     * @throws IOException on connection or read failure
     */
    public static String getHtml(String url, String charset) throws IOException {
        HttpGet httpGet = new HttpGet(url);
        // some sites reject requests without a browser-like User-Agent
        httpGet.setHeader("User-Agent", "Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:50.0) Gecko/20100101 Firefox/50.0");

        // try-with-resources: client and response are closed even when
        // execute()/toString() throws (the original leaked both on failure)
        try (CloseableHttpClient client = HttpClients.createDefault();
             CloseableHttpResponse response = client.execute(httpGet)) {
            // 返回实体
            HttpEntity entity = response.getEntity();
            return EntityUtils.toString(entity, charset);
        }
    }
}

参考

https://www.open-open.com/jsoup/example-list-links.htm

你可能感兴趣的:(Jsoup抓取图片)