网络爬虫是大数据时代收集数据的一种有效手段,合理合法地运用技术手段获取网络数据、实现数据的再利用,也是程序员的一项重要技能。本节我们使用 Java 环境下的 jsoup 实现网络数据的爬取,主要是图片数据的异步爬取,并实现网络图片的下载及图片的预览功能,预览功能使用具有丰富功能的 v-viewer 实现。
- 引入爬虫pom工具包
<dependency>
    <groupId>org.apache.httpcomponents</groupId>
    <artifactId>httpclient</artifactId>
    <version>4.5.6</version>
</dependency>
<dependency>
    <groupId>org.jsoup</groupId>
    <artifactId>jsoup</artifactId>
    <version>1.14.2</version>
</dependency>
- 爬虫案例后端控制层controller
package com.yundi.atp.platform.module.test.controller; import com.baomidou.mybatisplus.core.conditions.query.QueryWrapper; import com.baomidou.mybatisplus.extension.plugins.pagination.Page; import com.yundi.atp.platform.common.Result; import com.yundi.atp.platform.module.test.entity.SpiderData; import com.yundi.atp.platform.module.test.service.SpiderDataService; import io.swagger.annotations.Api; import org.apache.commons.lang3.StringUtils; import org.springframework.beans.factory.annotation.Autowired; import org.springframework.web.bind.annotation.*; import sun.misc.BASE64Encoder; import java.io.File; import java.io.FileInputStream; import java.io.IOException; import java.util.concurrent.CompletableFuture; import java.util.concurrent.ThreadPoolExecutor; /** *
* 前端控制器 *
* * @author yanp * @since 2021-09-10 */ @Api(tags = {"爬虫抓取案例API"}) @RestController @RequestMapping("/test/spiderData") public class SpiderDataController { @Autowired private SpiderDataService spiderDataService; @Autowired private ThreadPoolExecutor threadPoolExecutor; @GetMapping(value = "startSpiderData") public Result startSpiderData() { CompletableFuture.runAsync(() -> { spiderDataService.startSpiderData(); }, threadPoolExecutor); return Result.success(); } @PostMapping(value = "listPage") public Result listPage(@RequestBody SpiderData spiderData) { Page page = spiderDataService.page(spiderData.getPage(), new QueryWrapper().like(StringUtils.isNotBlank(spiderData.getSrcImageName()), "src_image_name", spiderData.getSrcImageName())); return Result.success(page); } @GetMapping(value = "download/{id}") public Result download(@PathVariable(value = "id") String id) { spiderDataService.download(id); return Result.success(); } @GetMapping(value = "batchDownload") public Result batchDownload() { spiderDataService.batchDownload(); return Result.success(); } @GetMapping(value = "preview/{id}") public Result preview(@PathVariable(value = "id") String id) throws IOException { SpiderData spiderData = spiderDataService.getById(id); File file = new File(spiderData.getStoreAddress()); FileInputStream fileInputStream = new FileInputStream(file); int size = fileInputStream.available(); byte[] bytes = new byte[size]; fileInputStream.read(bytes); fileInputStream.close(); BASE64Encoder encoder = new BASE64Encoder(); return Result.success(encoder.encode(bytes)); } }
- 爬虫案例后端业务层
package com.yundi.atp.platform.module.test.service.impl; import com.baomidou.mybatisplus.core.conditions.query.QueryWrapper; import com.baomidou.mybatisplus.extension.service.impl.ServiceImpl; import com.yundi.atp.platform.module.test.entity.SpiderData; import com.yundi.atp.platform.module.test.mapper.SpiderDataMapper; import com.yundi.atp.platform.module.test.service.SpiderDataService; import com.yundi.atp.platform.spider.MzituImageSpider; import lombok.extern.slf4j.Slf4j; import org.jsoup.Connection; import org.jsoup.Jsoup; import org.jsoup.nodes.Document; import org.jsoup.nodes.Element; import org.jsoup.select.Elements; import org.springframework.stereotype.Service; import java.io.IOException; import java.util.List; /** *
* 服务实现类 *
* * @author yanp * @since 2021-09-10 */ @Slf4j @Service public class SpiderDataServiceImpl extends ServiceImplimplements SpiderDataService { @Override public void startSpiderData() { try { //1.创建连接 String url = "https://www.mzitu.com"; Connection conn = Jsoup.connect(url).timeout(50000); conn.header("User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3100.0 Safari/537.36"); conn.header("referer", url); //2.获取栏目图片连接 Document document = conn.get(); Element element = document.getElementById("menu-nav"); Elements elements = element.getElementsByTag("a"); List linkList = elements.eachAttr("abs:href"); linkList.remove(0); linkList.forEach(it -> { Connection connect = Jsoup.connect(it).timeout(50000); connect.header("User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3100.0 Safari/537.36"); connect.header("referer", it); try { Document doc = connect.get(); //3.获取栏目图片最大分页页数 Elements docElements = doc.getElementsByClass("nav-links"); Element docElement = docElements.get(0); Elements tag = docElement.getElementsByTag("a"); List tagValues = tag.eachText(); tagValues.remove(tagValues.size() - 1); Integer maxPage = Integer.parseInt(tagValues.get(tagValues.size() - 1)); log.info("--------------------开始解析下载图片---------------------------"); String imageUrl = it.substring(0, it.length() - 1); String imageCategory = imageUrl.substring(imageUrl.lastIndexOf("/") + 1); //4.开始爬取图片 for (int i = 1; i <= maxPage; i++) { String imgUrl = it + "page/" + i + "/"; Connection imgConn = Jsoup.connect(imgUrl).timeout(50000); imgConn.header("User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3100.0 Safari/537.36"); imgConn.header("referer", imgUrl); try { Document imgDocument = imgConn.get(); Element pins = imgDocument.getElementById("pins"); Elements imgTag = pins.getElementsByTag("img"); for (Element ele : imgTag) { SpiderData 
spiderData = new SpiderData(); spiderData.setSrcLink(imgUrl); spiderData.setSrcImageName(ele.attr("alt")); spiderData.setSrcImageAddress(ele.attr("data-original")); spiderData.setSrcImageCategory(imageCategory); this.save(spiderData); } } catch (IOException ioException) { log.error("获取资源失败:" + ioException); } Thread.sleep(5000); } } catch (IOException | InterruptedException ioException) { log.error("获取资源失败:" + ioException); } }); } catch (IOException ioException) { log.error("获取资源失败:" + ioException); } } @Override public void download(String id) { SpiderData spiderData = this.getById(id); String path = MzituImageSpider.downloadImage(spiderData.getSrcImageAddress()); if (path != null) { spiderData.setStoreAddress(path); spiderData.setStatus(true); this.updateById(spiderData); } } @Override public void batchDownload() { List list = this.list(new QueryWrapper ().eq("status", 0)); for (SpiderData spiderData : list) { String path = MzituImageSpider.downloadImage(spiderData.getSrcImageAddress()); if (path != null) { spiderData.setStoreAddress(path); spiderData.setStatus(true); this.updateById(spiderData); } } } }
- 爬虫案例前端安装v-viewer预览组件
命令:npm i v-viewer -S(注意:`-S` 即 `--save`,小写 `-s` 是 `--silent`,不会写入 package.json)
- 爬虫案例前端main.js中引入v-viewer
import Vue from 'vue';
import ElementUI from 'element-ui';
import 'element-ui/lib/theme-chalk/index.css';
import router from '@/router';
import {http} from '@/axios/index';
import qs from 'qs';
import '@/util/derective'
import App from '@/App.vue';
import Print from 'vue-print-nb'
import VideoPlayer from 'vue-video-player'
require('video.js/dist/video-js.css')
require('vue-video-player/src/custom-theme.css')
import VueQuillEditor from 'vue-quill-editor'
// require styles
import 'quill/dist/quill.core.css'
import 'quill/dist/quill.snow.css'
import 'quill/dist/quill.bubble.css'
import mavonEditor from 'mavon-editor'
import 'mavon-editor/dist/css/index.css'
import TinymceVueH from 'tinymce-vue-h'
import Viewer from 'v-viewer'
import 'viewerjs/dist/viewer.css'

Vue.use(Viewer);
// v-viewer's setDefaults takes the viewer.js options object directly —
// wrapping it in an `Options` key would be silently ignored.
Viewer.setDefaults({
    "inline": true,
    "button": true,
    "navbar": true,
    "title": true,
    "toolbar": true,
    "tooltip": true,
    "movable": true,
    "zoomable": true,
    "rotatable": true,
    "scalable": true,
    "transition": true,
    "fullscreen": true,
    "keyboard": true,
    "url": "data-source"
});
Vue.use(TinymceVueH);
// use
Vue.use(mavonEditor);
Vue.use(VueQuillEditor, /* { default global options } */);
Vue.use(VideoPlayer);
Vue.use(Print);
Vue.use(ElementUI);

Vue.prototype.$http = http;
Vue.prototype.$qs = qs;
Vue.config.productionTip = false;

new Vue({
    router,
    render: h => h(App),
}).$mount('#app')
- 爬虫案例前端Spider.vue
爬虫案例(以https://www.mzitu.com网站为例)退出 Test Staring 启动爬虫 查询 批量下载 已下载 未下载 下载 预览 ![]()
- 验证效果
关于 Java 环境下用 jsoup 实现网络图片数据的爬取,以及网络图片的下载和图片的预览功能,到这里就结束了,下期见。