Java Web Crawler: Implementing a Web Crawler in Spring Boot with webmagic

I. Requirements

The business required a news/information feature. The initial plan was to build it on top of a third-party data service, but after negotiations fell through we decided to develop our own crawler interface instead. After reviewing the available documentation, we chose the open-source webmagic framework to implement the crawler.

II. Implementation

1. Add the dependencies

Add the following dependencies to the pom.xml:



<dependency>
	<groupId>us.codecraft</groupId>
	<artifactId>webmagic-core</artifactId>
	<version>0.7.3</version>
</dependency>
<dependency>
	<groupId>us.codecraft</groupId>
	<artifactId>webmagic-extension</artifactId>
	<version>0.7.3</version>
</dependency>
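Note: webmagic 0.7.3 still pulls in an slf4j-log4j12 binding, which may clash with the Logback binding that Spring Boot uses by default. If the application logs an SLF4J "multiple bindings" warning or fails to start because of logging, one option (shown here only as a sketch; check your own dependency tree first) is to exclude that binding:

<dependency>
	<groupId>us.codecraft</groupId>
	<artifactId>webmagic-core</artifactId>
	<version>0.7.3</version>
	<exclusions>
		<exclusion>
			<groupId>org.slf4j</groupId>
			<artifactId>slf4j-log4j12</artifactId>
		</exclusion>
	</exclusions>
</dependency>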

2. Create the crawler processor

Create the PageProcessor implementation; the code is as follows (for reference only):

import com.alibaba.fastjson.JSONArray;
import com.alibaba.fastjson.JSONObject;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.stereotype.Component;
import team.biteeny.admin.db.write.cache.ConfigMapper;
import team.biteeny.admin.db.write.mapper.CrawlMapper;
import team.biteeny.admin.db.write.model.CrawlModel;
import team.biteeny.push.getui.PushApp;
import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.processor.PageProcessor;
import us.codecraft.webmagic.selector.Html;
import us.codecraft.webmagic.selector.Json;

import java.util.ArrayList;
import java.util.Date;
import java.util.List;
import java.util.Map;
import java.util.concurrent.ConcurrentHashMap;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

@Component
public class HuobiInfoProcessor implements PageProcessor {

    @Autowired
    private CrawlMapper crawlMapper;

    @Autowired
    private ConfigMapper configMapper;

    private Site site;

    // caches the list title and picture path per article id, for use when the detail page is processed
    private static Map<String, String> map = new ConcurrentHashMap<>();

    @Override
    public void process(Page page) {
        // flash list page: parse the rendered HTML
        if (page.getUrl().toString().contains("flash")){
            insertFlash(page);
        }
        // article list API: parse the JSON response and queue the detail pages
        if (page.getUrl().toString().contains("article")){
            List<String> urlList = new ArrayList<>();
            Json json = page.getJson();
            JSONObject jsonObject = JSONObject.parseObject(json.toString());
            JSONArray jsonArray = jsonObject.getJSONObject("data").getJSONArray("data");
            for (Object o : jsonArray) {
                JSONObject object = JSONObject.parseObject(JSONObject.toJSONString(o));
                String key = "baseDetail_" + object.getString("id");
                urlList.add("https://www.huobiinfo.com/news/"+key);
                map.put(key + "_listPicturePath",object.getString("listPicturePath"));
                map.put(key + "_title",object.getString("title"));
            }
            page.addTargetRequests(urlList);
        }
        // news detail page: extract the article body and persist it
        if (page.getUrl().toString().contains("news/baseDetail_")){
            insertNews(page);
        }
    }

    @Override
    public Site getSite() {
        if (site==null){
            site= Site.me().setDomain("www.huobiinfo.com")
                    .setUserAgent("Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.110 Safari/537.36")
                    .setCharset("UTF-8")
                    .setSleepTime(500);
        }
        return site;
    }

    public static void main(String[] args) {
        // ad-hoc test code kept for reference: starting the spider directly and checking the push filter
//        Spider.create(new HuobiInfoProcessor()).addUrl("https://www.huobiinfo.com/flash/").runAsync();
//        Request request = new Request("https://huobi-news-app-gateway-outer.huobi.cn:8005/article/listPagedArticleListByParam");
//        request.setMethod(HttpConstant.Method.POST);
//        request.setRequestBody(HttpRequestBody.json("{\"pageSize\":10,\"pageNum\":1,\"categoryPcId\":15}","utf-8"));
//        Spider.create(new HuobiInfoProcessor()).addRequest(request).runAsync();
//        String title = "BTC链上基础指标略有回暖,链上场内场外交易均较活跃";
//        String c = "根据Searchain.io数据分析:昨日BTC从4100下降到3900点。从链上指标来看,昨日反映BTC内部价值的基础指标整体有所上升,新增地址上升14.89%,活跃地址上升12.20%。从链上交易指标来看,交易用户的活跃度也在上升,交易所流入增加49.16%,流出增加40.78%;链上大额转账的活跃程度集中在100-600 BTC区间,600+ BTC的转账有所下降,大额流入交易所占比有所上升,场内场外均比较活跃。综合链上基础和交易指标来看,近期BTC内部价值略有回暖,链上场内场外交易均活跃。独立分析师Edward对近期BTC市场呈较为悲观状态。\n" +
//                "只有币名和百分比,没有价格波动词,所以不符合推送条件";
//        boolean b = checkPush(title+c);
//        System.out.println(b);

    }

    private void insertFlash(Page page){
        // each "item-flash" element on the flash page is one flash news entry
        Elements elements = page.getHtml().getDocument().getElementsByClass("item-flash");
        for (Element element : elements) {
            Html html = new Html(element.toString());
            String s = html.xpath("//div[@class='item-flash']//h3[@class='med']//nuxt-link/@to").toString();
            String key = s.substring(1, s.lastIndexOf("/")).replace("/", "_");
            if (crawlMapper.checkExist(key) <= 0){
                String title = html.xpath("//div[@class='item-flash']//h3[@class='med']//nuxt-link/text()").toString();
                String content = html.xpath("//div[@class='item-flash']//div[@class='content']/text()").toString();
                CrawlModel model = new CrawlModel();
                boolean b = checkPush(title + content);
                model.setId(key);
                model.setBody(content);
                model.setTitle(title);
                model.setSource("HuobiInfo");
                model.setType("flash");
                if (b){
                    model.setIs_push(true);
                    push(title,content);
                }else {
                    model.setIs_push(false);
                }
                model.setCreate_time(new Date());
                crawlMapper.crawlInsert(model);
            }
        }
    }
    private void insertNews(Page page){
        String path = page.getUrl().toString();
        String key = path.substring(path.lastIndexOf("/") + 1);
        if (crawlMapper.checkExist(key) <= 0) {
            // assemble the stored body from the source line, the notice line and the article HTML
            String source = "<p>来源:" + page.getHtml().xpath("//div[@class='detail-platform-msg']//p[@class='detail-platform']/text()").toString() + "</p>";
            String notice = "<p>" + page.getHtml().xpath("//div[@class='detail-source']/text()") + "</p>";
            String article = page.getHtml().xpath("//div[@class='detail-content article-content hb-article']").toString();
            String content = source + notice + article;
            // skip articles whose images are hosted on mmbiz.qpic.cn (the WeChat image CDN)
            if (!checkDomain(article)){
                CrawlModel model = new CrawlModel();
                model.setId(key);
                model.setTitle((String) map.get(key + "_title"));
                model.setBody(content);
                model.setList_picture((String) map.get(key + "_listPicturePath"));
                model.setSource("HuobiInfo");
                model.setType("news");
                model.setCreate_time(new Date());
                crawlMapper.crawlInsert(model);
            }
        }
    }

    /**
     * A flash entry is pushed when it mentions a coin together with a large-transfer keyword,
     * when it mentions a major exchange, or when it mentions a coin plus a price-movement word
     * with a percentage change larger than 5%.
     */
    private static boolean checkPush(String str){
        if (str == null){
            return false;
        }
        String coinRegex = "btc|eth|bch|ltc|etc|eos|xrp|dash|trx";
        String moveRegex = "涨|跌|涨幅|跌幅|上涨|下跌";
        String flowRegex = "大额转账|净流入|净流出";
        String exchangeRegex = "okex|火币|币安|比特大陆";
        String percentRegex = "\\d+(\\.?\\d*?)(?=%)";
        boolean hasCoin = Pattern.compile(coinRegex, Pattern.CASE_INSENSITIVE).matcher(str).find();
        boolean hasMove = Pattern.compile(moveRegex).matcher(str).find();
        boolean hasFlow = Pattern.compile(flowRegex).matcher(str).find();
        boolean hasExchange = Pattern.compile(exchangeRegex).matcher(str).find();
        if (hasCoin && hasFlow){
            return true;
        }
        if (hasExchange){
            return true;
        }
        if (hasCoin && hasMove){
            Matcher percentMatcher = Pattern.compile(percentRegex).matcher(str);
            while (percentMatcher.find()){
                if (Double.parseDouble(percentMatcher.group()) > 5){
                    return true;
                }
            }
        }
        return false;
    }

    private void push(String title, String text){
        // push-related logic (omitted), e.g. only push during the day:
        // int hour = Calendar.getInstance().get(Calendar.HOUR);
        // if (hour >= 8 && hour <= 22){
        // }
    }

    /**
     * Returns true when the article embeds images from mmbiz.qpic.cn.
     */
    private boolean checkDomain(String content){
        if (content == null){
            return false;
        }
        return Pattern.compile("mmbiz\\.qpic\\.cn").matcher(content).find();
    }
}

The above is a simple working example that includes some filtering logic and data-persistence logic.
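The CrawlMapper and CrawlModel classes used above belong to the project's own persistence layer and are not shown in the article. Purely as an illustration of what checkExist and crawlInsert might look like, here is a hypothetical MyBatis-style mapper; the article does not say which persistence framework is used, and the table and column names below are invented:

import org.apache.ibatis.annotations.Insert;
import org.apache.ibatis.annotations.Mapper;
import org.apache.ibatis.annotations.Param;
import org.apache.ibatis.annotations.Select;
import team.biteeny.admin.db.write.model.CrawlModel;

@Mapper
public interface CrawlMapper {

    // how many rows already exist with this id; > 0 means the entry was crawled before
    @Select("SELECT COUNT(1) FROM crawl_news WHERE id = #{id}")
    int checkExist(@Param("id") String id);

    // persist one crawled flash/news entry
    @Insert("INSERT INTO crawl_news (id, title, body, list_picture, source, type, is_push, create_time) "
            + "VALUES (#{id}, #{title}, #{body}, #{list_picture}, #{source}, #{type}, #{is_push}, #{create_time})")
    void crawlInsert(CrawlModel model);
}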

III. Summary

The above is a simple example of implementing a web crawler in Java with webmagic. It does not use webmagic to its full potential and I have not studied the framework in depth; it simply meets the current business requirements. This article also does not describe webmagic itself in any detail, so interested readers can consult the official documentation to learn more, and everyone is welcome to join the discussion.

My abilities are limited and there are bound to be shortcomings, so criticism and corrections are welcome. Keep grinding away at technology!
