Java 爬取微信公众号文章(文字 + 图片)

Maven依赖:



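    <dependency>
        <groupId>com.alibaba</groupId>
        <artifactId>fastjson</artifactId>
        <version>1.2.47</version>
    </dependency>

    <dependency>
        <groupId>org.apache.httpcomponents</groupId>
        <artifactId>httpclient</artifactId>
        <version>4.5.2</version>
    </dependency>

    <dependency>
        <groupId>org.jsoup</groupId>
        <artifactId>jsoup</artifactId>
        <version>1.11.3</version>
    </dependency>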

线上测试点这里

爬取工具类:

package com.zyq.tools;

import java.util.HashMap;
import java.util.LinkedHashMap;
import java.util.Map;

import org.jsoup.Jsoup;
import org.jsoup.nodes.Attribute;
import org.jsoup.nodes.Attributes;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

import com.alibaba.fastjson.JSONArray;
import com.alibaba.fastjson.JSONObject;

/**
 * 文章爬取工具类
 *
 * @author ZhangYuanqiang
 * @since 2020/01/04
 */
public class SpiderUtil {

    // 微信公众号文章域名
    private static final String WX_DOMAIN = "https://mp.weixin.qq.com";
    // 文章返回前端统一key常量
    private static final String KEY_TITLE = "title"; // 文章标题
    private static final String KEY_COVER_URL = "coverLink"; // 文章封面图链接
    private static final String KEY_REFER_NAME = "referName"; // 文章出处作者
    private static final String KEY_REFER_URL = "referLink"; // 文章出处链接
    private static final String KEY_TAGS = "tags"; // 文章内容
    private static final String KEY_NAME = "name"; // 标签名称
    private static final String KEY_TEXT = "text"; // 文本信息
    private static final String KEY_HREF = "href"; // a标签链接

    /**
     * 测试主方法
     */
    public static void main(String args[]) {
        String url = "https://mp.weixin.qq.com/s/OEjKIxTRFSY5lcNk6YIlUg";
        Resp resp = getActicle(url);
        if (resp.isSuccess()) {
            System.out.println(resp.getBody());
        } else {
            System.out.println(resp.getMsg());
        }
    }
    
    /**
     * 根据文章链接抓取文章内容
     * 
     * @param url 文章链接
     * @return 文章内容
     */
    public static Resp getActicle(String url) {
        // 检测链接是否合法
        String msg = checkUrl(url);
        if (msg != null) {
            return Resp.error(msg);
        }
        // 请求与响应
        String resp = HttpTool.get(url, getWxHeaderMap());
        if (resp == null || resp.trim().length() == 0) {
            return Resp.error("文章获取失败,请检查链接是否正确");
        }
        // 解析
        Resp acticleResp = getWxActicleContent(resp, url);
        if (acticleResp.isError()) {
            return Resp.error(acticleResp.getMsg());
        }
        return acticleResp;
    }    

    /**
     * 检测文章链接是否合法
     */
    public static String checkUrl(String url) {
        if (url == null) {
            return "请输入文章链接";
        }
        if (!url.startsWith(WX_DOMAIN)) {
            return "请输入微信公众号文章链接";
        }
        return null;
    }


    /**
     * 微信公众号请求头设置
     */
    public static Map getWxHeaderMap() {
        Map map = new HashMap<>(new LinkedHashMap<>());
        map.put("Accept", "text/html, application/xhtml+xml, image/jxr, */*");
        map.put("Accept-Encoding", "gzip, deflate");
        map.put("Accept-Language", "zh-Hans-CN, zh-Hans; q=0.8, en-US; q=0.5, en; q=0.3");
        map.put("Host", "mp.weixin.qq.com");
        map.put("If-Modified-Since", "Sat, 04 Jan 2020 12:23:43 GMT");
        map.put("User-Agent", "Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; rv:11.0) like Gecko");
        return map;
    }

    
    /**
     * 解析微信公众号文章
     * 
     * @param resp 请求文章响应
     * @param url 文章链接
     * @return 文章信息
     */
    public static Resp getWxActicleContent(String resp, String url) {
        try {
            Document document = Jsoup.parse(resp);
            // 文章出处(作者)
            String referName = document.getElementsByClass("profile_nickname").get(0).text();
            // 文章封面图链接
            String coverUrl = document.select("meta[property=\"og:image\"]").get(0).attr("content");
            // 文章标题
            String title = document.getElementById("activity-name").text();
            // 文章内容
            Element content = document.getElementsByClass("rich_media_area_primary_inner").get(0);
            JSONObject json = new JSONObject(new LinkedHashMap<>());
            json.put(KEY_TITLE, title);
            json.put(KEY_COVER_URL, coverUrl);
            json.put(KEY_REFER_NAME, referName);
            json.put(KEY_REFER_URL, url);
            JSONArray tags = new JSONArray();
            Elements sections = content.select("*");
            for (Element element : sections) {
                if (element.children().isEmpty()) {
                    getChildTag(element, tags);
                }
            }
            json.put(KEY_TAGS, tags);
            return Resp.success(json);
        } catch (Exception e) {
            e.printStackTrace();
            return Resp.error("文章解析失败");
        }
    }

    public static void getChildTag(Element element, JSONArray tags) {
        JSONObject tag = new JSONObject(new LinkedHashMap<>());
        String tagName = element.tagName();
        tag.put(KEY_NAME, tagName);
        switch (tagName) {
        case "span": {
            tag.put(KEY_TEXT, element.text());
            tags.add(tag);
            break;
        }
        case "img": {
            Attributes attrs = element.attributes();
            if (attrs != null) {
                for (Attribute attr : attrs) {
                    tag.put(attr.getKey().replace("-", ""), attr.getValue());
                }
            }
            tags.add(tag);
            break;
        }
        case "a": {
            tag.put(KEY_HREF, element.attr("href"));
            tag.put(KEY_TEXT, element.attr("textvalue"));
            tags.add(tag);
            break;
        }
        case "br": {
            tags.add(tag);
            break;
        }
        case "p": {
            tag.put(KEY_TEXT, element.text());
            tags.add(tag);
            break;
        }
        default:
            break;
        }
    } 
    
}

Httpclient工具类:

package com.zyq.tools;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.UnsupportedEncodingException;
import java.nio.charset.Charset;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;

import org.apache.http.HttpEntity;
import org.apache.http.HttpResponse;
import org.apache.http.client.entity.UrlEncodedFormEntity;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.client.methods.HttpPost;
import org.apache.http.client.methods.HttpUriRequest;
import org.apache.http.entity.ContentType;
import org.apache.http.entity.mime.MultipartEntityBuilder;
import org.apache.http.entity.mime.content.FileBody;
import org.apache.http.entity.mime.content.StringBody;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.message.BasicNameValuePair;

/**
 * Thin static helpers around Apache HttpClient for GET, form POST, multipart upload and
 * file download.
 *
 * <p>All requests go through one shared client instance. Response text is decoded as UTF-8.
 * Failures are reported on stderr and surface to the caller as {@code null}
 * (or as a no-op for {@link #getFile}); no method here throws for a failed request.
 *
 * <p>Note: the multipart classes used by {@link #postFile} live in the separate
 * {@code httpmime} artifact, which must be on the classpath in addition to {@code httpclient}.
 *
 * @author sunnyzyq
 * @since 2019/04/22
 */
public class HttpTool {

    private static final int BYTE_LEN = 102400; // 100KB copy buffer for downloads
    private static final Charset CHARSET = StandardCharsets.UTF_8; // charset for requests and responses
    // ContentType.TEXT_PLAIN is ISO-8859-1 and would corrupt non-Latin text parts
    private static final ContentType TEXT_PLAIN_UTF8 = ContentType.create("text/plain", CHARSET);
    // One pooled client for the whole class instead of a new, never-closed client per request
    private static final CloseableHttpClient CLIENT = HttpClients.createDefault();

    /**
     * Sends a GET request.
     *
     * @param url request address (query parameters must already be part of the URL)
     * @return response text, or {@code null} if the request failed or had no body
     */
    public static String get(String url) {
        return getRespString(new HttpGet(url));
    }

    /**
     * Sends a GET request with custom headers.
     *
     * @param url       request address (query parameters must already be part of the URL)
     * @param headerMap request headers, may be {@code null}
     * @return response text, or {@code null} if the request failed or had no body
     */
    public static String get(String url, Map<String, String> headerMap) {
        HttpGet get = new HttpGet(url);
        if (headerMap != null) {
            for (Entry<String, String> entry : headerMap.entrySet()) {
                get.setHeader(entry.getKey(), entry.getValue());
            }
        }
        return getRespString(get);
    }

    /**
     * Sends a URL-encoded form POST.
     *
     * @param url    request address
     * @param params form fields, may be {@code null}
     * @return response text, or {@code null} if the request failed or had no body
     */
    public static String post(String url, Map<String, String> params) {
        HttpPost post = new HttpPost(url);
        List<BasicNameValuePair> pairs = new ArrayList<>();
        if (params != null) {
            for (Entry<String, String> entry : params.entrySet()) {
                pairs.add(new BasicNameValuePair(entry.getKey(), entry.getValue()));
            }
        }
        // The Charset overload cannot fail with UnsupportedEncodingException,
        // so the request can no longer go out with a null entity
        post.setEntity(new UrlEncodedFormEntity(pairs, CHARSET));
        return getRespString(post);
    }

    /**
     * Sends a multipart POST (file upload).
     *
     * @param url    request address
     * @param params parts to send; {@link File} values become file parts, every other value is
     *               sent as UTF-8 text via {@code toString()}, {@code null} values as empty text
     * @return response text, or {@code null} if the request failed or had no body
     */
    public static String postFile(String url, Map<String, ?> params) {
        HttpPost post = new HttpPost(url);
        MultipartEntityBuilder builder = MultipartEntityBuilder.create();
        if (params != null) {
            for (Entry<String, ?> entry : params.entrySet()) {
                Object value = entry.getValue();
                if (value instanceof File) {
                    builder.addPart(entry.getKey(), new FileBody((File) value));
                } else {
                    String text = value == null ? "" : value.toString();
                    builder.addPart(entry.getKey(), new StringBody(text, TEXT_PLAIN_UTF8));
                }
            }
        }
        post.setEntity(builder.build());
        return getRespString(post);
    }

    /**
     * Downloads a resource into a file.
     *
     * @param url  address of the resource
     * @param name path of the target file (relative paths resolve against the working directory)
     */
    public static void getFile(String url, String name) {
        InputStream in = getRespInputStream(new HttpGet(url));
        if (in == null) {
            // Request failed or had no body: do not leave an empty file behind
            return;
        }
        // Closing the response stream hands the connection back to the shared client
        try (InputStream body = in; FileOutputStream fos = new FileOutputStream(new File(name))) {
            byte[] buffer = new byte[BYTE_LEN];
            int read;
            while ((read = body.read(buffer)) != -1) {
                fos.write(buffer, 0, read);
            }
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

    /**
     * Executes the request and returns the response body as text.
     *
     * <p>The body is decoded as UTF-8 and returned unchanged, including line breaks.
     * If reading fails midway, the text read so far is returned.
     *
     * @param request request to execute
     * @return response text, or {@code null} if the request failed or had no body
     */
    public static String getRespString(HttpUriRequest request) {
        InputStream in = getRespInputStream(request);
        if (in == null) {
            return null;
        }
        StringBuilder sb = new StringBuilder();
        try (BufferedReader br = new BufferedReader(new InputStreamReader(in, CHARSET))) {
            char[] buffer = new char[8192];
            int read;
            while ((read = br.read(buffer)) != -1) {
                sb.append(buffer, 0, read);
            }
        } catch (IOException e) {
            e.printStackTrace();
        }
        return sb.toString();
    }

    /**
     * Executes the request and returns the raw response stream.
     *
     * <p>The caller must close the returned stream; until then the underlying connection
     * stays checked out of the shared client's pool.
     *
     * @param request request to execute
     * @return response body stream, or {@code null} if the request failed or had no body
     */
    public static InputStream getRespInputStream(HttpUriRequest request) {
        CloseableHttpResponse response;
        try {
            response = CLIENT.execute(request);
        } catch (IOException | RuntimeException e) {
            e.printStackTrace();
            return null;
        }
        HttpEntity entity = response.getEntity();
        if (entity == null) {
            return null;
        }
        try {
            return entity.getContent();
        } catch (IOException | RuntimeException e) {
            e.printStackTrace();
            try {
                // No stream to hand out, so release the connection here
                response.close();
            } catch (IOException ignored) {
                // best effort: the original failure has already been reported
            }
            return null;
        }
    }
}

响应工具类:

package com.zyq.tools;

/**
 * Generic response envelope: a status code, an optional message and an optional payload.
 *
 * <p>Caveat on the static factories: when the argument is a {@code String},
 * {@code success(String)} is chosen over {@code success(T)}, so the string becomes the
 * message, not the body. Use {@link #success(String, Object)} to return a string payload.
 *
 * @param <T> type of the payload
 * @author sunnyzyq
 * @since 2019/04/23
 */
public class Resp<T> {

    /** Status code of a successful response. */
    public static final int SUCCESS = 0;
    /** Status code of a failed response. */
    public static final int ERROR = 1;

    int code = SUCCESS;
    String msg;
    T body;

    /** Creates a successful response without message or payload. */
    public Resp() {}

    /** Creates a successful response carrying the given payload. */
    public Resp(T t) {
        this.body = t;
    }

    /** Creates a response with every field set explicitly. */
    public Resp(int code, String msg, T body) {
        this.code = code;
        this.msg = msg;
        this.body = body;
    }

    /** Returns an error response without message or payload. */
    public static <T> Resp<T> error() {
        return new Resp<>(ERROR, null, null);
    }

    /** Returns an error response with the given message. */
    public static <T> Resp<T> error(String msg) {
        return new Resp<>(ERROR, msg, null);
    }

    /** Returns an error response with the given message and payload. */
    public static <T> Resp<T> error(String msg, T body) {
        return new Resp<>(ERROR, msg, body);
    }

    /** Returns a success response without message or payload. */
    public static <T> Resp<T> success() {
        return new Resp<>(SUCCESS, null, null);
    }

    /** Returns a success response with the given message and no payload. */
    public static <T> Resp<T> success(String msg) {
        return new Resp<>(SUCCESS, msg, null);
    }

    /** Returns a success response with the given payload and an empty message. */
    public static <T> Resp<T> success(T body) {
        return new Resp<>(SUCCESS, "", body);
    }

    /** Returns a success response with the given message and payload. */
    public static <T> Resp<T> success(String msg, T body) {
        return new Resp<>(SUCCESS, msg, body);
    }

    public int getCode() {
        return code;
    }

    public void setCode(int code) {
        this.code = code;
    }

    public String getMsg() {
        return msg;
    }

    public void setMsg(String msg) {
        this.msg = msg;
    }

    public void setBody(T body) {
        this.body = body;
    }

    public T getBody() {
        return body;
    }

    /** Returns {@code true} when the code is anything other than {@link #SUCCESS}. */
    public boolean isError() {
        return code != SUCCESS;
    }

    /** Returns {@code true} when the code equals {@link #SUCCESS}. */
    public boolean isSuccess() {
        return code == SUCCESS;
    }

    /**
     * Renders the response as {@code {code:..,msg:..,body:..}}; a {@code null} message or
     * body is left out.
     */
    @Override
    public String toString() {
        StringBuilder sb = new StringBuilder();
        // Opening brace: the original emitted "}" here, producing unbalanced output
        sb.append("{");
        sb.append("code:").append(code).append(",");
        if (msg != null) {
            sb.append("msg:").append(msg).append(",");
        }
        if (body != null) {
            sb.append("body:").append(body.toString());
        }
        sb.append("}");
        return sb.toString();
    }
}

前端代码(参考):

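HTML部分(原文中的标签在转载时丢失,下面按 JS 中引用到的 id 给出一个最小示例):

    <input type="text" id="myurl" placeholder="请输入微信公众号文章链接">
    <button type="button" id="zhuquBtn">抓取</button>
    <div id="mybox"></div>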


JS部分:

    // Fetches the article for the link typed into #myurl and renders it into #mybox.
    // NOTE(review): the HTML inside the original string literals was lost when this post was
    // extracted; the element choices below (<h2>, <hr>, <p>, ...) are a reconstruction — confirm
    // against the intended layout.
    $("#zhuquBtn").click(function () {
        $.post("/zhua", {url: $("#myurl").val()}, function (rs) {
            if (rs.code != 0) {
                layer.msg(rs.msg, {icon: 2});
                return;
            }
            var body = rs.body;
            var $box = $("#mybox").empty();

            // Nodes are built with .text()/.attr() instead of string concatenation so that
            // scraped article content cannot inject markup or script into this page.
            $box.append($("<h2>").text(body.title));
            $box.append(
                $("<div>").text("文章出处: ")
                    .append($("<a>").attr({href: body.referLink, target: "_blank"}).text(body.referName))
            );
            $box.append("<hr>");

            var tags = body.tags;
            for (var i = 0; i < tags.length; i++) {
                var tag = tags[i];
                var name = tag.name;
                if (name == 'img') {
                    // The server strips "-" from attribute names, so "data-src" arrives as "datasrc".
                    // Presumably the real image URL is in data-src (lazy loading) — verify.
                    var $img = $("<img>").attr("src", tag.datasrc || tag.src);
                    if (typeof(tag.style) != "undefined") {
                        $img.attr("style", tag.style);
                    }
                    $box.append($img);
                } else if (name == 'span' || name == 'p') {
                    $box.append($("<p>").text(tag.text));
                } else if (name == 'br') {
                    $box.append("<br>");
                } else if (name == 'a') {
                    if (/^https?:\/\//i.test(tag.href || "")) {
                        $box.append($("<a>").attr({href: tag.href, target: "_blank"}).text(tag.text));
                    } else {
                        // Never emit javascript: or other non-http(s) targets taken from scraped content
                        $box.append($("<span>").text(tag.text));
                    }
                }
            }
        });
    });

注意:在 html 页面的 <head> 中加上 <meta name="referrer" content="never">,浏览器请求图片时就不会携带 Referer,可以解除微信图片防盗链机制。

Java 爬取微信公众号文章(文字 + 图片)_第1张图片

你可能感兴趣的:(开发工具,Java工具类)