Scraping Sogou WeChat Official Accounts

Because of Sogou's anti-crawling measures, the search results on Sogou cannot be scraped directly. Start by capturing and analyzing the requests the browser makes:
I. Packet capture analysis:
1. The Sogou WeChat official account search page:
[Screenshot 1: Sogou WeChat official account search results page]

2. Capture the corresponding HTTP Referer:
[Screenshot 2: captured request showing the HTTP Referer]

The HTTP Referer is part of the request header. When a browser sends a request to a web server, it usually includes a Referer that tells the server which page the request came from, and the server can use this for its own processing. For example, if my home page links to a friend's site, his server can use the HTTP Referer to count how many visitors reach his site each day through the link on my page.
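Concretely, the Referer is just another request header. The post's own fetch code goes through ToolkitForSpider.getHtmlDoc, whose internals are not shown, but the Document/Elements types it works with below are Jsoup's. A minimal sketch of what such a request might look like with Jsoup; the helper name, the Referer value, and the User-Agent are my own assumptions, not from the original post:

    // Hypothetical helper, not from the original post.
    import java.io.IOException;
    import org.jsoup.Jsoup;
    import org.jsoup.nodes.Document;

    public class RefererFetch {
        // Fetch a page while sending a Referer that points back to the Sogou WeChat search site.
        public static Document getWithReferer(String url) throws IOException {
            return Jsoup.connect(url)
                    .header("Referer", "http://weixin.sogou.com/") // assumed value; use whatever the capture in step 2 shows
                    .userAgent("Mozilla/5.0")                      // browser-like UA, also an assumption
                    .timeout(10000)
                    .get();
        }
    }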

3. Analyze the request URL obtained from the capture:

http://weixin.sogou.com/weixin?type=1
&_sug_type_=
&query=%E6%B1%BD%E8%BD%A6+%E4%BA%91%E5%8D%97
&ie=utf8
&_sug_=y
&w=01019900
&sut=2257
&sst0=1475908133010 // current timestamp (ms)
&lkt=0%2C0%2C0
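The query value is simply the URL-encoded search text: keyword plus region. A quick standalone check (not part of the original post) confirms what the captured value decodes to:

    import java.io.UnsupportedEncodingException;
    import java.net.URLDecoder;

    public class DecodeQuery {
        public static void main(String[] args) throws UnsupportedEncodingException {
            // "+" encodes a space in a query string
            String query = "%E6%B1%BD%E8%BD%A6+%E4%BA%91%E5%8D%97";
            System.out.println(URLDecoder.decode(query, "utf-8")); // prints: 汽车 云南
        }
    }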

II. Data scraping:
Issue the requests with the Referer set, as analyzed above, to retrieve the data. (The site most likely rate-limits simulated local requests by frequency; if you get blocked, access is usually restored after a while.)
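Since the block appears to be frequency-based, it helps to pause between requests. A minimal sketch of such a delay; the helper and the 3-5 second range are my own guesses, not from the original post, and it could be called before each fetch(url) in start() below:

    // Hypothetical helper: sleep 3-5 seconds between requests so the crawl
    // stays under Sogou's (unknown) frequency threshold.
    private static void politeSleep() {
        try {
            Thread.sleep(3000 + (long) (Math.random() * 2000));
        } catch (InterruptedException e) {
            Thread.currentThread().interrupt(); // restore the interrupt flag
        }
    }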

Request URL parameters:

    // Fixed query-string parameters captured from the browser request
    private static final String OTHER_PARAMS = "&_sug_=y&w=01019900&sut=1790&lkt=0%2C0%2C0";
    // type=1 searches official accounts; ie=utf8 sets the query encoding
    private static final String BASE_URL = "http://weixin.sogou.com/weixin?type=1&ie=utf8&_sug_type_=" + OTHER_PARAMS;

    private static void start() {
        String[] keyWords = {"汽车", "二手车", "车"}; // search keywords
        System.out.println("--- start ---");
        for (String keyWord : keyWords) {
            for (String city : getCity()) {
                try {
                    // Encode "keyword + region" the same way the browser does
                    String query = java.net.URLEncoder.encode(keyWord, "utf-8") + "+" + java.net.URLEncoder.encode(city, "utf-8");
                    String url = BASE_URL + "&query=" + query + "&sst0=" + System.currentTimeMillis();
                    System.out.println("fetch : " + url);
                    fetch(url);
                } catch (UnsupportedEncodingException e) {
                    e.printStackTrace();
                }
            }
        }
    }

    private static List<String> getCity() {
        // Province-level regions combined with each keyword to localize the search
        String[] citys = new String[]{"贵州","甘肃","广西","浙江","福建","安徽","香港","广东","海南","河北","河南","黑龙江","重庆","辽宁","湖北","湖南","吉林","江苏","江西","天津",
                "内蒙古","四川","宁夏","青海","山东","山西","陕西","上海","西藏","北京","新疆","云南"};
        return Arrays.asList(citys);
    }

Scrape the data and store it:

    private static void fetch(String url) {
        Document doc = ToolkitForSpider.getHtmlDoc(url);
        // Each result's name and WeChat ID live in div.txt-box; the account link is on the result container
        Elements eles = doc.select("div div.txt-box");
        Elements urlEles = doc.select("div.wx-rb.bg-blue.wx-rb_v1._item");
        for (int i = 0; i < eles.size(); i++) {
            String name = eles.get(i).select("h3").text();
            String code = eles.get(i).select("h4 span label").text();
            String wechat_url = urlEles.get(i).attr("href");
            System.out.println(name + " " + code + " " + wechat_url);
            insert(name, code, wechat_url);
        }

        // Follow the pagination links at the bottom of the result page
        Elements pageEles = doc.select("div.p a");
        int totalPage = pageEles.size();
        for (int i = 0; i < totalPage - 1; i++) {
            String pageUrl = "http://weixin.sogou.com/weixin" + pageEles.get(i).attr("href");
            System.out.println("fetch : " + pageUrl);

            Document pageDoc = ToolkitForSpider.getHtmlDoc(pageUrl);
            Elements pageBoxes = pageDoc.select("div div.txt-box");
            // Select from pageDoc, not doc, so the links belong to the current page
            Elements pageLinks = pageDoc.select("div.wx-rb.bg-blue.wx-rb_v1._item");
            for (int j = 0; j < pageBoxes.size(); j++) {
                String name = pageBoxes.get(j).select("h3").text();
                String code = pageBoxes.get(j).select("h4 span label").text();
                String wechat_url = pageLinks.get(j).attr("href");
                System.out.println(name + " " + code + " " + wechat_url);
                insert(name, code, wechat_url);
            }
        }
    }

    private static void insert(String name, String code, String url) {
        Connection conn = ToolkitForSpider.getMySqlConnection();
        // Upsert: an already-stored account only gets its update_time refreshed
        // (relies on a unique key on the account row, e.g. wx_code)
        String sql = "insert into inf_weixin(wx_name, wx_code, biz_url, last_update, last_update_title, newrank_index, update_time) "
                + "values(?,?,?,?,?,?,now()) on duplicate key update update_time = now()";
        Object[] values = new Object[]{name, code, url, "1970-01-01", "-1", "-1"};
        try {
            DBLib.update(conn, sql, values);
        } catch (SQLException e) {
            e.printStackTrace();
        } finally {
            ToolkitForSpider.close(conn);
        }
    }
