采集csdn的博客数据

最近做了一个网站 www.ifunit.com,为了省事,文章内容采集自 CSDN。采集入口是 CSDN 的搜索结果页,下面这段代码负责从搜索结果页中提取文章链接,并去掉已经采集过的链接:

// Fetch the search-result page with the HTTP method the request specifies.
        HttpResponse response;
        if (req.getMethod() == Method.GET) {
            response = HttpUtil.httpGet(req.getUrl(), req.getHeaderMap());
        } else {
            response = HttpUtil.httpPost(req.getUrl(), req.getFormParam(), req.getHeaderMap());
        }

        Document doc = Jsoup.parse(HttpUtil.response2Html(response), req.getUrl());

        UrlListResponse rep = new UrlListResponse();

        // Collect the href of every link matched by "h3.rt a" (the result URLs).
        List<String> urls = new ArrayList<String>();
        for (Element link : doc.body().select("h3.rt a")) {
            urls.add(link.attr("href"));
        }

        // Drop the URLs that have already been collected.
        urls.removeAll(WpJdbc.queryUrls());

        rep.setList(urls);
        return rep;



然后是博客页面的解析处理:
/**
 * Fetches the blog page named by the request and extracts its title, content,
 * keywords, description and tags into a {@link BlogResponse}.
 *
 * @param req request describing the page to fetch (URL, method, headers, form parameters)
 * @return the extracted blog data, with the author set to "csdn" and the link set to the request URL
 */
public BlogResponse parser(Request req) {
        // Fetch the page with the HTTP method the request specifies.
        HttpResponse response;
        if (req.getMethod() == Method.GET) {
            response = HttpUtil.httpGet(req.getUrl(), req.getHeaderMap());
        } else {
            response = HttpUtil.httpPost(req.getUrl(), req.getFormParam(), req.getHeaderMap());
        }
        Document doc = Jsoup.parse(HttpUtil.response2Html(response), req.getUrl());

        // Read the keywords/description <meta> tags; if a name repeats, the last one wins.
        String keywords = "";
        String description = "";
        for (Element meta : doc.select("meta")) {
            String metaName = meta.attr("name");
            if ("keywords".equalsIgnoreCase(metaName)) {
                keywords = meta.attr("content");
            } else if ("Description".equalsIgnoreCase(metaName)) {
                description = meta.attr("content");
            }
        }

        Element body = doc.body();

        // Title text, with any <font> elements inside the title link removed first.
        Elements titleLinks = body.select("h1 span.link_title a");
        titleLinks.select("font").remove();
        String title = titleLinks.text();

        Elements content = body.select(".article_content");

        // Tag texts in page order, without duplicates.
        Set<String> blogTags = new LinkedHashSet<String>();
        for (Element tagLink : body.select(".tag2box a")) {
            blogTags.add(tagLink.text());
        }

        BlogResponse blog = new BlogResponse();
        blog.setTitle(title);
        blog.setContent(getContent(content, req));
        blog.setKeywords(keywords);
        blog.setDescription(description);
        blog.setTags(blogTags);
        blog.setAuthor("csdn");
        blog.setLink(req.getUrl());
        return blog;
    }

    /**
     * Rewrites the article content in place and returns it as an HTML string.
     *
     * <p>Two rewrites are applied:
     * <ol>
     *   <li>Each {@code pre[name='code']} element is replaced by a unique HTML-comment
     *       placeholder. After the content has been serialised, the placeholder is swapped
     *       for the entity-decoded code wrapped in {@code [lang]...[/lang]} markers, where
     *       {@code lang} is the element's {@code class} attribute.</li>
     *   <li>Each image is handed to {@code ImageUtil.downLoad}; its {@code src} is set to the
     *       returned value, and so is the {@code href} of a parent {@code <a>} that pointed
     *       at the same image. Images that cannot be resolved or downloaded are left
     *       untouched and a warning is logged.</li>
     * </ol>
     *
     * @param content elements holding the article body; modified in place
     * @param req     request for the page; its URL is the base for resolving relative image URLs
     * @return the inner HTML of {@code content} after both rewrites
     */
    private String getContent(Elements content, Request req) {
        // placeholder comment -> final "[lang]...[/lang]" text
        Map<String, String> codeByPlaceholder = new HashMap<String, String>();
        int index = 1;
        for (Element pre : content.select("pre[name='code']")) {
            String lang = pre.attr("class");
            if (lang == null) {
                continue;
            }
            try {
                String code = decodeCodeEntities(pre.html());
                String langHtml = "\n\n[" + lang + "]\n" + code + "\n[/" + lang + "]";
                // The index keeps placeholders unique when several blocks share a language.
                String placeholder = "<!-- #" + lang + "*" + index + "*" + lang + "# -->";
                codeByPlaceholder.put(placeholder, langHtml);
                pre.before(placeholder);
                index++;
                pre.remove();
            }
            catch (Exception e) {
                log.warn("获取代码语言出错", e);
            }
        }

        // Image handling
        for (Element img : content.select("img")) {
            String src = img.attr("src");
            // A blank src would resolve to the page URL itself, so skip it along with javascript: URLs.
            if (src == null || src.trim().isEmpty() || src.toLowerCase().startsWith("javascript:")) {
                continue;
            }
            String imgSrc;
            if (src.startsWith("http")) {
                imgSrc = src;
            } else {
                // Resolve a relative path against the page URL.
                try {
                    imgSrc = new URL(new URL(req.getUrl()), src).toString();
                }
                catch (MalformedURLException e) {
                    log.warn("图片url转换异常", e);
                    continue;
                }
            }
            try {
                String newSrc = ImageUtil.downLoad(new Request(imgSrc));
                img.attr("src", newSrc);
                Element parent = img.parent();
                if (parent != null && "a".equalsIgnoreCase(parent.tagName())) {
                    // Only retarget the link when it pointed at the image itself.
                    String href = parent.attr("href");
                    if (src.equals(href) || imgSrc.equals(href)) {
                        parent.attr("href", newSrc);
                    }
                }
            }
            catch (Exception e) {
                // Best effort: keep the original src when the download fails.
                log.warn("下载图片错误", e);
            }
        }

        // Substitute on the serialised string so the decoded code is inserted verbatim.
        String html = content.html();
        for (Map.Entry<String, String> entry : codeByPlaceholder.entrySet()) {
            html = html.replace(entry.getKey(), entry.getValue());
        }

        return html;
    }

    /**
     * Decodes the HTML entities handled for code blocks.
     *
     * <p>{@code &amp;} is decoded last so that an escaped entity such as
     * {@code &amp;quot;} becomes the literal text {@code &quot;} instead of being
     * decoded a second time into a quote character.
     *
     * @param html serialised HTML of a code block
     * @return the text with {@code &nbsp; &lt; &gt; &quot; &amp;} decoded
     */
    private static String decodeCodeEntities(String html) {
        return html.replace("&nbsp;", " ")
            .replace("&lt;", "<")
            .replace("&gt;", ">")
            .replace("&quot;", "\"")
            .replace("&amp;", "&");
    }

由于 www.ifunit.com 采用的是 WordPress,采集到的文章直接通过 WordPress 的 XML-RPC 接口发布就可以了。

你可能感兴趣的:(JSoup,csdn,采集)