Sina News Crawler

The key code of the crawler is shown below; the full source is available on GitHub at https://github.com/lxf44944/sinaNews_crawler/.

package com.lxf.crawler;
import java.io.File;
import java.io.FileWriter;
import org.htmlparser.NodeFilter;
import org.htmlparser.Parser;
import org.htmlparser.Tag;
import org.htmlparser.filters.AndFilter;
import org.htmlparser.filters.HasAttributeFilter;
import org.htmlparser.filters.TagNameFilter;
import org.htmlparser.util.NodeList;
import com.lxf.dao.bean.NewsBean;
import com.lxf.dao.imp.NewsDao;
import com.lxf.dao.inf.NewsDaoInf;
/**
 * <Crawler> Crawls news categories, titles, and article bodies from Sina News.
 * (Requires htmlparser.jar, which can be downloaded from my GitHub repository:
 * https://github.com/lxf44944/sinaNews_crawler/)
 * 
 * @author 刘向峰
 */
public class SinaNews {
    /**
     * Test entry point
     * 
     * @param args
     */
    public static void main(String args[]) {
        // available categories: china, world, society, media, opinion
        String type = "world";
        File file = new File(type);
        if (!file.exists()) {// create the output directory if it does not exist
            file.mkdirs();
        }
        SinaNews gn = new SinaNews();
        String a = gn.getNews(type);
        // gn.writefile(a, "SinaNews.html",type);
    }
    // Crawl the headline list and assemble it into a well-formed HTML string
    public String getNews(String type) {
        NewsDaoInf dao = new NewsDao();
        try {
            NodeFilter filter = new TagNameFilter("ul");
            Parser parser = new Parser();
            Parser bodyparser = new Parser();
        parser.setURL("http://news.sina.com.cn/" + type + "/");// URL of the news channel to crawl
            // System.out.println(parser.getEncoding());
            parser.setEncoding("gb2312");
            NodeList list = parser.extractAllNodesThatMatch(filter);
            // Accumulates an HTML table of the crawled headlines. (The markup literals in
            // this listing were stripped when the post was published; "<table>" and the
            // matching tags appended below are reconstructions.)
            StringBuilder newsStr = new StringBuilder("<table>");
            SinaNews gn = new SinaNews();
            for (int i = 0; i < list.size() - 1; i++) {
                Tag node = (Tag) list.elementAt(i);
                for (int j = 1; j < node.getChildren().size(); j++) {
                    String textstr = node.getChildren().elementAt(j).toHtml()
                            .trim();
                    if (textstr.length() > 0) {
                        // start and end offsets of the href attribute in the item's HTML
                        int linkbegin = textstr.indexOf("href=");
                        int linkend = textstr.indexOf("\">");
                        String sublink = textstr.substring(linkbegin + 6,
                                linkend);
                        // link string
                        String link = "";
                        if (sublink.indexOf("target") != -1) {
                            link = sublink.substring(0, sublink.indexOf("\""));
                        } else {
                            link = sublink;// link string
                        }
                        // the title sits between the closing "> of the anchor and "</a>"
                        // (the "</a>" literal was stripped from the published listing)
                        int titlebegin = textstr.indexOf("\">");
                        int titleend = textstr.indexOf("</a>");
                        String title = textstr.substring(titlebegin + 2,
                                titleend).trim();
                        System.out.println("Crawling: " + title);
                        // Skip articles whose title already exists in the database
                        if (dao.hasNews(title)) {
                            System.out.println("[record already exists]");
                            continue;
                        }
                        // Skip video items (the prefix may use a full-width or an ASCII colon)
                        if (title.contains("视频：") || title.contains("视频:")) {
                            System.out.println("[video news cannot be fetched]");
                            continue;
                        }
                        // Strip the "(图)" (photo) marker from the title
                        if (title.contains("(图)")) {
                            title = title.replace("(图)", "");
                        }
                        try {
                            /** Article body handling starts here */
                            // the article text lives in <div id="artibody"> on the detail page
                            NodeFilter bodyfilter = new AndFilter(
                                    new TagNameFilter("div"),
                                    new HasAttributeFilter("id", "artibody"));
                            bodyparser.setURL(link);// article URL
                            // bodyparser.setEncoding(bodyparser.getEncoding());
                            bodyparser.setEncoding("gb2312");
                            NodeList bodylist = bodyparser
                                    .extractAllNodesThatMatch(bodyfilter);
                            // article body string
                            if (bodylist.elementAt(0) == null) {
                                System.out.println("[article has no body]");
                                continue;
                            }
                            String newstextstr = bodylist.elementAt(0).toHtml()
                                    .trim();
                            // Keep only the article text, preserving the <p> tags so the
                            // original paragraph layout survives. (The tag literals below were
                            // stripped when this listing was published; "<p>"/"</p>" are implied
                            // by the "+ 4" offset, and the image markers are a best guess.)
                            int bodybegin = newstextstr.indexOf("<p>");
                            int bodyend = newstextstr.lastIndexOf("</p>") + 4;
                            int bodyimgbegin = newstextstr.indexOf("<img");
                            int bodyimgend = newstextstr.lastIndexOf("</div>");
                            String body = "";
                            if (bodybegin < 0) {
                                body = newstextstr;
                            } else {
                                body = newstextstr.substring(bodybegin, bodyend);
                            }
                            if (bodyimgbegin >= 0) {
                                // put the embedded image block in front of the text
                                body = newstextstr.substring(bodyimgbegin, bodyimgend)
                                        + "<br/>" + body;
                            }
                            /** Save the article to the database */
                            NewsBean newsBean = new NewsBean(0, title, body, link,
                                    link.substring(link.lastIndexOf("/") - 10,
                                            link.lastIndexOf("/")), type);
                            dao.add(newsBean);
                            // gn.writefile(body, link, type); // or write it to a file instead
                        } catch (Exception e) {
                            System.out.println("Error while crawling an article page:");
                            e.printStackTrace();
                            /** Article body handling ends here */
                        }
                        /** Append the headline, linked to its article, to the result string */
                        newsStr.append("<tr><td><a href=\"" + link + "\">");
                        newsStr.append(title);
                        newsStr.append("</a></td></tr>");
                    }
                }
            }
            newsStr.append("</table>");
            return newsStr.toString();
        } catch (Exception e) {
            System.out.println("Error while crawling the news list:");
            e.printStackTrace();
            return "";
        }
    }

    // Write a string to a file under the directory named after the category
    public void writefile(String str, String filename, String type) {
        if (filename.contains(".cn/")) {
            // turn an article URL into a flat file name: everything after ".cn/" is kept
            // and path separators are replaced with underscores
            filename = type + "\\"
                    + filename.substring(filename.indexOf(".cn/") + 4).replace("/", "_");
        } else {
            filename = type + "\\" + filename;
        }
        File file = new File(filename);
        if (file.getParentFile() != null && !file.getParentFile().exists()) {
            // create the category directory if it does not exist
            file.getParentFile().mkdirs();
        }
        try {
            FileWriter writer = new FileWriter(filename);
            writer.write(str);
            writer.close();
            System.out.println("Generated news page " + filename);
        } catch (Exception e) {
            System.out.println("Error writing file " + filename + ":");
            e.printStackTrace();
        }
    }
}
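The NewsBean, NewsDao, and NewsDaoInf classes imported at the top are part of the full repository and are not shown in this excerpt. As a rough sketch of what the crawler expects from them, inferred only from how they are called above (the field names, Javadoc, and layout are my own guesses, not the original code):

package com.lxf.dao.inf;

import com.lxf.dao.bean.NewsBean;

/** The two operations SinaNews.getNews() relies on. */
public interface NewsDaoInf {
    /** true if an article with this title is already stored */
    boolean hasNews(String title);

    /** persist one crawled article */
    void add(NewsBean news);
}

package com.lxf.dao.bean;

/** Shaped to match the call new NewsBean(0, title, body, link, date, type). */
public class NewsBean {
    private int id;
    private String title;
    private String body;   // article HTML kept from <div id="artibody">
    private String link;   // article URL
    private String date;   // the 10-character date segment cut from the URL
    private String type;   // news category, e.g. "world"

    public NewsBean(int id, String title, String body, String link,
                    String date, String type) {
        this.id = id;
        this.title = title;
        this.body = body;
        this.link = link;
        this.date = date;
        this.type = type;
    }
    // getters and setters omitted
}

NewsDao in com.lxf.dao.imp would then be whatever JDBC or ORM implementation of NewsDaoInf the repository provides; the crawler itself only depends on the interface.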
