package com.mnmlist.crawler2; import org.htmlparser.Node; import org.htmlparser.NodeFilter; import org.htmlparser.Parser; import org.htmlparser.util.NodeList; import org.htmlparser.tags.*; import java.io.*; import java.net.*; import java.nio.*; import java.util.List; import javax.management.*; public class ClawerDemo { /* 月份文章的月份名称/月份存档URL对的列表 */ final static AttributeList monthIndexList = new AttributeList(); /* 每月文章名称/每月文章的URL对的列表 */ final static AttributeList monthArticleList = new AttributeList(); /* 每篇文章图片本地存档地址/每篇文章图片URL对的列表 或者存储文章标题,作者的姓名*/ final static AttributeList imageResourceList = new AttributeList(); /* 保存月份以及该月文章本地存档的列表,用于生成目录 */ static AttributeList storeMonthList = new AttributeList(); /* 用于生成本地存档目录的writer */ static OutputStreamWriter dirIndexWirter = null; static String proxy_addr = null; static int proxy_port = 3128; /* * @param url 网页的URL * * @param type 类型:1为文本,0为二进制 * * @return 内容的字节数组 */ public static byte[] getContent(String url, int type) { byte ret[] = null; try { HttpURLConnection conn = null; InputStream urlStream = null; URL surl = new URL(url); int j = -1; if (proxy_addr != null) { InetSocketAddress soA = new InetSocketAddress( InetAddress.getByName(proxy_addr), proxy_port); Proxy proxy = new Proxy(Proxy.Type.HTTP, soA); conn = (HttpURLConnection) surl.openConnection(proxy); } else { conn = (HttpURLConnection) surl.openConnection(); } /* 必须加上这一句伪装成Mozilla浏览器,否则CSDN会拒绝连接 */ conn.setRequestProperty("User-Agent", "Mozilla/4.0"); conn.connect(); urlStream = conn.getInputStream(); if (type == 1) { StringBuilder sBuilder = new StringBuilder(); BufferedReader reader = new BufferedReader( new InputStreamReader(urlStream, "UTF-8")); char[] arr = new char[1024]; while ((j = reader.read(arr)) != -1) sBuilder.append(arr, 0, j); ret = sBuilder.toString().getBytes(); } else { /* CSDN允许最大图片有上限 */ byte imgByte[] = new byte[1024]; ByteBuffer buffer = ByteBuffer.allocate(5000000); while ((j = urlStream.read(imgByte)) != -1) { buffer.put(imgByte, 0, j); } ret = buffer.array(); } } catch (Exception e) { e.printStackTrace(); // 追加出错日志 } return ret; } /* * @param path 文件路径 * * @param url 文章在blog上的URL * * @param articles 保存本月存档的列表 * * @return 无 */ public static void handleHtml(String path, String url, AttributeList articles) {// 用户名/月份,月份的url或文章的url,文章的列表 try { StringBuffer text = new StringBuffer(); NodeList nodes = handleText(new String(getContent(url, 1)), 3); Node node = nodes.elementAt(0); String title = (String) ((List<Attribute>) imageResourceList .asList()).get(0).getValue(); String filepath = path + "/" + title; List<Attribute> li = imageResourceList.asList(); /* 加入meta信息 */ text.append(new String( "<meta http-equiv=\"Content-Type\" content=\"text/html; chaset=utf-8\"/>")); text.append("\r\n"); text.append("<h1>" + title + "</h1>"); text.append("\r\n"); if (node != null) { Div dv = (Div) node; text.append(new String(dv.toHtml().getBytes("UTF-8"), "UTF-8")); text.append("\r\n"); } else { text.append("<h3>Download error</h3>"); text.append("\r\n"); } ClawerDemo.makeDir(filepath + "_files"); articles.add(new Attribute(filepath.split("/", 2)[1], title)); for (int i = 1; i < li.size(); i++) { byte[] imgString = getContent((String) li.get(i).getValue(), 0); ClawerDemo.writeFile(filepath + "_files/" + li.get(i).getName() + ".gif", imgString); } imageResourceList.clear(); ClawerDemo .writeFile(filepath + ".html", text.toString().getBytes()); } catch (Exception e) { // 追加出错日志 e.printStackTrace(); } } /* * @param input 输入的html文档字符串 * * @param skip 是否执行的类别 * * @return 匹配的链表,很多类别通过副作用而起作用 */ public static NodeList handleText(String input, final int skip) throws Exception { Parser parser = Parser.createParser(input, "UTF-8"); NodeList nodes = parser.extractAllNodesThatMatch(new NodeFilter() { public boolean accept(Node node) { if (node instanceof Div) { Div dv = (Div) node; NodeList nlist = dv.getChildren(); if (dv.getAttribute("id") != null && nlist != null) { if (dv.getAttribute("id").equalsIgnoreCase( "article_content")// 文章 && skip == 3) { parseImg(nlist, 0); return true; } else if (dv.getAttribute("id").equalsIgnoreCase( "article_details")// 文章 && skip == 3) { parseTitle(nlist, 0); } else if (dv.getAttribute("id").equalsIgnoreCase( "archive_list")// 月份信息及对应的url && (skip == 1 || skip == 4)) { parseMonthArticle(nlist, 0); } else if (dv.getAttribute("id").equalsIgnoreCase( "papelist")// 分页信息 && skip == 2) { parsePage(nlist, 0);// from parseMonth } else if (dv.getAttribute("id").equalsIgnoreCase( "blog_title")// 总目录,月份目录,博客文章都有 && skip == 4) { parseAuthor(nlist, 0); } } if (dv.getAttribute("class") != null && nlist != null) { if (dv.getAttribute("class").equalsIgnoreCase( "article_title")// 均有 && skip == 2) { parsePerArticle(nlist, 0); } } } return false; } }); return nodes; } /* * @param nlist HTML正文的子标签链表 * * @param index 用于索引图片的个数以及当前的图片数 * * @return 当前的图片数 */ public static int parseImg(NodeList nlist, int index) { Node img = null; int count = nlist.size(); for (int i = 0; i < count; i++) { img = nlist.elementAt(i); if (img instanceof ImageTag) { ImageTag imgtag = (ImageTag) img; if (!imgtag.isEndTag()) { String title = (String) ((List<Attribute>) imageResourceList .asList()).get(0).getValue(); /* 将图片的URL映射成本地路径 */ imageResourceList.add(new Attribute("" + index, new String( imgtag.extractImageLocn().getBytes()))); title = title.trim(); imgtag.setImageURL(title + "_files/" + index + ".gif"); /* 递增本地路径序列 */ index++; } } else { NodeList slist = img.getChildren(); if (slist != null && slist.size() > 0) { index = ClawerDemo.parseImg(slist, index); } } } return index; } /* * @param nlist HTML月份存档的子标签链表 * * @param index 无用 * * @return 无用 */ public static int parseMonthArticle(NodeList nlist, int index) { Node atls = null; int count = nlist.size(); for (int i = 0; i < count; i++) { atls = nlist.elementAt(i); if (atls instanceof LinkTag) { LinkTag link = (LinkTag) atls; monthIndexList.add(new Attribute(link.getLinkText(), link .extractLink())); } else { NodeList slist = atls.getChildren(); if (slist != null && slist.size() > 0) { index = ClawerDemo.parseMonthArticle(slist, index); } } } return index; } /* * @param nlist HTML标题的子标签链表 * * @param index 无用 * * @return 无用 */ public static int parseTitle(NodeList nlist, int index) { Node tit = null; int count = nlist.size(); for (int i = 0; i < count; i++) { tit = nlist.elementAt(i); if (tit instanceof Span) { Span span = (Span) tit; if (span.getAttribute("class") != null && span.getAttribute("class").equalsIgnoreCase( "link_title")) { LinkTag link = (LinkTag) span.childAt(0); String title = link.getLinkText(); /* 将文件名中不允许的字符替换成允许的字符 */ title = title.replace('/', '-'); title = title.trim(); title = title.replace(' ', '-'); imageResourceList.add(new Attribute("title", title)); } } else { NodeList slist = tit.getChildren(); if (slist != null && slist.size() > 0) { index = ClawerDemo.parseTitle(slist, index); } } } return index; } /* * @param nlist HTML每月份存档的子标签链表 * * @param index 无用 * * @return 无用 */ public static int parsePerArticle(NodeList nlist, int index) { Node atl = null; int count = nlist.size(); for (int i = 0; i < count; i++) { atl = nlist.elementAt(i); if (atl instanceof Span) { Span span = (Span) atl; if (span.getAttribute("class") != null && span.getAttribute("class").equalsIgnoreCase( "link_title")) { LinkTag link = (LinkTag) span.childAt(0); monthArticleList.add(new Attribute(link.getLinkText(), "http://blog.csdn.net" + link.extractLink())); } } else { NodeList slist = atl.getChildren(); if (slist != null && slist.size() > 0) { index = ClawerDemo.parsePerArticle(slist, index); } } } return index; } /* * @param nlist HTML分页显示标签的子标签链表 * * @param index 无用 * * @return 无用 */ public static int parsePage(NodeList nlist, int index) {// from parseMonth Node pg = null; int count = nlist.size(); for (int i = 0; i < count; i++) { pg = nlist.elementAt(i); if (pg instanceof LinkTag) { LinkTag lt = (LinkTag) pg; if (lt.getLinkText().equalsIgnoreCase("下一页")) { try { ClawerDemo.handleText( new String(ClawerDemo.getContent( "http://blog.csdn.net" + lt.extractLink(), 1)), 2); } catch (Exception e) { // 追加出错日志 } } } } return index; } /* * @param nlist HTML作者信息标签的子标签链表 * * @param index 无用 * * @return 无用 */ public static int parseAuthor(NodeList nlist, int index) { Node aut = null; int count = nlist.size(); for (int i = 0; i < count; i++) { aut = nlist.elementAt(i); if (aut instanceof LinkTag) { LinkTag link = (LinkTag) aut; imageResourceList.add(new Attribute("author", link .getLinkText())); } else { NodeList slist = aut.getChildren(); if (slist != null && slist.size() > 0) { index = ClawerDemo.parseAuthor(slist, index); } } } return index; } /* * @param filepath 本地存档的路径 * * @param url 保存本月存档的网页的URL * * @param articles 保存本月存档的链表 * * @return 无 */ public static void parseMonth(String filepath, String url, AttributeList articles) { List<Attribute> li = monthArticleList.asList(); try { handleText(new String(getContent(url, 1)), 2);// get the article // links of a month } catch (Exception e) { // 追加出错日志 } ClawerDemo.makeDir(filepath); for (int i = 0; i < li.size(); i++) { handleHtml(filepath, (String) li.get(i).getValue(), articles); try { /* 慢一点,否则会被认为是恶意行为 */ Thread.sleep(500); } catch (Exception e) { } } monthArticleList.clear(); } /* * @param url blog入口文章的URL * * @return 无 */ public static void parseAll(String url) { try { String author = null; handleText(new String(getContent(url, 1)), 4);// parseMonthArticle // -> // monthIndexList, // parseAuthor->imageResourceList->author author = (String) ((List<Attribute>) imageResourceList.asList()) .get(0).getValue(); imageResourceList.clear(); ClawerDemo.makeDir(author); List<Attribute> li = monthIndexList.asList();// 获得月份的文本和链接 for (int i = 0; i < li.size(); i++) { AttributeList articles = new AttributeList(); storeMonthList .add(new Attribute(li.get(i).getName(), articles));// 月份和月份对应的文章 parseMonth(author + "/" + li.get(i).getName(),// 作者\月份,链接,文章列表 (String) li.get(i).getValue(), articles); } handleIndex(author); } catch (Exception e) { e.printStackTrace(); } monthIndexList.clear(); } /* * @param dir 本地存档根路径名称 * * @return 无 */ static void handleIndex(String dir) { try { dirIndexWirter = new OutputStreamWriter(new FileOutputStream(dir + "/index.html"), "GB18030"); String header = "<html><head><meta http-equiv=\"Content-Type\" content=\"text/html; charset=utf-8\"><title>CSDN文章归档</title></head><body bgcolor=\"white\" text=\"black\" link=\"#0000FF\" vlink=\"#840084\" alink=\"#0000FF\"><hr></div><div><h1 class=\"title\"><a name=\"id2747881\"></a>" + dir + "CSDN文章归档</h1></div></div><hr></div><div class=\"toc\"><p><b>目录</b></p><dl><dt><span class=\"preface\"><a href=\"preface.html\">摘要</a></span></dt>"; String tailer = "</div></div><hr></body></html>"; dirIndexWirter.write(header); List<Attribute> li = storeMonthList.asList(); for (int i = 0; i < li.size(); i++) { String mindex = "<dt><span class=\"part\"><h4>" + li.get(i).getName() + "</span></dt><dd><dl>"; AttributeList articles = (AttributeList) li.get(i).getValue(); List<Attribute> al = articles.asList(); dirIndexWirter.write(mindex); for (int j = 0; j < al.size(); j++) { String per = "<dt><span class=\"part\"><a href=\"" + al.get(j).getName() + ".html\">" + al.get(j).getValue() + "</a></span></dt>"; dirIndexWirter.write(per); } dirIndexWirter.write("</dl></dd>"); } dirIndexWirter.write(tailer); dirIndexWirter.close(); } catch (Exception e) { } } /* * @param path 文件路径 * * @param content 文件内容的字节数组 * * @return 成功或者失败 */ public static boolean writeFile(String path, byte[] content) { try { FileOutputStream osw = new FileOutputStream(path); osw.write(content); osw.close(); } catch (Exception e) { e.printStackTrace(); // 追加出错日志 return false; } return true; } /* * @param path 目录路径 * * @return 成功或者失败 */ public static boolean makeDir(String path) { try { File fp = new File(path); if (!fp.exists()) { fp.mkdir(); } } catch (Exception e) { e.printStackTrace(); // 追加出错日志 return false; } return true; } /* * @param args args[0]:blog入口文章的URL args[1]:代理地址 args[2]:代理端口 【用法:java * DownBlog http://blog.csdn.net/dog250 192.168.40.199 808】 * * @return 无 */ public static void main(String[] args) throws Exception { System.out.println("You can find the essays from:" + System.getProperty("user.dir")); parseAll("http://blog.csdn.net/an_tao"); } }