crawlerDemo

package com.mnmlist.crawler2;

import org.htmlparser.Node;
import org.htmlparser.NodeFilter;
import org.htmlparser.Parser;
import org.htmlparser.util.NodeList;
import org.htmlparser.tags.*;
import java.io.*;
import java.net.*;
import java.nio.*;
import java.nio.charset.StandardCharsets;
import java.util.List;
import javax.management.*;

public class ClawerDemo {
	/* 月份文章的月份名称/月份存档URL对的列表 */
	final static AttributeList monthIndexList = new AttributeList();
	/* 每月文章名称/每月文章的URL对的列表 */
	final static AttributeList monthArticleList = new AttributeList();
	/* 每篇文章图片本地存档地址/每篇文章图片URL对的列表 或者存储文章标题,作者的姓名*/
	final static AttributeList imageResourceList = new AttributeList();
	/* 保存月份以及该月文章本地存档的列表,用于生成目录 */
	static AttributeList storeMonthList = new AttributeList();
	/* 用于生成本地存档目录的writer */
	static OutputStreamWriter dirIndexWirter = null;
	static String proxy_addr = null;
	static int proxy_port = 3128;

	/*
	 * @param url 网页的URL
	 * 
	 * @param type 类型:1为文本,0为二进制
	 * 
	 * @return 内容的字节数组
	 */
	public static byte[] getContent(String url, int type) {
		byte ret[] = null;
		try {
			HttpURLConnection conn = null;
			InputStream urlStream = null;
			URL surl = new URL(url);
			int j = -1;
			if (proxy_addr != null) {
				InetSocketAddress soA = new InetSocketAddress(
						InetAddress.getByName(proxy_addr), proxy_port);
				Proxy proxy = new Proxy(Proxy.Type.HTTP, soA);
				conn = (HttpURLConnection) surl.openConnection(proxy);
			} else {
				conn = (HttpURLConnection) surl.openConnection();
			}
			/* 必须加上这一句伪装成Mozilla浏览器,否则CSDN会拒绝连接 */
			conn.setRequestProperty("User-Agent", "Mozilla/4.0");
			conn.connect();
			urlStream = conn.getInputStream();
			if (type == 1) {
				StringBuilder sBuilder = new StringBuilder();
				BufferedReader reader = new BufferedReader(
						new InputStreamReader(urlStream, "UTF-8"));
				char[] arr = new char[1024];
				while ((j = reader.read(arr)) != -1)
					sBuilder.append(arr, 0, j);
				ret = sBuilder.toString().getBytes();
			} else {
				/* CSDN允许最大图片有上限 */
				byte imgByte[] = new byte[1024];
				ByteBuffer buffer = ByteBuffer.allocate(5000000);
				while ((j = urlStream.read(imgByte)) != -1) {
					buffer.put(imgByte, 0, j);
				}
				ret = buffer.array();
			}
		} catch (Exception e) {
			e.printStackTrace();
			// 追加出错日志
		}
		return ret;
	}

	/*
	 * @param path 文件路径
	 * 
	 * @param url 文章在blog上的URL
	 * 
	 * @param articles 保存本月存档的列表
	 * 
	 * @return 无
	 */
	public static void handleHtml(String path, String url,
			AttributeList articles) {// 用户名/月份,月份的url或文章的url,文章的列表
		try {
			StringBuffer text = new StringBuffer();
			NodeList nodes = handleText(new String(getContent(url, 1)), 3);
			Node node = nodes.elementAt(0);
			String title = (String) ((List<Attribute>) imageResourceList
					.asList()).get(0).getValue();

			String filepath = path + "/" + title;
			List<Attribute> li = imageResourceList.asList();
			/* 加入meta信息 */
			text.append(new String(
					"<meta http-equiv=\"Content-Type\" content=\"text/html; chaset=utf-8\"/>"));
			text.append("\r\n");
			text.append("<h1>" + title + "</h1>");
			text.append("\r\n");
			if (node != null) {
				Div dv = (Div) node;
				text.append(new String(dv.toHtml().getBytes("UTF-8"), "UTF-8"));
				text.append("\r\n");
			} else {
				text.append("<h3>Download error</h3>");
				text.append("\r\n");
			}

			ClawerDemo.makeDir(filepath + "_files");
			articles.add(new Attribute(filepath.split("/", 2)[1], title));

			for (int i = 1; i < li.size(); i++) {
				byte[] imgString = getContent((String) li.get(i).getValue(), 0);
				ClawerDemo.writeFile(filepath + "_files/" + li.get(i).getName()
						+ ".gif", imgString);
			}
			imageResourceList.clear();
			ClawerDemo
					.writeFile(filepath + ".html", text.toString().getBytes());
		} catch (Exception e) {
			// 追加出错日志
			e.printStackTrace();
		}
	}

	/*
	 * @param input 输入的html文档字符串
	 * 
	 * @param skip 是否执行的类别
	 * 
	 * @return 匹配的链表,很多类别通过副作用而起作用
	 */
	public static NodeList handleText(String input, final int skip)
			throws Exception {
		Parser parser = Parser.createParser(input, "UTF-8");
		NodeList nodes = parser.extractAllNodesThatMatch(new NodeFilter() {
			public boolean accept(Node node) {
				if (node instanceof Div) {
					Div dv = (Div) node;
					NodeList nlist = dv.getChildren();
					if (dv.getAttribute("id") != null && nlist != null) {
						if (dv.getAttribute("id").equalsIgnoreCase(
								"article_content")// 文章
								&& skip == 3) {
							parseImg(nlist, 0);
							return true;
						} else if (dv.getAttribute("id").equalsIgnoreCase(
								"article_details")// 文章
								&& skip == 3) {
							parseTitle(nlist, 0);
						} else if (dv.getAttribute("id").equalsIgnoreCase(
								"archive_list")// 月份信息及对应的url
								&& (skip == 1 || skip == 4)) {
							parseMonthArticle(nlist, 0);
						} else if (dv.getAttribute("id").equalsIgnoreCase(
								"papelist")// 分页信息
								&& skip == 2) {
							parsePage(nlist, 0);// from parseMonth
						} else if (dv.getAttribute("id").equalsIgnoreCase(
								"blog_title")// 总目录,月份目录,博客文章都有
								&& skip == 4) {
							parseAuthor(nlist, 0);
						}
					}
					if (dv.getAttribute("class") != null && nlist != null) {
						if (dv.getAttribute("class").equalsIgnoreCase(
								"article_title")// 均有
								&& skip == 2) {
							parsePerArticle(nlist, 0);
						}
					}
				}
				return false;
			}
		});
		return nodes;
	}

	/*
	 * @param nlist HTML正文的子标签链表
	 * 
	 * @param index 用于索引图片的个数以及当前的图片数
	 * 
	 * @return 当前的图片数
	 */
	public static int parseImg(NodeList nlist, int index) {
		Node img = null;
		int count = nlist.size();
		for (int i = 0; i < count; i++) {
			img = nlist.elementAt(i);
			if (img instanceof ImageTag) {
				ImageTag imgtag = (ImageTag) img;
				if (!imgtag.isEndTag()) {
					String title = (String) ((List<Attribute>) imageResourceList
							.asList()).get(0).getValue();
					/* 将图片的URL映射成本地路径 */
					imageResourceList.add(new Attribute("" + index, new String(
							imgtag.extractImageLocn().getBytes())));
					title = title.trim();
					imgtag.setImageURL(title + "_files/" + index + ".gif");
					/* 递增本地路径序列 */
					index++;
				}
			} else {
				NodeList slist = img.getChildren();
				if (slist != null && slist.size() > 0) {
					index = ClawerDemo.parseImg(slist, index);
				}
			}
		}
		return index;
	}

	/*
	 * @param nlist HTML月份存档的子标签链表
	 * 
	 * @param index 无用
	 * 
	 * @return 无用
	 */
	public static int parseMonthArticle(NodeList nlist, int index) {
		Node atls = null;
		int count = nlist.size();
		for (int i = 0; i < count; i++) {
			atls = nlist.elementAt(i);
			if (atls instanceof LinkTag) {
				LinkTag link = (LinkTag) atls;
				monthIndexList.add(new Attribute(link.getLinkText(), link
						.extractLink()));
			} else {
				NodeList slist = atls.getChildren();
				if (slist != null && slist.size() > 0) {
					index = ClawerDemo.parseMonthArticle(slist, index);
				}
			}
		}
		return index;
	}

	/*
	 * @param nlist HTML标题的子标签链表
	 * 
	 * @param index 无用
	 * 
	 * @return 无用
	 */
	public static int parseTitle(NodeList nlist, int index) {
		Node tit = null;
		int count = nlist.size();
		for (int i = 0; i < count; i++) {
			tit = nlist.elementAt(i);
			if (tit instanceof Span) {
				Span span = (Span) tit;
				if (span.getAttribute("class") != null
						&& span.getAttribute("class").equalsIgnoreCase(
								"link_title")) {
					LinkTag link = (LinkTag) span.childAt(0);
					String title = link.getLinkText();
					/* 将文件名中不允许的字符替换成允许的字符 */
					title = title.replace('/', '-');
					title = title.trim();
					title = title.replace(' ', '-');
					imageResourceList.add(new Attribute("title", title));
				}
			} else {
				NodeList slist = tit.getChildren();
				if (slist != null && slist.size() > 0) {
					index = ClawerDemo.parseTitle(slist, index);
				}
			}
		}
		return index;
	}

	/*
	 * @param nlist HTML每月份存档的子标签链表
	 * 
	 * @param index 无用
	 * 
	 * @return 无用
	 */
	public static int parsePerArticle(NodeList nlist, int index) {
		Node atl = null;
		int count = nlist.size();
		for (int i = 0; i < count; i++) {
			atl = nlist.elementAt(i);
			if (atl instanceof Span) {
				Span span = (Span) atl;
				if (span.getAttribute("class") != null
						&& span.getAttribute("class").equalsIgnoreCase(
								"link_title")) {
					LinkTag link = (LinkTag) span.childAt(0);
					monthArticleList.add(new Attribute(link.getLinkText(),
							"http://blog.csdn.net" + link.extractLink()));
				}
			} else {
				NodeList slist = atl.getChildren();
				if (slist != null && slist.size() > 0) {
					index = ClawerDemo.parsePerArticle(slist, index);
				}
			}
		}
		return index;
	}

	/*
	 * @param nlist HTML分页显示标签的子标签链表
	 * 
	 * @param index 无用
	 * 
	 * @return 无用
	 */
	public static int parsePage(NodeList nlist, int index) {// from parseMonth
		Node pg = null;
		int count = nlist.size();
		for (int i = 0; i < count; i++) {
			pg = nlist.elementAt(i);
			if (pg instanceof LinkTag) {
				LinkTag lt = (LinkTag) pg;
				if (lt.getLinkText().equalsIgnoreCase("下一页")) {
					try {
						ClawerDemo.handleText(
								new String(ClawerDemo.getContent(
										"http://blog.csdn.net"
												+ lt.extractLink(), 1)), 2);
					} catch (Exception e) {
						// 追加出错日志
					}
				}
			}
		}
		return index;
	}

	/*
	 * @param nlist HTML作者信息标签的子标签链表
	 * 
	 * @param index 无用
	 * 
	 * @return 无用
	 */
	public static int parseAuthor(NodeList nlist, int index) {
		Node aut = null;
		int count = nlist.size();
		for (int i = 0; i < count; i++) {
			aut = nlist.elementAt(i);
			if (aut instanceof LinkTag) {
				LinkTag link = (LinkTag) aut;
				imageResourceList.add(new Attribute("author", link
						.getLinkText()));
			} else {
				NodeList slist = aut.getChildren();
				if (slist != null && slist.size() > 0) {
					index = ClawerDemo.parseAuthor(slist, index);
				}
			}
		}
		return index;
	}

	/*
	 * @param filepath 本地存档的路径
	 * 
	 * @param url 保存本月存档的网页的URL
	 * 
	 * @param articles 保存本月存档的链表
	 * 
	 * @return 无
	 */
	public static void parseMonth(String filepath, String url,
			AttributeList articles) {
		List<Attribute> li = monthArticleList.asList();
		try {
			handleText(new String(getContent(url, 1)), 2);// get the article
															// links of a month
		} catch (Exception e) {
			// 追加出错日志
		}
		ClawerDemo.makeDir(filepath);
		for (int i = 0; i < li.size(); i++) {
			handleHtml(filepath, (String) li.get(i).getValue(), articles);
			try {
				/* 慢一点,否则会被认为是恶意行为 */
				Thread.sleep(500);
			} catch (Exception e) {
			}
		}
		monthArticleList.clear();
	}

	/*
	 * @param url blog入口文章的URL
	 * 
	 * @return 无
	 */
	public static void parseAll(String url) {
		try {
			String author = null;
			handleText(new String(getContent(url, 1)), 4);// parseMonthArticle
															// ->
															// monthIndexList,
			// parseAuthor->imageResourceList->author
			author = (String) ((List<Attribute>) imageResourceList.asList())
					.get(0).getValue();
			imageResourceList.clear();
			ClawerDemo.makeDir(author);
			List<Attribute> li = monthIndexList.asList();// 获得月份的文本和链接
			for (int i = 0; i < li.size(); i++) {
				AttributeList articles = new AttributeList();
				storeMonthList
						.add(new Attribute(li.get(i).getName(), articles));// 月份和月份对应的文章
				parseMonth(author + "/" + li.get(i).getName(),// 作者\月份,链接,文章列表
						(String) li.get(i).getValue(), articles);
			}
			handleIndex(author);
		} catch (Exception e) {
			e.printStackTrace();
		}
		monthIndexList.clear();
	}

	/*
	 * @param dir 本地存档根路径名称
	 * 
	 * @return 无
	 */
	static void handleIndex(String dir) {
		try {
			dirIndexWirter = new OutputStreamWriter(new FileOutputStream(dir
					+ "/index.html"), "GB18030");
			String header = "<html><head><meta http-equiv=\"Content-Type\" content=\"text/html; charset=utf-8\"><title>CSDN文章归档</title></head><body bgcolor=\"white\" text=\"black\" link=\"#0000FF\" vlink=\"#840084\" alink=\"#0000FF\"><hr></div><div><h1 class=\"title\"><a name=\"id2747881\"></a>"
					+ dir
					+ "CSDN文章归档</h1></div></div><hr></div><div class=\"toc\"><p><b>目录</b></p><dl><dt><span class=\"preface\"><a href=\"preface.html\">摘要</a></span></dt>";
			String tailer = "</div></div><hr></body></html>";
			dirIndexWirter.write(header);

			List<Attribute> li = storeMonthList.asList();
			for (int i = 0; i < li.size(); i++) {
				String mindex = "<dt><span class=\"part\"><h4>"
						+ li.get(i).getName() + "</span></dt><dd><dl>";
				AttributeList articles = (AttributeList) li.get(i).getValue();
				List<Attribute> al = articles.asList();
				dirIndexWirter.write(mindex);
				for (int j = 0; j < al.size(); j++) {
					String per = "<dt><span class=\"part\"><a href=\""
							+ al.get(j).getName() + ".html\">"
							+ al.get(j).getValue() + "</a></span></dt>";
					dirIndexWirter.write(per);
				}
				dirIndexWirter.write("</dl></dd>");
			}
			dirIndexWirter.write(tailer);
			dirIndexWirter.close();
		} catch (Exception e) {
		}
	}

	/*
	 * @param path 文件路径
	 * 
	 * @param content 文件内容的字节数组
	 * 
	 * @return 成功或者失败
	 */
	public static boolean writeFile(String path, byte[] content) {
		try {
			FileOutputStream osw = new FileOutputStream(path);
			osw.write(content);
			osw.close();
		} catch (Exception e) {
			e.printStackTrace();
			// 追加出错日志
			return false;
		}
		return true;
	}

	/*
	 * @param path 目录路径
	 * 
	 * @return 成功或者失败
	 */
	public static boolean makeDir(String path) {
		try {
			File fp = new File(path);
			if (!fp.exists()) {
				fp.mkdir();
			}
		} catch (Exception e) {
			e.printStackTrace();
			// 追加出错日志
			return false;
		}
		return true;
	}

	/*
	 * @param args args[0]:blog入口文章的URL args[1]:代理地址 args[2]:代理端口 【用法:java
	 * DownBlog http://blog.csdn.net/dog250 192.168.40.199 808】
	 * 
	 * @return 无
	 */
	public static void main(String[] args) throws Exception {
		System.out.println("You can find the essays from:"
				+ System.getProperty("user.dir"));
		parseAll("http://blog.csdn.net/an_tao");
	}
}

你可能感兴趣的:(crawlerDemo)