自己写的一个针对特定网站的网络爬虫,初学者,大家瞧瞧,不好的地方,给点建议!谢谢!

以下是全部代码 

嗯,其中还测试了log4j在非web项目中的使用(需要log4j的配置文件:log4j.properties或者log4j.xml)。

在代码中加载配置文件、获得logger即可。详细代码如下:

package net.rytong.myspider;

import java.io.File;
import java.io.FileOutputStream;
import java.io.OutputStream;
import java.util.HashSet;
import java.util.Set;

import org.apache.log4j.Logger;
import org.apache.log4j.PropertyConfigurator;
import org.htmlparser.Node;
import org.htmlparser.NodeFilter;
import org.htmlparser.Parser;
import org.htmlparser.filters.HasAttributeFilter;
import org.htmlparser.filters.NodeClassFilter;
import org.htmlparser.filters.OrFilter;
import org.htmlparser.tags.LinkTag;
import org.htmlparser.tags.TitleTag;
import org.htmlparser.util.NodeList;
/**
 * 
 * @author zhou_dong
 * 下载鬼吹灯的电子书所用的网络爬虫
 *
 */
public class MyHtmlParser {
	// Set of URLs that have already been crawled (field name is a typo for "visitedLinks";
	// kept as-is because other methods in this class reference it).
	// NOTE(review): raw HashSet is not thread-safe, yet the worker threads started by
	// myThread appear to mutate it via myImportTxt — confirm and synchronize if so.
	private static Set vistedLinks = new HashSet();
	// Count of chapter pages that failed to download/parse (incremented in myImportTxt's catch block).
	private static Integer failureCount = 0;
	//判断是否已经访问过了
	private static boolean judgeUrl(String url)
	{
		boolean flag = false;
		if(vistedLinks != null && vistedLinks.size() > 0)
		{
			if(vistedLinks.contains(url))
			{
				flag = true;
			}
		}
		return flag;
	}
	/**
	 * Configures log4j from an external properties file and returns a logger for this class.
	 *
	 * NOTE(review): the configuration path is a hard-coded absolute path; consider loading
	 * log4j.properties from the classpath instead so the program is portable.
	 * NOTE(review): this method is called once per saved chapter (see myImportTxt), so
	 * log4j is reconfigured repeatedly — harmless but wasteful; a one-time static
	 * initialization would be preferable.
	 *
	 * @return a log4j {@link Logger} named after this class
	 */
	public static Logger loadLog4j()
	{
		PropertyConfigurator.configure("E:/rytong/myeclipsework/my_spider/config/log4j.properties");
		// Use the conventional per-class logger instead of Logger.getLogger("") — the
		// empty-name logger is anonymous in log output and is not the root logger.
		return Logger.getLogger(MyHtmlParser.class);
	}
	/**
	 * Entry point: collects every chapter URL from the book's index page and
	 * downloads them with a small pool of worker threads.
	 */
	public static void main(String[] args)  {
		// Index page of the book on bxwx.org; all chapter links are harvested from it.
		String indexUrl = "http://www.bxwx.org/b/3/3870/";
		Set chapterUrls = getAUrl(indexUrl);
		System.out.println("总url数:" + chapterUrls.size());
		// Hand the collected URLs to 5 download threads.
		myThread(chapterUrls, 5);
	}
	/**
	 * Starts {@code count} worker threads to download the chapter pages.
	 *
	 * NOTE(review): the body below is corrupted — when this post was published, every
	 * '&lt;' character (and the text following it on the same line) was stripped, so the
	 * for-loop header and most of the Runnable body are missing. The surviving
	 * fragments (the "已启动"/"started" message, the InterruptedException handler)
	 * suggest each thread printed a startup message and presumably called
	 * myImportTxt(aUrl) — recover the original source before compiling.
	 *
	 * @param aUrl  set of chapter URLs to download
	 * @param count number of worker threads to start
	 */
	private static void myThread(final Set aUrl,Integer count) {
		
		for( int i=0;i"+Thread.currentThread().getName()+":已启动");
				                		
								}
		                    } catch (InterruptedException e) { 
		                        e.printStackTrace(); 
		                        continue;
		                    } 
		                } 
		            } 
		        }, "Thread"+i);
			thread.start();
			// System.out.println("线程"+i+":"+Thread.currentThread().getName()+":启动"); 
      	}
	}

	/**
	 * Reads each chapter's content and saves it to a text file under E:/rytong/mytext/test4/.
	 *
	 * NOTE(review): the loop header is corrupted — the line below that begins with
	 * "//for(int num=0;num" lost its '&lt;' and everything after it during publishing,
	 * including the real loop that declared {@code myUrl}; as shown, {@code myUrl} is
	 * undefined. The trailing {@code continue} and closing brace confirm a loop over
	 * {@code aUrl} existed. Recover the original loop before compiling.
	 *
	 * NOTE(review): {@code output} is opened once per chapter but never closed (the
	 * close() call is commented out at the bottom) — a file-handle leak; use
	 * try-with-resources. {@code myText.getBytes()} also uses the platform default
	 * charset — specify one explicitly since the pages are parsed as gb2312.
	 *
	 * @param aUrl set of chapter URLs to download and save
	 */
	private static void myImportTxt(Set aUrl) {
		// Filter matching the div whose id is "content" (holds the chapter body).
		HasAttributeFilter filter = new HasAttributeFilter("id", "content");
		// Combined filter: the page's title tag OR the content div.
		OrFilter divContext = new OrFilter(new NodeClassFilter(TitleTag.class),filter);
		OutputStream output = null;
		//PrintWriter pw = null;
		//InputStream input = null;
		int y = 1;
		//for(int num=0;num"+myUrl);
			try {
			Parser parser = new Parser(myUrl);
			
				parser.setEncoding("gb2312");
			// All nodes matching the combined filter (title + content div).
			NodeList list = parser.extractAllNodesThatMatch(divContext);
			StringBuffer text = new StringBuffer();
			for (int i = 0; i < list.size(); i++)
			{
				// Append each matched node's plain text, one per line.
				text = text.append(list.elementAt(i).toPlainTextString() + "\r\n");
			}
			String myText = text.toString();
			//System.out.println(myText);
			// NOTE(review): the 1024-byte pre-allocation is immediately discarded by the
			// reassignment on the next line — it can be removed.
			byte[] bytes = new byte[1024];
			bytes = myText.getBytes();
			long currentTimeMillis = System.currentTimeMillis();
			String bookTxt  = "E:/rytong/mytext/test4/"+"鬼吹灯"+y+"("+String.valueOf(currentTimeMillis)+").txt";
			output = new FileOutputStream(new File(bookTxt));
			//pw = new PrintWriter(output,true);
			//pw.write(myText);
			output.write(bytes, 0, bytes.length);
			output.flush();
			//System.out.println("被读取的url:"+myUrl);
			//System.out.println("E:/rytong/mytext/test3/"+"鬼吹灯"+y+"("+String.valueOf(currentTimeMillis)+").txt"+"导入成功");
			//System.out.println(bookTxt+"导入成功");
			// Obtain and use the log4j logger to record the successful save.
			Logger myLogger = loadLog4j();
			myLogger.info(bookTxt+"导入成功");
			y++;
			// Record this URL as visited.
			vistedLinks.add(myUrl);
			} catch (Exception e) {
				// TODO Auto-generated catch block
				e.printStackTrace();
				failureCount++;
				System.out.println(myUrl);
				continue;
			}
		}
			/*//pw.close();
			try {
				//output.close();
			} catch (IOException e) {
				e.printStackTrace();
			}*/
	}
	/**
	 * Collects all link URLs from the book's index page.
	 *
	 * NOTE(review): this method is truncated — the source is cut off mid-loop at the
	 * final "for(int i=0;i" line ('&lt;'-stripping during publishing), and the rest of
	 * the method plus the class's closing brace are missing. Recover the original
	 * before compiling.
	 *
	 * @param url the index page URL to parse
	 * @return the set of extracted URLs (as far as the visible code shows, {@code myUrls})
	 */
	//获取所有的url路径
	@SuppressWarnings("serial")
	public static Set getAUrl(String url)
	{
		Set myUrls = new HashSet();
		try {
			// Parse the index page.
			Parser parser = new Parser(url);
			// Pages on this site are encoded as gb2312.
			parser.setEncoding("gb2312");
			// Example NodeFilter matching <frame src=...> tags; the author notes it is
			// not actually needed here and is included only to demonstrate filter usage.
			NodeFilter frameFilter = new NodeFilter() {
				public boolean accept(Node node) {
					String text = node.getText();
					//System.out.println("frame 标签的"+text);
					if(text.startsWith("frame src = "))
					{
						return true;
					}else{
					return false;
					}
				}
			};
			// Accept either <a> link tags or the frame filter above.
			OrFilter linkFilter = new OrFilter(new NodeClassFilter(LinkTag.class),frameFilter);
			// All nodes matching the link filter.
			NodeList nodeList = parser.extractAllNodesThatMatch(linkFilter);
			for(int i=0;i



你可能感兴趣的:(java)