A Simple Website Crawler


Configuration files:

Static configuration

<?xml version="1.0" encoding="UTF-8"?>
<myenv>
	<!-- Qidian -->
	<qidiannet>
		<reStartSpan>1</reStartSpan><!-- interval between crawl runs -->
		<threadNum>4</threadNum><!-- number of worker threads to start -->
		<parseUrl>http://www.cc222.com/novel</parseUrl><!-- URL prefix of the site to crawl -->
		<classNm>com.crawl.parsehtml.ParseQidian</classNm><!-- parser class to instantiate -->
	</qidiannet>

	<!-- available jobs: music365net, zolmobilenet, pipinet, qidiannet, zhuangnet -->
	<startWorkName>qidiannet</startWorkName>

</myenv>

Dynamic (runtime) configuration

<?xml version="1.0" encoding="UTF-8"?>
<myenv>
  <qidiannet>
    <qidiannetDate>2010-02-09 18:48:54</qidiannetDate> 
    <startNum>723819</startNum>
    <endNum>1000000</endNum>
  </qidiannet>
</myenv>
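
Both files are read and written through XmlUtil, which the post never lists. Below is a minimal sketch, not the original implementation, built on the JDK's DOM API and matching the calls that appear later: the two getNodeText overloads, and updateDataXml, which rewrites startNum after every crawled page.

package com.util;

import java.io.File;

import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.transform.TransformerFactory;
import javax.xml.transform.dom.DOMSource;
import javax.xml.transform.stream.StreamResult;

import org.w3c.dom.Document;
import org.w3c.dom.Element;

public class XmlUtil {

	// reads a top-level node, e.g. getNodeText("startWorkName", file)
	public static String getNodeText(String nodeName, String file) throws Exception {
		return parse(file).getElementsByTagName(nodeName).item(0).getTextContent();
	}

	// reads a child node inside a site section, e.g. getNodeText("qidiannet", "threadNum", file)
	public static String getNodeText(String parent, String child, String file) throws Exception {
		Element section = (Element) parse(file).getElementsByTagName(parent).item(0);
		return section.getElementsByTagName(child).item(0).getTextContent();
	}

	// updates a child node and writes the document back to the same file;
	// synchronized because several consumer threads persist their progress concurrently
	public static synchronized void updateDataXml(String parent, String child,
			String value, String file) throws Exception {
		Document doc = parse(file);
		Element section = (Element) doc.getElementsByTagName(parent).item(0);
		section.getElementsByTagName(child).item(0).setTextContent(value);
		TransformerFactory.newInstance().newTransformer()
				.transform(new DOMSource(doc), new StreamResult(new File(file)));
	}

	private static Document parse(String file) throws Exception {
		return DocumentBuilderFactory.newInstance().newDocumentBuilder().parse(new File(file));
	}
}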



Consumer thread

package com.crawl;

import java.util.concurrent.BlockingQueue;

import org.apache.log4j.Logger;

import com.crawl.parsehtml.IParseHtml;
import com.model.Model;
import com.util.XmlUtil;


/**
 * Consumer that takes page ids off the queue and crawls them; runs as a thread
 * @author Administrator
 *
 */
public class CrawConsumer implements Runnable{
	Logger logger = Logger.getLogger(CrawConsumer.class);
	
	private String startWorkName;
	private BlockingQueue<Integer> queue;
	private IParseHtml parseHtml;
	private String url;
	private ICrawlComplete iCrawcomplete;
	private boolean isRunning;
	
	/**
	 * @param startWorkName name of the site job to crawl
	 * @param queue queue of page ids to crawl
	 * @param parseHtml parser used to extract data from each page
	 * @param url URL prefix of the site to crawl
	 * @param iCrawcomplete callback invoked after each page is crawled
	 */
	public CrawConsumer(String startWorkName,BlockingQueue<Integer> queue,IParseHtml parseHtml,String url, ICrawlComplete iCrawcomplete) {
		super();
		this.queue = queue;
		this.parseHtml = parseHtml;
		this.url=url;
		this.startWorkName=startWorkName;
		this.iCrawcomplete=iCrawcomplete;
		this.isRunning=true;
	}

	@Override
	public void run() {
		Integer data = null;
		while (!Thread.currentThread().isInterrupted() && isRunning)
		{
			//System.out.println("running thread count: " + Thread.getAllStackTraces().size());
			try {
				// BlockingQueue is thread-safe, so no extra synchronization is needed here
				data = queue.take();

				// crawl the page; page URLs have the form <prefix>/<id>.html
				Model model = parseHtml.extract(url + "/" + data + ".html");

				// invoke the callbacks
				if (model != null)
				{
					iCrawcomplete.save(true, model);
					iCrawcomplete.otherDeal(true, model);
				}
				else
				{
					iCrawcomplete.save(false, null);
					iCrawcomplete.otherDeal(false, null);
				}

				// persist the latest crawled id to the runtime file
				XmlUtil.updateDataXml(startWorkName, "startNum", data + "", CrawlMain.crawl_RunDate);
				//System.out.println(Thread.currentThread().getName() + " crawling " + url + data);

				if (queue.isEmpty()) {
					// queue drained: wait a while in case more ids appear, then stop this thread
					Thread.sleep(360 * 1000);
					isRunning = false;
					break;
				}

				// throttle between pages
				Thread.sleep(1000);
			} catch (InterruptedException e) {
				// restore the interrupt flag so the loop condition ends the thread
				Thread.currentThread().interrupt();
			} catch (Exception e) {
				logger.error("crawling " + url + data + " failed", e);
			}
		}
	}

}
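
Both CrawConsumer above and ParseQidian below depend on the IParseHtml interface, which the post never lists. Its signature is pinned down by ParseQidian's @Override on extract, so it is presumably just:

package com.crawl.parsehtml;

import com.model.Model;

/**
 * Inferred sketch: extracts a Model from the page at the given URL,
 * returning null when the page is missing or cannot be parsed.
 */
public interface IParseHtml {
	Model extract(String url);
}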

Callback interface

package com.crawl;

import com.model.Model;

/**
 * Callback interface invoked after a page has been crawled
 * @author gt
 *
 */
public interface ICrawlComplete {
	/**
	 * Save the extracted data after a crawl
	 * @param isSuc whether the crawl succeeded
	 * @param model the extracted data (null on failure)
	 * @return whether the save succeeded
	 */
	public boolean save(boolean isSuc, Model model);

	/**
	 * Additional processing after a crawl
	 * @param isSuc whether the crawl succeeded
	 * @param model the extracted data (null on failure)
	 * @return whether the processing succeeded
	 */
	public boolean otherDeal(boolean isSuc, Model model);
}
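
The post does not include an ICrawlComplete implementation. A minimal hypothetical one (the class name LoggingCrawlComplete is mine) that logs successes and ignores failures:

package com.crawl;

import org.apache.log4j.Logger;

import com.model.Model;

public class LoggingCrawlComplete implements ICrawlComplete {
	private static final Logger logger = Logger.getLogger(LoggingCrawlComplete.class);

	@Override
	public boolean save(boolean isSuc, Model model) {
		if (isSuc) {
			// a real implementation would persist the model here, e.g. via NovelService
			logger.info("saved: " + model);
			return true;
		}
		return false;
	}

	@Override
	public boolean otherDeal(boolean isSuc, Model model) {
		// no additional processing in this sketch
		return isSuc;
	}
}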

Parser class

package com.crawl.parsehtml;

import java.text.SimpleDateFormat;
import java.util.Date;

import org.htmlparser.NodeFilter;
import org.htmlparser.Parser;
import org.htmlparser.filters.AndFilter;
import org.htmlparser.filters.HasAttributeFilter;
import org.htmlparser.filters.TagNameFilter;
import org.htmlparser.tags.ImageTag;
import org.htmlparser.util.NodeList;

import com.config.Config;
import com.model.Model;
import com.model.Novel;
import com.model.NovelType;
import com.util.DateUtil;
import com.util.FileUtil;

public class ParseQidian implements IParseHtml {
	
	@Override
	public Model extract(String url) {
		Novel novel=null;
		try {
		    novel=new Novel();

			NodeFilter nameFilter = new AndFilter(new TagNameFilter("div"),
					new HasAttributeFilter("class", "bookTitle"));
			
	        NodeFilter imageFilter = new AndFilter(new TagNameFilter("img"),
	                new HasAttributeFilter("width", "200"));
	        
	        NodeFilter outlineFilter = new AndFilter(new TagNameFilter("p"),
	                new HasAttributeFilter("class", "gray"));
	        
	        NodeFilter typeFilter = new AndFilter(new TagNameFilter("span"),
	                new HasAttributeFilter("class", "blue"));
	        
	        NodeFilter clickPointFilter = new AndFilter(new TagNameFilter("span"),
	                new HasAttributeFilter("class", "red"));
	        
	        NodeFilter hotFilter = new AndFilter(new TagNameFilter("span"),
	                new HasAttributeFilter("class", "blue"));
	        
			Parser parser = new Parser(url);
			parser.setEncoding("utf-8");
			// name and author
			NodeList nodes = parser.parse(nameFilter);
			// the title node looks like 《小渔村》文/刘皇弟: title inside 《》, author after 文/
			String parseStr = nodes.elementAt(0).toPlainTextString();
			String name = parseStr.split("《")[1].split("》")[0];
			String author = parseStr.split("/")[1];
			novel.setName(name);
			novel.setAuthor(author);
            //cover image
            parser.reset();
            nodes = parser.parse(imageFilter);
            String imgPath = "";
            if (nodes.size() != 0)
            {
                ImageTag profileTag = (ImageTag) nodes.elementAt(0);
                imgPath = "http://www.cc222.com" + profileTag.getAttribute("src");
            }
            novel.setImgPath(imgPath);
            //synopsis
            parser.reset();
            nodes = parser.parse(outlineFilter);
            String outline = nodes.elementAt(0).toPlainTextString();
            novel.setOutline(outline);
            //genre
            parser.reset();
            nodes = parser.parse(typeFilter);
            String typeName = nodes.elementAt(0).toPlainTextString();
            NovelType novelType = new NovelType();
            novelType.setName(typeName);
            novel.setNovelType(novelType);
            //click count
            parser.reset();
            nodes = parser.parse(clickPointFilter);
            String clickPoint = nodes.elementAt(0).toPlainTextString();
            novel.setClickPoint(Long.parseLong(clickPoint));
            //popularity: the third span with class "blue" holds text like "1234次" ("次" = times)
            parser.reset();
            nodes = parser.parse(hotFilter);
            String hot = nodes.elementAt(2).toPlainTextString().split("次")[0];
            novel.setHot(Long.parseLong(hot));
            //remaining fields
            novel.setUpdateTime(DateUtil.convert_DateToTimestamp(new Date()));
            novel.setLink_url(url);
            
			System.out.println(name + ":" + author + ":" + imgPath + ":" + typeName);
            
            

			SimpleDateFormat sDateFormat = new SimpleDateFormat(
					"yyyy-MM-dd HH:mm:ss");
			FileUtil.writeFile(sDateFormat.format(new Date()) + ":"
					+ Thread.currentThread().getName() + "parse:" + url
					+ "......Suc",Config.PROJECT_PATH+"\\com\\crawl\\log\\log.txt");

		} catch (Exception e) {
			// the page does not exist or could not be parsed; treat it as a miss
			return null;
		}
		return novel;
	}
	
	public static void main(String[] args)
	{
		ParseQidian parse=new ParseQidian();
		parse.extract("http://www.cc222.com/novel/799274.html");
	}

}
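
FileUtil and DateUtil are not listed either. DateUtil.convert_DateToTimestamp is presumably just new Timestamp(date.getTime()); for FileUtil.writeFile, a minimal append-mode sketch consistent with the log call above:

package com.util;

import java.io.FileWriter;
import java.io.PrintWriter;

public class FileUtil {
	// appends one line to the given file; a sketch, since the real FileUtil is not shown
	public static void writeFile(String content, String path) {
		PrintWriter out = null;
		try {
			out = new PrintWriter(new FileWriter(path, true)); // true = append mode
			out.println(content);
		} catch (Exception e) {
			e.printStackTrace();
		} finally {
			if (out != null) {
				out.close();
			}
		}
	}
}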


Main program

/**
	 * Start the crawler
	 * @param iCrawcomplete callback invoked after each page is crawled
	 * @throws Exception
	 */
	public void start(ICrawlComplete iCrawcomplete) throws Exception{
		if(!isRunable)
		{
			logger.info("crawler started");
			isRunable=true;
			//get the list of jobs to start
			String[] startWorkQueue= XmlUtil.getNodeText("startWorkName", crawl_Config).split(",");
			int startWorkNum=startWorkQueue.length;
			IParseHtml instance=null;
			for(int i=0;i<startWorkNum;i++)
			{
				//job name
				String startWorkName=startWorkQueue[i];
				//parser class name for this job
				String classType=XmlUtil.getNodeText(startWorkName,"classNm", crawl_Config);
				//number of worker threads
				Integer threadNum=Integer.parseInt(XmlUtil.getNodeText(startWorkName,"threadNum", crawl_Config));
				//URL prefix
				String parseUrl=XmlUtil.getNodeText(startWorkName,"parseUrl", crawl_Config);

				Integer startNum=Integer.parseInt(XmlUtil.getNodeText(startWorkName,"startNum", crawl_RunDate));
				Integer endNum=Integer.parseInt(XmlUtil.getNodeText(startWorkName,"endNum", crawl_RunDate));
				//crawl interval between runs (unused in version 1.0)
				//String reStartSpan=XmlUtil.getNodeText(startWorkName,"reStartSpan", crawl_Config);

				//use reflection to instantiate the parser class by name
				instance=(IParseHtml)Class.forName(classType).newInstance();

				//build the queue of page ids to crawl
				LinkedBlockingQueue<Integer> queue = new LinkedBlockingQueue<Integer>();
				for(int j=startNum;j<endNum;j++)
				{
					queue.offer(j);
				}

				//create an unbounded thread pool that reclaims idle threads
		        ExecutorService threadPool = Executors.newCachedThreadPool();
		        //create the consumer task; the same instance is shared by all threads,
		        //which is safe because the BlockingQueue hands each id to exactly one thread
		        CrawConsumer consumer=new CrawConsumer(startWorkName,queue,instance,parseUrl,iCrawcomplete);

		        //start threadNum worker threads
		        for(int z=0;z<threadNum;z++)
		        	threadPool.execute(consumer);
			}
		}
		else
		{
			logger.warn("crawler failed to start: already running");
		}
	}
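
The post shows only the start() method of CrawlMain. A hypothetical entry point wiring it to the sample callback sketched earlier (the main method itself is an assumption, not part of the original code):

	// hypothetical entry point, added to CrawlMain; crawl_Config and crawl_RunDate
	// are assumed to point at the two XML files shown at the top of this post
	public static void main(String[] args) throws Exception {
		CrawlMain crawl = new CrawlMain();
		// LoggingCrawlComplete is the sample callback sketched after the interface above
		crawl.start(new LoggingCrawlComplete());
	}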


