配置文件:
静态配置
<?xml version="1.0" encoding="UTF-8"?> <myenv> <!-- 起点 --> <qidiannet> <reStartSpan>1</reStartSpan><!-- 每次抓取间隔时间 --> <threadNum>4</threadNum><!-- 开启的线程数 --> <parseUrl>http://www.cc222.com/novel</parseUrl><!-- 要抓取的网站前缀 --> <classNm>com.crawl.parsehtml.ParseQidian</classNm><!-- 要抓取的类实例 --> </qidiannet> <!-- --> <!-- music365net,zolmobilenet,pipinet,qidiannet,zhuangnet --> <startWorkName>qidiannet</startWorkName> </myenv>
<?xml version="1.0" encoding="UTF-8"?> <myenv> <qidiannet> <qidiannetDate>2010-02-09 18:48:54</qidiannetDate> <startNum>723819</startNum> <endNum>1000000</endNum> </qidiannet> </myenv>
package com.crawl;

import java.util.concurrent.BlockingQueue;

import org.apache.log4j.Logger;

import com.crawl.parsehtml.IParseHtml;
import com.model.Model;
import com.util.XmlUtil;

/**
 * Crawl consumer thread: repeatedly takes a page id from the shared queue,
 * fetches and parses the corresponding page, and reports the outcome through
 * the {@link ICrawlComplete} callback.
 *
 * One instance of this Runnable is shared by several pool threads (see the
 * main program), so the stop flag is volatile; the queue itself is a
 * BlockingQueue and needs no extra synchronization.
 *
 * @author Administrator
 */
public class CrawConsumer implements Runnable {

    private static final Logger logger = Logger.getLogger(CrawConsumer.class);

    /** Grace period after the queue drains before a thread stops (ms). */
    private static final long DRAIN_WAIT_MILLIS = 360L * 1000L;
    /** Throttle between two consecutive fetches (ms). */
    private static final long FETCH_PAUSE_MILLIS = 1000L;

    /** Job name; also the key used to persist the resume point. */
    private final String startWorkName;
    /** Page ids still to crawl; shared by all consumer threads. */
    private final BlockingQueue<Integer> queue;
    /** Site-specific parser that turns a page URL into a Model. */
    private final IParseHtml parseHtml;
    /** URL prefix of the target site. */
    private final String url;
    /** Callback invoked after every page, successful or not. */
    private final ICrawlComplete iCrawcomplete;
    /** Cooperative stop flag; volatile because the Runnable is shared. */
    private volatile boolean isRunning;

    /**
     * @param startWorkName  name of the site/job to crawl
     * @param queue          queue of page ids to crawl
     * @param parseHtml      parser implementation for this site
     * @param url            URL prefix of the site
     * @param iCrawcomplete  callback invoked when a page has been processed
     */
    public CrawConsumer(String startWorkName, BlockingQueue<Integer> queue,
            IParseHtml parseHtml, String url, ICrawlComplete iCrawcomplete) {
        super();
        this.queue = queue;
        this.parseHtml = parseHtml;
        this.url = url;
        this.startWorkName = startWorkName;
        this.iCrawcomplete = iCrawcomplete;
        this.isRunning = true;
    }

    @Override
    public void run() {
        Integer data = null;
        while (!Thread.currentThread().isInterrupted() && isRunning) {
            try {
                data = queue.take();
                // BUGFIX: join with '/' — the original used "\\", which builds
                // an invalid URL (ParseQidian expects ".../novel/<id>.html").
                Model model = parseHtml.extract(url + "/" + data + ".html");
                if (model != null) {
                    iCrawcomplete.save(true, model);
                    iCrawcomplete.otherDeal(true, model);
                } else {
                    iCrawcomplete.save(false, null);
                    iCrawcomplete.otherDeal(false, null);
                }
                // Persist the last crawled id so a restart resumes from here.
                XmlUtil.updateDataXml(startWorkName, "startNum", data + "", CrawlMain.crawl_RunDate);
                if (queue.isEmpty()) {
                    // Queue drained: wait a grace period, then stop this thread.
                    try {
                        Thread.sleep(DRAIN_WAIT_MILLIS);
                    } catch (InterruptedException ignored) {
                        Thread.currentThread().interrupt(); // keep interrupt status
                    }
                    Thread.currentThread().interrupt();
                    isRunning = false;
                }
                Thread.sleep(FETCH_PAUSE_MILLIS);
            } catch (InterruptedException e) {
                // BUGFIX: restore the interrupt flag so the loop condition
                // sees it and the thread exits instead of looping forever.
                Thread.currentThread().interrupt();
            } catch (Exception e) {
                // BUGFIX: pass the exception to the logger — the original
                // concatenated e.getStackTrace(), which only prints an array
                // reference, losing the actual stack trace.
                logger.error("抓取" + url + data + "出现异常", e);
            }
        }
    }
}
package com.crawl;

import com.model.Model;

/**
 * Callback interface notified after each crawl attempt.
 *
 * @author gt
 */
public interface ICrawlComplete {

    /**
     * Persists the result of one crawl attempt.
     *
     * @param isSuc whether the page was crawled successfully
     * @param model the extracted data; null when the crawl failed
     * @return result flag (contract defined by the implementation)
     */
    boolean save(boolean isSuc, Model model);

    /**
     * Performs any additional processing after one crawl attempt.
     *
     * @param isSuc whether the page was crawled successfully
     * @param model the extracted data; null when the crawl failed
     * @return result flag (contract defined by the implementation)
     */
    boolean otherDeal(boolean isSuc, Model model);
}
抓取类
package com.crawl.parsehtml;

import java.io.FileOutputStream;
import java.io.PrintWriter;
import java.sql.Timestamp;
import java.text.SimpleDateFormat;
import java.util.Date;

import org.htmlparser.NodeFilter;
import org.htmlparser.Parser;
import org.htmlparser.filters.AndFilter;
import org.htmlparser.filters.HasAttributeFilter;
import org.htmlparser.filters.TagNameFilter;
import org.htmlparser.tags.ImageTag;
import org.htmlparser.util.NodeList;

import com.config.Config;
import com.model.Model;
import com.model.Novel;
import com.model.NovelType;
import com.service.NovelService;
import com.util.DateUtil;
import com.util.FileUtil;

/**
 * Extracts novel metadata (name, author, cover image, outline, type,
 * click count and "hot" count) from a cc222.com novel detail page.
 */
public class ParseQidian implements IParseHtml {

    /**
     * Parses one novel detail page.
     *
     * @param url full page URL, e.g. http://www.cc222.com/novel/799274.html
     * @return the populated Novel, or null when the page is missing or any
     *         fragment cannot be parsed (best-effort contract — the caller
     *         records the failure via its callback)
     */
    @Override
    public Model extract(String url) {
        Novel novel = null;
        try {
            novel = new Novel();
            // Filters for the page fragments we need.
            NodeFilter nameFilter = new AndFilter(new TagNameFilter("div"),
                    new HasAttributeFilter("class", "bookTitle"));
            NodeFilter imageFilter = new AndFilter(new TagNameFilter("img"),
                    new HasAttributeFilter("width", "200"));
            NodeFilter outlineFilter = new AndFilter(new TagNameFilter("p"),
                    new HasAttributeFilter("class", "gray"));
            // Both the type and the "hot" counter live in <span class="blue">;
            // they are distinguished by element index below. (The original
            // also declared an identical, unused "hotFilter" — removed.)
            NodeFilter typeFilter = new AndFilter(new TagNameFilter("span"),
                    new HasAttributeFilter("class", "blue"));
            NodeFilter clickPointFilter = new AndFilter(new TagNameFilter("span"),
                    new HasAttributeFilter("class", "red"));

            Parser parser = new Parser(url);
            parser.setEncoding("utf-8");

            // Name and author, rendered like 《小渔村》文/刘皇弟.
            NodeList nodes = parser.parse(nameFilter);
            String parseStr = nodes.elementAt(0).toPlainTextString();
            String name = parseStr.split("《")[1].split("》")[0];
            String author = parseStr.split("/")[1];
            novel.setName(name);
            novel.setAuthor(author);

            // Cover image (optional — empty string when absent).
            parser.reset();
            nodes = parser.parse(imageFilter);
            String imgPath = "";
            if (nodes.size() != 0) {
                ImageTag profileTag = (ImageTag) nodes.elementAt(0);
                imgPath = "http://www.cc222.com" + profileTag.getAttribute("src");
            }
            novel.setImgPath(imgPath);

            // Outline / synopsis.
            parser.reset();
            nodes = parser.parse(outlineFilter);
            novel.setOutline(nodes.elementAt(0).toPlainTextString());

            // Novel type: first <span class="blue">.
            parser.reset();
            nodes = parser.parse(typeFilter);
            String typeName = nodes.elementAt(0).toPlainTextString();
            NovelType novelType = new NovelType();
            novelType.setName(typeName);
            novel.setNovelType(novelType);

            // Click count: first <span class="red">.
            parser.reset();
            nodes = parser.parse(clickPointFilter);
            String clickPoint = nodes.elementAt(0).toPlainTextString();
            novel.setClickPoint(Long.parseLong(clickPoint));

            // Hot count: third <span class="blue">, text like "12345次".
            parser.reset();
            nodes = parser.parse(typeFilter);
            String hot = nodes.elementAt(2).toPlainTextString().split("次")[0];
            novel.setHot(Long.parseLong(hot));

            novel.setUpdateTime(DateUtil.convert_DateToTimestamp(new Date()));
            novel.setLink_url(url);
            System.out.println(name + ":" + author + ":" + imgPath + ":" + ":" + typeName + ":");

            // Append a success line to the crawl log.
            // NOTE(review): the log path uses Windows separators ("\\") — not
            // portable; consider File.separator if this ever runs elsewhere.
            SimpleDateFormat sDateFormat = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
            FileUtil.writeFile(sDateFormat.format(new Date()) + ":"
                    + Thread.currentThread().getName() + "parse:" + url + "......Suc",
                    Config.PROJECT_PATH + "\\com\\crawl\\log\\log.txt");
        } catch (Exception e) {
            // Best-effort: page ids are probed sequentially and many pages do
            // not exist, so any failure simply yields null; the caller's
            // callback records the miss.
            return null;
        }
        return novel;
    }

    public static void main(String[] args) {
        ParseQidian parse = new ParseQidian();
        parse.extract("http://www.cc222.com/novel/799274.html");
    }
}
主程序
/** * 启动抓取程序 * @param iCrawcomplete 抓取完毕后回调接口 * @throws Exception */ public void start(ICrawlComplete iCrawcomplete) throws Exception{ if(isRunable==false) { logger.info("抓取程序启动成功"); isRunable=true; //获取启动的工程队列 String[] startWorkQueue= XmlUtil.getNodeText("startWorkName", crawl_Config).split(","); int startWorkNum=startWorkQueue.length; IParseHtml instance=null; for(int i=0;i<startWorkNum;i++) { //获取抓取工程名 String startWorkName=startWorkQueue[i]; //获取抓取工程的类名 String classType=XmlUtil.getNodeText(startWorkName,"classNm", crawl_Config); //获取开启线程数 Integer threadNum=Integer.parseInt(XmlUtil.getNodeText(startWorkName,"threadNum", crawl_Config)); //获取网址 String parseUrl=XmlUtil.getNodeText(startWorkName,"parseUrl", crawl_Config); Integer startNum=Integer.parseInt(XmlUtil.getNodeText(startWorkName,"startNum", crawl_RunDate)); Integer endNum=Integer.parseInt(XmlUtil.getNodeText(startWorkName,"endNum", crawl_RunDate)); //System.out.println(classType+":"+parseUrl+":"+threadNum+":"+startNum+":"+endNum); //获取每次抓取间隔周期(version1.0没用) //String reStartSpan=XmlUtil.getNodeText(startWorkName,"reStartSpan", crawl_Config); //反射,通过类名动态获取抓取类 instance=(IParseHtml)Class.forName(classType).newInstance(); //生成要抓取的队列 LinkedBlockingQueue<Integer> queue = new LinkedBlockingQueue<Integer>(); for(int j=startNum;j<endNum;j++) { queue.offer(j); } //创建个无界带自动回收机制的线程池 ExecutorService threadPool = Executors.newCachedThreadPool(); //创建消费者线程 CrawConsumer consumer=new CrawConsumer(startWorkName,queue,instance,parseUrl,iCrawcomplete); //根据线程数启动线程 for(int z=0;z<threadNum;z++) threadPool.execute(consumer); } } else { logger.warn("!!抓取程序启动失败,已经在运行"); } }