[仗贱天涯]Code-URLParseThread

/*
 * Created on 2005-8-25
 * @author 猪神
 */
package com.ty.parse;

import java.io.IOException;
import java.util.ArrayList;
import java.util.List;

import com.ty.cc.Lexer;
import com.ty.cfg.TYConfig;
import com.ty.dao.DocumentDao;
import com.ty.domain.Document;
import com.ty.net.PageConnector;
import com.ty.util.Logger;
import com.ty.util.StringUtil;

/**
 * Parse thread that starts at a URL, tokenizes the page with {@link Lexer},
 * collects the targets of its {@code <a ...>} tags, stores the page through
 * {@link DocumentDao}, and then recurses into the collected links.
 *
 * <p>Only URLs that start with {@code http://} are processed. A link is
 * followed only if no pattern is configured or the link matches the pattern
 * (see {@link String#matches(String)}).
 *
 * @author Administrator
 */
public class URLParseThread extends AbstractParseThread {

    /** The only URL scheme prefix this crawler accepts. */
    private static final String HTTP_PREFIX = "http://";

    /** Marker that precedes the link target inside an anchor tag. */
    private static final String HREF_ATTR = "href=";

    /** URL the crawl starts from. */
    private final String url;

    /** Regular expression a link must match to be followed; {@code null} accepts every link. */
    private final String pattern;

    /** Recursion limit; a value {@code <= 0} disables the limit (see {@link #myParse}). */
    private final int maxLevel;

    /** When {@code true}, pages are re-parsed without consulting {@code Document.isNeedUpdate}. */
    private final boolean forceUpdate;

    /**
     * Creates a parse thread whose link pattern and force-update flag come from
     * the configuration: the pattern is the value at {@code /ty/url/station} and
     * the flag is the {@code forceUpdate} attribute of the {@code /ty/url} node.
     *
     * @param url      URL the crawl starts from
     * @param maxLevel recursion limit; {@code <= 0} means no limit
     */
    public URLParseThread(String url, int maxLevel) {
        this(url,
             TYConfig.getInstance().getValue("/ty/url/station"),
             maxLevel,
             Boolean.valueOf(TYConfig.getInstance().getNode("/ty/url").getAttribute("forceUpdate")).booleanValue());
    }

    /**
     * Creates a parse thread with explicit settings.
     *
     * @param url         URL the crawl starts from
     * @param pattern     regular expression links must match, or {@code null} to accept all
     * @param maxLevel    recursion limit; {@code <= 0} means no limit
     * @param forceUpdate whether to re-parse pages regardless of their stored state
     */
    public URLParseThread(String url, String pattern, int maxLevel, boolean forceUpdate) {
        this.url = url;
        this.pattern = pattern;
        this.maxLevel = maxLevel;
        this.forceUpdate = forceUpdate;
    }

    /**
     * Runs the crawl from the start URL at level 0 with {@code -1} as the
     * originating document id.
     *
     * @return always {@code null}
     * @throws ParseException declared by the recursive worker
     */
    public Object parse() throws ParseException {
        myParse(url, 0, -1);
        return null;
    }

    /**
     * Parses one page, persists it, and recurses into the links found on it.
     *
     * @param url     page to fetch; ignored unless it starts with {@code http://}
     * @param level   current recursion depth, 0 for the start page
     * @param fromDoc value passed to the stored document as its "from" document;
     *                callers pass the hash code of the referring URL, or -1 for the start page
     * @throws ParseException declared for callers; failures inside the page
     *                        handling are reported and not rethrown
     */
    private void myParse(String url, int level, int fromDoc) throws ParseException {
        if (this.isPostedStop()) { // aborted
            return;
        }
        if (url == null || !url.startsWith(HTTP_PREFIX)) {
            return;
        }

        List list = new ArrayList();
        try {
            Lexer lex = new Lexer(PageConnector.open(url));

            // Documents are looked up by the URL's hash code.
            // NOTE(review): String.hashCode() can collide for distinct URLs.
            Document doc = DocumentDao.getDocument(url.hashCode());
            if (!forceUpdate && doc != null && !doc.isNeedUpdate(maxLevel - level)) {
                return;
            }

            // base: the URL up to (excluding) its last '/' after the scheme,
            // or the whole URL when there is none.
            String base;
            int i = url.substring(HTTP_PREFIX.length()).lastIndexOf('/');
            if (i != -1) {
                base = url.substring(0, HTTP_PREFIX.length() + i);
            } else {
                base = url;
            }

            // station: base cut at its first '/' after the scheme (scheme + host part).
            String station = base;
            i = base.substring(HTTP_PREFIX.length()).indexOf('/');
            if (i != -1) {
                station = base.substring(0, HTTP_PREFIX.length() + i);
            }

            String w;
            String title = null;
            while ((w = lex.nextToken()) != null) {
                // e.g. <a href='Content.asp?idWriter=0&Key=0&idItem=143&idArticle=522257' target='_blank'>
                if (w.startsWith("<a ")) {
                    w = buildURL(w, station, base);
                    if (w != null && (pattern == null || w.matches(pattern))) {
                        list.add(w);
                        Logger.log(level + " " + w);
                    }
                } else if (w.indexOf("<title>") != -1) { // TODO: match is case-sensitive
                    title = lex.nextToken(); // document title
                }
            }

            // Record this URL as parsed.
            if (doc == null) {
                doc = Document.create(title, url, lex.getSourceText(), maxLevel - level, fromDoc);
                DocumentDao.createDocument(doc);
            } else {
                doc.setContent(lex.getSourceText());
                doc.setFromDoc(fromDoc);
                doc.setLastUpdated(StringUtil.currentTimestamp());
                doc.setParseSubLevel(maxLevel - level);
                DocumentDao.updateDocument(doc);
            }
        } catch (IOException ex) {
            // A page that cannot be read must not abort the crawl, but the
            // failure should still leave a trace instead of vanishing.
            Logger.log(level + " " + url + " " + ex);
        } catch (Exception ex) {
            // Deliberately not rethrown as ParseException: one bad page does
            // not stop the remaining links from being processed.
            ex.printStackTrace();
        }

        // Search depth: descend while below maxLevel, or always when maxLevel <= 0.
        int nextFromDoc = url.hashCode();
        level++;
        if (level < maxLevel || maxLevel <= 0) {
            for (int i = 0, n = list.size(); i < n; i++) {
                myParse((String) list.get(i), level, nextFromDoc);
            }
        }
    }

    /**
     * Extracts the {@code href} target from an anchor tag and turns it into a
     * full URL.
     *
     * <p>Scanning starts right after {@code href=}. Leading whitespace is
     * skipped; a leading {@code '}, {@code "} or {@code >} opens the value
     * without being part of it. Once the value has started, the first
     * whitespace, quote or {@code >} ends it.
     *
     * @param w       the anchor tag token
     * @param station prefix prepended to targets that start with {@code /}
     * @param base    prefix (plus {@code /}) prepended to all other relative targets
     * @return the target unchanged if it starts with {@code http://}, otherwise
     *         the target resolved against {@code station} or {@code base};
     *         {@code null} if the tag has no {@code href=} or the value is empty
     */
    private String buildURL(String w, String station, String base) {
        int b = w.indexOf(HREF_ATTR);
        if (b == -1) {
            return null;
        }

        StringBuffer sb = new StringBuffer();
        boolean begun = false;
        for (int i = b + HREF_ATTR.length(), n = w.length(); i < n; i++) {
            char ch = w.charAt(i);
            boolean whitespace = ch == ' ' || ch == '\t' || ch == '\n';
            boolean delimiter = ch == '\'' || ch == '"' || ch == '>';
            if (whitespace || delimiter) {
                if (begun) {
                    break; // the value ends here
                }
                if (delimiter) {
                    begun = true; // opening quote: value starts with the next character
                }
                continue;
            }
            begun = true;
            sb.append(ch);
        }

        if (sb.length() == 0) {
            return null;
        }
        String link = sb.toString();
        if (link.startsWith(HTTP_PREFIX)) {
            return link;
        }
        if (link.charAt(0) == '/') {
            return station + link;
        }
        return base + '/' + link;
    }
}
// TODO optimization: for small-scale searches, access URLs in memory instead of the database, with automatic segmented persistence.

你可能感兴趣的:(exception,String,list,null,url,import)