访问过;保存过;修改过 等等
package com.rayeen.spider.vertical.data;

import java.io.File;
import java.io.IOException;
import java.io.UnsupportedEncodingException;

import org.apache.hadoop.io.MD5Hash;
import org.apache.log4j.Logger;

import com.sleepycat.je.DatabaseException;
import com.sleepycat.je.Environment;
import com.sleepycat.persist.EntityStore;
import com.sleepycat.persist.PrimaryIndex;
import com.sleepycat.persist.SecondaryIndex;
import com.rayeen.spider.vertical.constant.MetResourceTag;
import com.rayeen.spider.vertical.constant.MsgConstant;
import com.rayeen.spider.vertical.util.ResutTree;
import org.apache.commons.lang.*;

/**
 * Berkeley DB (DPL) backed store that records, per URI, a content digest and a
 * bit set of {@link MetResourceBE} status flags (met / saved / modified ...).
 *
 * <p>All storage handles are static, so every instance shares the same
 * environment, entity store and indexes. Only {@link #initilize()} is
 * synchronized; the read/write methods are not.
 */
public class MetResourceProtocolImpl implements MetResourceProtocol {

    /**
     * @param name sub-directory of {@code ./metResource} that holds this store's files
     */
    public MetResourceProtocolImpl(String name) {
        this.name = name;
    }

    String name;

    static final Logger LOG = Logger.getLogger(MetResourceProtocolImpl.class);

    public static String openMode = "append";

    public static int hitCnt = 0;

    private static MyDbEnv myDbEnv = new MyDbEnv();

    static EntityStore da;

    /** Primary index: URI -> stored record. */
    static PrimaryIndex<String, MetResourceBE> infoMap = null;

    /** Secondary index over the "digest" field, used to spot identical content under another URI. */
    static SecondaryIndex<String, String, MetResourceBE> infoMapByDigest = null;

    /** Counts merge writes so that the environment is synced on every fifth one. */
    static int flushCnt = 0;

    /**
     * Opens (or creates) the BDB environment under {@code ./metResource/<name>}
     * and binds the primary and digest indexes.
     *
     * @throws DatabaseException if the environment or the indexes cannot be
     *         opened; the failure is logged and rethrown so callers do not
     *         continue with unbound (null) indexes
     */
    public synchronized void initilize() throws DatabaseException {
        File met = new File("./metResource");
        if (!met.exists()) {
            met.mkdir();
        }
        File file = new File("./metResource/" + name);
        try {
            if (!file.exists()) {
                file.mkdir();
                myDbEnv.close();
                myDbEnv.setupNoTransact(file, false);
            } else {
                try {
                    myDbEnv.setupAppend(file);
                } catch (Exception e) {
                    LOG.error(e.getMessage(), e);
                    // NOTE(review): File.delete() only removes an EMPTY directory, so when the
                    // environment already contains files these two calls leave them in place and
                    // the fallback reopens the same files. Confirm whether a real wipe is intended.
                    file.delete();
                    file.mkdir();
                    myDbEnv.setupNoTransact(file, false);
                }
            }
            myDbEnv.setCacheSize(1024);

            da = myDbEnv.getEntityStore();
            infoMap = da.getPrimaryIndex(String.class, MetResourceBE.class);
            infoMapByDigest = da.getSecondaryIndex(infoMap, String.class, "digest");
        } catch (DatabaseException e) {
            LOG.error("failed to initialise met-resource store '" + name + "'", e);
            throw e;
        }
    }

    /**
     * Always throws.
     * NOTE(review): looks like a leftover test stub - confirm whether the interface requires it.
     *
     * @throws IOException unconditionally
     */
    public int error() throws IOException {
        throw new IOException("bobo");
    }

    /**
     * Looks up the stored record for {@code uri} and returns its status with the
     * MODIFIED / UNMODIFIED pair recomputed against the supplied content.
     *
     * @param uri     resource address; {@code null} yields 0
     * @param content current page bytes, or {@code null} to digest the URI itself
     * @return the status flags, or 0 when the URI is unknown or the lookup failed
     */
    public int getMetResource(String uri, byte[] content) {
        if (null == uri) {
            return 0;
        }
        LOG.info("uri:" + uri);
        try {
            MetResourceBE metResource = infoMap.get(uri);
            if (metResource != null) {
                boolean unchanged = StringUtils.equalsIgnoreCase(
                        metResource.getDigest(), digestOf(uri, content));
                int status = withModificationFlag(metResource.getStatus(), unchanged);
                LOG.info("status:" + status);
                return status;
            }
        } catch (DatabaseException e) {
            // Best effort: a failed lookup is reported as "unknown resource".
            LOG.error("lookup failed for uri:" + uri, e);
        }
        return 0;
    }

    /**
     * Stores or updates the record for {@code uri}.
     *
     * <p>With {@link MetResourceTag#COVER} the record is replaced outright and the
     * environment is synced immediately. Otherwise the new flags are merged into
     * the existing record (after recomputing MODIFIED / UNMODIFIED), or a new
     * record is created; a new record whose digest is already indexed under
     * another URI gets {@link MetResourceBE#SAME_CONTENT} instead of {@code status}.
     * Merge writes are synced on every fifth call.
     *
     * @param uri     resource address
     * @param content page bytes, or {@code null} to digest the URI itself
     * @param status  flags describing the current event (e.g. "met" versus "saved")
     * @param type    overwrite (COVER) or merge
     * @return {@link MsgConstant#SUCCESS}
     * @throws DatabaseException if the store cannot be read or written
     */
    public String putMetResource(String uri, byte[] content, int status,
            MetResourceTag type) throws DatabaseException {
        String hash = digestOf(uri, content);

        // Overwrite mode: whatever was stored before is irrelevant.
        if (type == MetResourceTag.COVER) {
            infoMap.putNoReturn(newRecord(uri, hash, status));
            myDbEnv.sync();
            return MsgConstant.SUCCESS;
        }

        MetResourceBE metResource;
        if (infoMap.contains(uri)) {
            metResource = infoMap.get(uri);
            boolean unchanged = StringUtils.equalsIgnoreCase(metResource.getDigest(), hash);
            int merged = withModificationFlag(metResource.getStatus(), unchanged) | status;
            // NOTE(review): the stored digest is left untouched here, so later visits keep
            // comparing against the first digest ever stored - confirm that this is intended.
            metResource.setStatus(merged);
        } else {
            // Same content already seen under a different URI?
            int initial = infoMapByDigest.contains(hash) ? MetResourceBE.SAME_CONTENT : status;
            metResource = newRecord(uri, hash, initial);
        }

        infoMap.putNoReturn(metResource);
        if (flushCnt++ % 5 == 0) {
            myDbEnv.sync();
        }
        return MsgConstant.SUCCESS;
    }

    /** Status lookup that digests the URI itself instead of page content. */
    public int getMetResource(String uri) {
        return getMetResource(uri, null);
    }

    /**
     * String convenience overload: encodes {@code content} as UTF-8 and delegates
     * to the byte[] variant, which also takes care of the periodic sync.
     *
     * @param type overwrite or merge
     * @return {@link MsgConstant#SUCCESS}
     * @throws DatabaseException if the store cannot be read or written
     */
    public String putMetResource(String uri, String content, int status,
            MetResourceTag type) throws DatabaseException {
        byte[] bytes = null;
        if (content != null) {
            try {
                bytes = content.getBytes("UTF-8");
            } catch (UnsupportedEncodingException e) {
                // Every Java platform is required to support UTF-8.
                throw new IllegalStateException("UTF-8 is not supported", e);
            }
        }
        return putMetResource(uri, bytes, status, type);
    }

    /** MD5 of the content, or of the URI when no content is supplied. */
    private static String digestOf(String uri, byte[] content) {
        if (content == null) {
            return MD5Hash.digest(uri).toString();
        }
        return MD5Hash.digest(content).toString();
    }

    /**
     * Returns {@code status} with exactly one of MODIFIED / UNMODIFIED set.
     * The opposite bit is cleared with a mask; XOR would toggle it and could
     * switch the bit on when it was not set before.
     */
    private static int withModificationFlag(int status, boolean unchanged) {
        if (unchanged) {
            return (status & ~MetResourceBE.MODIFIED) | MetResourceBE.UNMODIFIED;
        }
        return (status & ~MetResourceBE.UNMODIFIED) | MetResourceBE.MODIFIED;
    }

    /** Builds a fresh entity carrying the given key, digest and flags. */
    private static MetResourceBE newRecord(String uri, String hash, int status) {
        MetResourceBE record = new MetResourceBE();
        record.setUri(uri);
        record.setDigest(hash);
        record.setStatus(status);
        return record;
    }
}
其中MetResourceBE的实现:
package com.rayeen.spider.vertical.data;

import java.io.Serializable;
import java.util.logging.Logger;

import com.sleepycat.persist.*;
import com.sleepycat.persist.model.*;
import com.sleepycat.persist.model.Entity;
import com.sleepycat.persist.model.PrimaryKey;
import com.sleepycat.persist.model.SecondaryKey;
import com.sleepycat.persist.model.Relationship;

/**
 * Persistent record for one crawled URI: the URI (primary key), a content
 * digest (secondary key), an insert time and a bit set of status flags.
 */
@Entity
public class MetResourceBE implements Serializable {

    // NOTE(review): a key sequence is declared on a String key - confirm this is
    // intended; presumably callers always assign the uri themselves.
    @PrimaryKey(sequence = "ID")
    private String uri;

    @SecondaryKey(relate = Relationship.MANY_TO_ONE)
    private String digest;

    private java.util.Date insertTime;

    private int status;

    /** The URL has been met. */
    public static final int MET = 1 << 0;

    /** The URL has been saved. */
    public static final int SAVED = 1 << 1;

    /** The content has been updated. */
    public static final int MODIFIED = 1 << 2;

    /** The URL has not been met. */
    public static final int UNMET = 1 << 3;

    /** The URL has not been saved. */
    public static final int UNSAVED = 1 << 4;

    /** The content has not been updated. */
    public static final int UNMODIFIED = 1 << 5;

    /** No flag set: not met, not saved, not updated. */
    public static final int BLANK = 0;

    /** Union of the six flags above. */
    public static final int FULL = MET | SAVED | MODIFIED | UNMET | UNSAVED | UNMODIFIED;

    /** Different URL, same content. Shares its value (0x10) with {@link #UNSAVED}. */
    public static final int SAME_CONTENT = 1 << 4;

    /** Same URL, different content. Shares its value (0x20) with {@link #UNMODIFIED}. */
    public static final int SAME_URL = 1 << 5;

    public String getUri() {
        return uri;
    }

    public void setUri(String uri) {
        this.uri = uri;
    }

    public String getDigest() {
        return digest;
    }

    public void setDigest(String digest) {
        this.digest = digest;
    }

    public java.util.Date getInsertTime() {
        return insertTime;
    }

    public void setInsertTime(java.util.Date insertTime) {
        this.insertTime = insertTime;
    }

    public int getStatus() {
        return status;
    }

    public void setStatus(int status) {
        this.status = status;
    }
}
判断是否爬过某个页面的代码片段:
// 在fetchedList非空的情况下才处理以下逻辑 if (null != fetchedList) { int curStatus = fetchedList.getMetResource(realUrl, content .getBytes("UTF-8")); // 如果不应该处理这个页面,那么直接返回 if (ParseUtils.EntranceCantProcess(processStandard, curStatus)) { log(MetResourceUtil.explainMetResourceReason( url.getToUrl(), processStandard, curStatus), LogType.ENTRANCE_CONDITIONAL); return curStatus; } // 否则,记录这个页面的状态(met和unmet状态转换) fetchedList.putMetResource(url.getToUrl(), content .getBytes("UTF-8"), MetResourceBE.UNMET | MetResourceBE.MET, MetResourceTag.MERGE); }
保存数据之后,修改URL状态的代码片段:
public void save(String saveStat) throws SemanticException { // curHierarchyResultMap中保存着到上级页面为止的入口处的信息 // 之前的若干步骤都是为了填充curHierarchyResultMap里的数据 // 这里的curHierarchyResultMap是上层的crawl函数设进参数的HierarchyResultMap // 本层的所有save函数共用这个curHierarchyResultMap // curUri会被enter之后的处理过程还原出来,强行设置为 VMUtils.save(curUri, curPage, curHierarchyResultMap, curCrsc, saveStat); // 保存“已保存过标记” if (null != fetchedList) { try { fetchedList.putMetResource(curUrl.getToUrl(), curContent, MetResourceBE.SAVED | MetResourceBE.UNSAVED, MetResourceTag.MERGE); } catch (DatabaseException e) { e.printStackTrace(); } } }
一些位操作的辅助函数:
/** Maps each condition keyword accepted in a process standard to its MetResourceBE flag. */
static TObjectIntHashMap bitmap = new TObjectIntHashMap();

static {
    bitmap.put("遇到过", MetResourceBE.MET);
    bitmap.put("更新过", MetResourceBE.MODIFIED);
    bitmap.put("保存过", MetResourceBE.SAVED);
    bitmap.put("未遇到过", MetResourceBE.UNMET);
    bitmap.put("未更新过", MetResourceBE.UNMODIFIED);
    bitmap.put("未保存过", MetResourceBE.UNSAVED);
}

/**
 * Parses a process-standard expression into a flag mask.
 *
 * <p>The text after {@code ConfConstant.PROCESS_STANDARD} (when that keyword is
 * present) is split on the literal {@code ||} and each part on the literal
 * {@code &&}; the flags of all keywords found are OR-ed together, so both
 * separators currently contribute bits in the same way.
 *
 * @param standard the expression, may be {@code null}
 * @return the combined flag mask, or -1 when {@code standard} is {@code null}
 * @throws SemanticException declared for the grammar-error reporting call
 */
public static int parseProcessStardard(String standard)
        throws SemanticException {
    if (null == standard) {
        return -1;
    }

    int idx = standard.indexOf(ConfConstant.PROCESS_STANDARD);
    if (idx != -1) {
        // Skip the keyword plus the one separator character that follows it; clamp so a
        // string that ends right after the keyword yields "" instead of an index error.
        int start = idx + ConfConstant.PROCESS_STANDARD.length() + 1;
        standard = standard.substring(Math.min(start, standard.length()));
    }

    String[] stdsOR = Pattern.compile("||", Pattern.LITERAL).split(standard, 0);
    int or = MetResourceBE.BLANK;
    for (String strOR : stdsOR) {
        int and = MetResourceBE.BLANK;
        String[] stdsAND = Pattern.compile("&&", Pattern.LITERAL).split(strOR.trim(), 0);
        for (String strAnd : stdsAND) {
            strAnd = strAnd.trim();
            if (!bitmap.containsKey(strAnd)) {
                // Report the whole offending token. Cutting off its first character
                // would itself fail on an empty token (e.g. a leading "||").
                ParalleIRVirtualMachine.error(
                        "error enterance strandard grammer:" + strAnd,
                        ErrorType.GRAMMER);
                continue; // an unknown keyword contributes no flag
            }
            and |= bitmap.get(strAnd);
        }
        or |= and;
    }
    return or;
}

/**
 * @return {@code true} when the union of {@code standard} and {@code status}
 *         contains at least one of the {@code MetResourceBE.FULL} bits
 */
static public boolean canProceess(int standard, int status) {
    return ((standard | status) & MetResourceBE.FULL) > 0;
}

/** Same check as {@link #canProceess(int, int)} with the standard given as text. */
static public boolean PageCanProceess(String standardStr, int status)
        throws SemanticException {
    return canProceess(parseProcessStardard(standardStr), status);
}

/**
 * Text variant of {@link #EntranceCantEnter(int, int)}.
 * If no already-met URI may be entered (without probing for updates), this
 * returns {@code true} for a met page.
 */
static public boolean EntranceCantEnter(String standardStr, int status)
        throws SemanticException {
    int standard = parseProcessStardard(standardStr);
    return EntranceCantEnter(standard, status);
}

/**
 * @param standard parsed standard mask; -1 means "no standard" and never blocks
 * @param status   current status flags of the page
 * @return {@code true} only when the standard is exactly UNMET and the page
 *         carries the MET bit
 */
static public boolean EntranceCantEnter(int standard, int status) {
    if (standard == -1) {
        return false;
    }
    return standard == MetResourceBE.UNMET && (status & MetResourceBE.MET) > 0;
}

/**
 * Decides whether a page must be skipped for the given standard.
 *
 * <p>Original design notes: if the standard says "updated" but the status is
 * "not updated", or the standard says "not saved" but the status is "saved",
 * the page needs no processing. Only conditions mentioned in the standard are
 * meant to matter. (Open question left by the author: double the number of
 * flag bits from 3 to 6?)
 *
 * @param standard parsed standard mask; -1 means "no standard" and never blocks
 * @param status   current status flags of the page
 * @return {@code true} when the page must not be processed
 */
static public boolean EntranceCantProcess(int standard, int status) {
    if (standard == -1) {
        return false;
    }

    // NOTE(review): the original comment reads "'not met' means the UNMET bit of status
    // must be 1", yet this branch BLOCKS the page when both carry UNMET - confirm intent.
    if ((standard & MetResourceBE.UNMET) > 0
            && (status & MetResourceBE.UNMET) > 0) {
        return true;
    }

    // Block when the MODIFIED bit differs between standard and status.
    if (((standard & MetResourceBE.MODIFIED) ^ (status & MetResourceBE.MODIFIED)) != 0) {
        return true;
    }

    // NOTE(review): the original comment describes this case as "standard is
    // updated||not saved||not met while status is not updated&&not saved&&met", which
    // does not match the constants compared below - confirm which one is correct.
    if (standard == (MetResourceBE.MODIFIED | MetResourceBE.SAVED | MetResourceBE.MET)
            && status == (MetResourceBE.MODIFIED | MetResourceBE.SAVED)) {
        return true;
    }
    return false;
}