LSH应用:根据现有数组去mongodb中取数组,然后使用LSH算法计算数组相似度

mongodb中的数组是图片通过CEDD算法算出来的数组。


package com.lsh.common;

import java.security.MessageDigest;
import java.security.NoSuchAlgorithmException;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.List;
import java.util.Random;
import java.util.Set;

import com.lsh.dao.MongoDao;
import com.lsh.dao.MongoCollectionUtil;
import com.lsh.dao.impl.MongoDaoImpl;
import com.mongodb.BasicDBObject;
import com.mongodb.DBCollection;
import com.mongodb.DBCursor;
import com.sdicons.json.mapper.MapperException;


/**
 * Minimal bit-sampling LSH index whose buckets live in MongoDB.
 *
 * <p>An integer vector is expanded into a unary 0/1 array, a fixed set of
 * randomly chosen positions is sampled from that array for each hash table,
 * and the sampled bits are compressed with MD5 into a bucket key of the form
 * {@code "<table>-<md5hex>"}. {@link #generateHashMap(String, int[])} files an
 * id under one bucket per table; {@link #query(int[])} unions the ids found
 * in the buckets the query vector falls into.
 *
 * <p>All state is static. The hash family is loaded (or created and stored)
 * by the constructor, so a {@code SimpleLSH} instance must be constructed
 * once before the static methods are used.
 */
public class SimpleLSH {
	private static final MongoDao mongoDao = new MongoDaoImpl();
	/** Number of components in an input vector (e.g. 128 for SIFT features). */
	private static int dimention = Constant.DIMENTION;
	/** Upper bound of a single vector component (e.g. 255 for RGB). */
	private static int max = Constant.MAX;
	/** Number of hash tables; query() unions the candidates of every table. */
	private static int hashCount = Constant.HASHCOUNT;
	/**
	 * Number of bit positions sampled per hash table. Smaller values make
	 * matching looser (more approximate hits, but more false positives);
	 * larger values make it stricter.
	 */
	private static int bitCount = Constant.BITCOUNT;
	/** Length of the unary 0/1 expansion of a vector: dimention * max. */
	private static int size = dimention * max;
	/** Sampled positions, one row per hash table; null until a SimpleLSH is constructed. */
	private static int[][] hashFamily;

	/**
	 * Re-reads the configuration from {@code Constant} and loads the hash
	 * family from MongoDB, generating and storing it on first use.
	 *
	 * @throws IllegalStateException if a freshly generated hash family
	 *         cannot be stored
	 */
	public SimpleLSH() {
		dimention = Constant.DIMENTION;
		max = Constant.MAX;
		hashCount = Constant.HASHCOUNT;
		bitCount = Constant.BITCOUNT;
		size = dimention * max;
		generataHashFamily();
	}

	/**
	 * Loads the sampled positions stored under "hashFamily", or generates
	 * random ones and stores them when none exist yet. The positions are
	 * indexes into the unary array produced by {@link #unAray(int[])}.
	 */
	@SuppressWarnings("unchecked")
	private void generataHashFamily() {
		int[][] stored = mongoDao.getValueByKey("hashFamily");
		if (stored != null) {
			hashFamily = stored;
			return;
		}

		int[][] generated = new int[hashCount][bitCount];
		// Raw List kept on purpose: the parameter type of MongoDao.addValue(List)
		// is not visible from this file.
		List rows = new ArrayList();
		Random rd = new Random();
		for (int i = 0; i < hashCount; i++) {
			for (int j = 0; j < bitCount; j++) {
				generated[i][j] = rd.nextInt(size);
			}
			rows.add(generated[i].clone());
		}
		try {
			mongoDao.addValue(rows);
		} catch (MapperException e) {
			// Continuing with a family that was never stored would make every
			// bucket written in this run unreachable after a restart.
			throw new IllegalStateException("Could not store the generated LSH hash family", e);
		}
		// Publish only after the family has been stored successfully.
		hashFamily = generated;
	}

	/** Returns the loaded hash family, failing clearly when none is loaded. */
	private static int[][] requireHashFamily() {
		int[][] family = hashFamily;
		if (family == null) {
			throw new IllegalStateException(
					"LSH hash family is not loaded; construct a SimpleLSH instance first");
		}
		return family;
	}

	/**
	 * Expands a vector into its unary 0/1 form: component i occupies slots
	 * [i * max, (i + 1) * max) and its first data[i] slots are set to 1.
	 * With max = 255, the value 65 becomes 65 ones followed by 190 zeros.
	 *
	 * NOTE(review): components are not range-checked; a value above max
	 * writes into the next component's slots, and more than dimention
	 * components overflow the array.
	 */
	private static int[] unAray(int[] data) {
		int unArayData[] = new int[size];
		for (int i = 0; i < data.length; i++) {
			for (int j = 0; j < data[i]; j++) {
				unArayData[i * max + j] = 1;
			}
		}
		return unArayData;
	}

	/**
	 * Builds the bucket key of an already unary-expanded vector for one table.
	 *
	 * @param unaryData output of {@link #unAray(int[])}
	 * @param hashNum   index of the hash table
	 * @return {@code hashNum + "-" + md5hex(sampled bits)}
	 */
	private static String bucketKey(int[] unaryData, int hashNum) {
		int[] positions = requireHashFamily()[hashNum];
		// The sampled bits are digested as the ASCII characters '0' / '1',
		// written directly as bytes so the key does not depend on the
		// platform default charset.
		byte[] sampled = new byte[bitCount];
		for (int i = 0; i < bitCount; i++) {
			sampled[i] = (byte) ('0' + unaryData[positions[i]]);
		}

		MessageDigest messageDigest;
		try {
			// MD5 only shortens the key here; it is not used for security.
			messageDigest = MessageDigest.getInstance("MD5");
		} catch (NoSuchAlgorithmException e) {
			throw new IllegalStateException("MD5 digest is not available", e);
		}
		return hashNum + "-" + MD5Util.bufferToHex(messageDigest.digest(sampled));
	}

	/**
	 * Files {@code id} under the bucket of {@code vercotr} in every hash table.
	 * A bucket's ids are kept as one comma-separated string.
	 *
	 * NOTE(review): the read-then-write per bucket is not atomic; concurrent
	 * writers to the same bucket may lose ids.
	 *
	 * @param id      identifier to store
	 * @param vercotr feature vector of that identifier
	 */
	public static void generateHashMap(String id, int[] vercotr) {
		// The unary expansion is the same for every table, so build it once.
		int[] unaryData = unAray(vercotr);
		for (int j = 0; j < hashCount; j++) {
			String key = bucketKey(unaryData, j);
			String value = mongoDao.getFileNames(key);
			if (value != null && !value.isEmpty()) {
				mongoDao.updataLsh(key, value + "," + id);
			} else {
				mongoDao.addValue(key, id);
			}
		}
	}

	/**
	 * Collects the ids stored in the buckets that {@code data} falls into,
	 * one bucket per hash table. The result is a candidate set; no distance
	 * is computed here.
	 *
	 * @param data query vector
	 * @return ids sharing at least one bucket with {@code data}; never null
	 */
	public static Set<String> query(int[] data) {
		Set<String> result = new HashSet<String>();
		DBCollection coll = MongoCollectionUtil.db.getCollection("lsh");
		int[] unaryData = unAray(data);
		for (int j = 0; j < hashCount; j++) {
			// The key must be computed with table j's own positions, exactly
			// as generateHashMap does when writing.
			BasicDBObject query = new BasicDBObject("filename", bucketKey(unaryData, j));
			DBCursor cursor = coll.find(query);
			try {
				while (cursor.hasNext()) {
					Object value = cursor.next().get("value");
					if (value == null) {
						continue;
					}
					for (String storedId : value.toString().split(",")) {
						if (!storedId.isEmpty()) {
							result.add(storedId);
						}
					}
				}
			} finally {
				cursor.close();
			}
		}
		return result;
	}
}


这里用到了很多mongodb相关的操作,大家可以把mongodb理解为类似memcache或数据库的存储层:它在这里只负责数据的存取。


你可能感兴趣的:(LSH应用:根据现有数组去mongodb中去数组 然后使用LSH算法计算数组相识度)