使用余弦相似性原理计算文本的相似度

原理参考: http://www.ruanyifeng.com/blog/2013/03/cosine_similarity.html
/**
 * 
 */
package com.text;

import java.io.IOException;
import java.io.StringReader;
import java.util.HashMap;
import java.util.Map;

import org.apache.commons.collections.MapUtils;
import org.apache.commons.lang3.tuple.MutablePair;
import org.apache.commons.lang3.tuple.Pair;
import org.wltea.analyzer.core.IKSegmenter;
import org.wltea.analyzer.core.Lexeme;

/**
 * Small demo that scores how similar two texts are by taking the cosine of the angle
 * between their term-frequency vectors. Texts are split into terms with the IK segmenter.
 *
 * @author Riching
 * 
 * @date 2013-8-10
 */
public class IKMainTest {

    /**
     * Segments two hard-coded sample sentences, combines their term frequencies into one
     * table and prints the resulting cosine similarity to standard output.
     *
     * @param args command-line arguments (unused)
     * @throws IOException if the segmenter fails while reading one of the sentences
     */
    public static void main(String[] args) throws IOException {
        String str1 = "我喜欢看电视,不喜欢看电影。";
        String str2 = "我不喜欢看电视,也不喜欢看电影。";
        Map<String, MutablePair<Integer, Integer>> tfs = mergeTF(getTF(str1), getTF(str2));
        double d = caclIDF(tfs);
        System.out.println(d);
    }

    /**
     * Combines two term-frequency maps into one table keyed by term, where the left value
     * is the count in the first text and the right value is the count in the second text.
     * A term missing from one text gets a count of 0 on that side.
     *
     * @param tf1 term frequencies of the first text
     * @param tf2 term frequencies of the second text
     * @return a new map holding one (left, right) count pair per distinct term
     */
    private static Map<String, MutablePair<Integer, Integer>> mergeTF(Map<String, Integer> tf1,
            Map<String, Integer> tf2) {
        Map<String, MutablePair<Integer, Integer>> tfs = new HashMap<String, MutablePair<Integer, Integer>>();
        for (Map.Entry<String, Integer> entry : tf1.entrySet()) {
            tfs.put(entry.getKey(), new MutablePair<Integer, Integer>(entry.getValue(), 0));
        }
        for (Map.Entry<String, Integer> entry : tf2.entrySet()) {
            MutablePair<Integer, Integer> pair = tfs.get(entry.getKey());
            if (null == pair) {
                // Term occurs only in the second text. It must still be stored, otherwise
                // it never contributes to the second vector's length and the similarity
                // comes out too high.
                tfs.put(entry.getKey(), new MutablePair<Integer, Integer>(0, entry.getValue()));
            } else {
                pair.setRight(entry.getValue());
            }
        }
        return tfs;
    }

    /**
     * Splits the text into terms with {@link IKSegmenter} and counts how many times each
     * term text occurs.
     *
     * @param str the text to segment; must not be null
     * @return a map from term text to its number of occurrences; empty if the segmenter
     *         yields no terms
     * @throws IOException if the segmenter fails while reading the text
     */
    public static Map<String, Integer> getTF(String str) throws IOException {
        Map<String, Integer> map = new HashMap<String, Integer>();
        // NOTE(review): the second argument is presumably IK's "useSmart" flag
        // (coarse-grained segmentation) - confirm against the IK Analyzer version in use.
        IKSegmenter ikSegmenter = new IKSegmenter(new StringReader(str), true);
        Lexeme lexeme = null;
        while ((lexeme = ikSegmenter.next()) != null) {
            String key = lexeme.getLexemeText();
            Integer count = map.get(key);
            map.put(key, null == count ? 1 : count + 1);
        }
        return map;
    }

    /**
     * Computes the cosine similarity of the two count vectors stored in the table: the dot
     * product of the left and right counts divided by the product of the two vector
     * lengths. Despite its name, this method does not compute an inverse document
     * frequency; the name is kept so existing callers keep working.
     *
     * @param tf table mapping each term to its (left, right) counts; may be null or empty
     * @return the cosine similarity, or 0 when the table is null or empty or when either
     *         vector consists only of zeros (the quotient would otherwise be NaN)
     */
    public static double caclIDF(Map<String, MutablePair<Integer, Integer>> tf) {
        if (MapUtils.isEmpty(tf)) {
            return 0;
        }
        double dotProduct = 0;
        double sqdoc1 = 0;
        double sqdoc2 = 0;
        for (Pair<Integer, Integer> count : tf.values()) {
            // Widen to double before multiplying so large counts cannot overflow int.
            double left = count.getLeft();
            double right = count.getRight();
            dotProduct += left * right;
            sqdoc1 += left * left;
            sqdoc2 += right * right;
        }
        if (sqdoc1 == 0 || sqdoc2 == 0) {
            // One of the texts has no terms at all; treat it as sharing nothing with the other.
            return 0;
        }
        return dotProduct / (Math.sqrt(sqdoc1) * Math.sqrt(sqdoc2));
    }
}

你可能感兴趣的:(相似度)