字符串相似度匹配算法_Jaccard算法优化

package day0321.day0330;

import java.util.HashSet;
import java.util.Set;

/**
 * Finds how closely a reference string matches the best candidate in an array,
 * using a length-aware variant of the Jaccard index.
 *
 * <p>Each string is reduced to the set of its {@code char} values (UTF-16 code
 * units). The score of a candidate is a weighted sum of the Jaccard index of
 * the two character sets and a coefficient that rewards similar lengths.
 */
public class JaccardSimilarity {

    /** Weight of the character-set Jaccard index in the combined score. */
    private static final double JACCARD_WEIGHT = 0.7;

    /** Weight of the length-closeness coefficient in the combined score. */
    private static final double LENGTH_WEIGHT = 0.3;

    /**
     * Scores every candidate against {@code str1}, prints the best score and
     * the candidate that produced it to standard output, and returns the best
     * score.
     *
     * <p>{@code null} entries in {@code strArray} are skipped. When several
     * candidates share the best score, the first one wins.
     *
     * @param str1     the reference string
     * @param strArray the candidate strings to compare against {@code str1}
     * @return the highest combined score found, or {@code 0} when there is no
     *         non-null candidate
     * @throws NullPointerException if {@code str1} or {@code strArray} is {@code null}
     */
    public static double calculateJaccardSimilarity(String str1, String[] strArray) {
        if (str1 == null) {
            throw new NullPointerException("str1 must not be null");
        }
        if (strArray == null) {
            throw new NullPointerException("strArray must not be null");
        }

        // The reference set never changes, so build it once outside the loop.
        Set<Character> referenceChars = toCharSet(str1);

        double maxSimilarity = 0;
        String mostSimilarString = "";

        for (String candidate : strArray) {
            if (candidate == null) {
                continue; // a missing candidate cannot be a match
            }

            double score = jaccardIndex(referenceChars, toCharSet(candidate)) * JACCARD_WEIGHT
                    + calculateCoefficient(str1.length(), candidate.length()) * LENGTH_WEIGHT;

            // Strict comparison keeps the earliest candidate on ties.
            if (score > maxSimilarity) {
                maxSimilarity = score;
                mostSimilarString = candidate;
            }
        }

        System.out.println("Jaccard Similarity: " + maxSimilarity);
        System.out.println("Most Similar String: " + mostSimilarString);
        return maxSimilarity;
    }

    /**
     * Returns a coefficient in {@code (0, 1]} that shrinks as the two lengths
     * drift apart: {@code 1 / (1 + |strLen - strLen2|)}.
     *
     * @param strLen  length of the first string
     * @param strLen2 length of the second string
     * @return {@code 1.0} for equal lengths, approaching {@code 0} as the
     *         difference grows
     */
    public static double calculateCoefficient(int strLen, int strLen2) {
        return 1.0 / (1.0 + Math.abs(strLen - strLen2));
    }

    /** Collects the distinct {@code char} values of {@code text} into a set. */
    private static Set<Character> toCharSet(String text) {
        Set<Character> chars = new HashSet<>();
        for (int i = 0; i < text.length(); i++) {
            chars.add(text.charAt(i));
        }
        return chars;
    }

    /**
     * Computes {@code |a ∩ b| / |a ∪ b|} without materialising either result set.
     * Two empty sets are treated as identical (index {@code 1.0}) so that the
     * division never evaluates {@code 0 / 0}, which would yield {@code NaN}.
     */
    private static double jaccardIndex(Set<Character> a, Set<Character> b) {
        int shared = 0;
        for (Character c : b) {
            if (a.contains(c)) {
                shared++;
            }
        }

        // Inclusion-exclusion: |a ∪ b| = |a| + |b| - |a ∩ b|.
        int unionSize = a.size() + b.size() - shared;
        if (unionSize == 0) {
            return 1.0;
        }
        return (double) shared / unionSize;
    }

    /** Runs the matcher on a small sample and lets it print the best match. */
    public static void main(String[] args) {
        String str1 = "BH44+ BH44";
        String[] strArray = { "BK54+BK55", "BH45+", "BH44+", "BH44+BH45", "world" };

        calculateJaccardSimilarity(str1, strArray);
    }
}

你可能感兴趣的:(算法,java)