使用 HanLP - 汉语言处理包来比较两个文本的相似度

步骤:
1.抽取关键字
2.合并分词结果,列出所有的词
3.计算词频
4.计算余弦值 -> 相似度

1.抽取关键字[抽取5个关键字]

/**
 * Extracts keywords from a sentence by delegating to {@code HanLP.extractKeyword}.
 *
 * @param sentence the text to extract keywords from
 * @return the list returned by HanLP when asked for 5 keywords
 */
private static List<String> getSplitWords(String sentence) {
    // Number of keywords requested from HanLP.
    final int keywordCount = 5;
    final List<String> keywords = HanLP.extractKeyword(sentence, keywordCount);
    return keywords;
}

2.合并分词结果,列出所有的词

/**
 * Concatenates two word lists and removes duplicates, keeping the first
 * occurrence of every word in its original position.
 *
 * <p>The concatenated list and the de-duplicated list are both printed to
 * standard output, one per line, before the result is returned.
 *
 * @param list1 words of the first text; its elements come first in the result
 * @param list2 words of the second text
 * @return a new mutable list holding each distinct word exactly once, in
 *         first-seen order
 */
private static List<String> mergeList(List<String> list1, List<String> list2) {
    List<String> combined = new ArrayList<String>(list1.size() + list2.size());
    combined.addAll(list1);
    combined.addAll(list2);
    System.out.println(combined);
    // A LinkedHashSet drops duplicates in a single pass while preserving
    // insertion order, and (unlike calling equals on each element) tolerates
    // null entries.
    List<String> result =
            new ArrayList<String>(new java.util.LinkedHashSet<String>(combined));
    System.out.println(result);
    return result;
}

3.计算词频;allWords是合并分词得到的词,sentWords是抽取关键字的结果

 /**
  * Builds a term-frequency vector for one sentence.
  *
  * @param allWords  the merged vocabulary; position {@code i} of the result
  *                  corresponds to {@code allWords.get(i)}
  * @param sentWords the keywords extracted from a single sentence
  * @return an array as long as {@code allWords} whose i-th entry is the number
  *         of times the i-th vocabulary word occurs in {@code sentWords}
  */
 private static int[] statistic(List<String> allWords, List<String> sentWords) {
     final int[] counts = new int[allWords.size()];
     int slot = 0;
     for (String word : allWords) {
         counts[slot] = Collections.frequency(sentWords, word);
         slot++;
     }
     return counts;
 }

4.计算余弦值

 	  //计算余弦值
/**
 * Computes the cosine similarity of two term-frequency vectors.
 *
 * <p>The loop runs over the indices of {@code statistic1}, so
 * {@code statistic2} must be at least as long; callers are expected to pass
 * vectors built over the same vocabulary.
 *
 * @param statistic1 term frequencies of the first text
 * @param statistic2 term frequencies of the second text
 * @return the dot product divided by the product of the Euclidean norms, or
 *         {@code 0} when either vector has zero norm (including empty input),
 *         for which the cosine is undefined
 */
public static double getSimilarity(int[] statistic1, int[] statistic2) {
    double dotProduct = 0;
    double squaredNorm1 = 0;
    double squaredNorm2 = 0;
    for (int i = 0; i < statistic1.length; i++) {
        // Widen to double before multiplying so large counts cannot overflow int.
        double a = statistic1[i];
        double b = statistic2[i];
        dotProduct += a * b;
        squaredNorm1 += a * a;
        squaredNorm2 += b * b;
    }

    // A zero vector would otherwise produce 0/0 = NaN.
    if (squaredNorm1 == 0 || squaredNorm2 == 0) {
        return 0;
    }
    return dotProduct / (Math.sqrt(squaredNorm1) * Math.sqrt(squaredNorm2));
}
 //转换为百分比
/**
 * Formats a ratio as a percentage string using the percent format returned by
 * {@link NumberFormat#getPercentInstance()}.
 *
 * @param num the ratio to format (for example {@code 0.5} for fifty percent)
 * @return the formatted percentage, limited to 3 integer digits and
 *         2 fraction digits
 */
public String toNumber(double num){
    final NumberFormat percentFormat = NumberFormat.getPercentInstance();
    percentFormat.setMaximumFractionDigits(2);
    percentFormat.setMaximumIntegerDigits(3);
    final String formatted = percentFormat.format(num);
    return formatted;
}

5.测试

/**
 * Demo: compares two hard-coded sentences and prints every intermediate
 * result (keyword lists, term-frequency vectors, the cosine value and the
 * same value formatted as a percentage).
 *
 * @param args unused
 * @throws IOException declared by the original signature; nothing in this
 *                     method throws it directly
 */
public static void main(String[] args) throws IOException {
	final String firstText = "我喜欢看电视,不喜欢看电影。";
	final String secondText = "我喜欢吃饭。";

	// Step 1: extract the keywords of each text.
	final List<String> firstKeywords = getSplitWords(firstText);
	final List<String> secondKeywords = getSplitWords(secondText);
	System.out.println(firstKeywords);
	System.out.println(secondKeywords);

	// Step 2: merge both keyword lists into one vocabulary.
	final List<String> vocabulary = mergeList(firstKeywords, secondKeywords);

	// Step 3: count how often each vocabulary word occurs in each text.
	final int[] firstCounts = statistic(vocabulary, firstKeywords);
	final int[] secondCounts = statistic(vocabulary, secondKeywords);

	// Print each vector on its own line; every element is followed by a comma.
	for (int[] counts : new int[][] {firstCounts, secondCounts}) {
		final StringBuilder line = new StringBuilder("[");
		for (int count : counts) {
			line.append(count).append(",");
		}
		line.append("]");
		System.out.println(line.toString());
	}

	// Step 4: cosine of the two vectors.
	final double cosine = getSimilarity(firstCounts, secondCounts);
	System.out.println("余弦值:" + cosine);

	// Render the cosine as a percentage.
	final NumberFormat percentFormat = NumberFormat.getPercentInstance();
	percentFormat.setMaximumIntegerDigits(3);
	percentFormat.setMaximumFractionDigits(2);
	System.out.println("百分比:" + percentFormat.format(cosine));
}

你可能感兴趣的:(Hanlp分词)