java利用Classifier4J实现模糊查找、文章摘要、余弦相似度、TF-IDF、单词纠正

jar包下载:
https://download.csdn.net/download/dreamzuora/10853888
代码使用:
余弦相似度:

		// Compute the cosine similarity between two pieces of text and print it.
		Double similarity = cosSimilarityByString(
				"关于王立军,有几个基本事实。首先,1月28日我是初次听到此事,并不相信谷开来会杀人,我跟11·15杀人案无关,我不是谷开来11·15杀人罪的共犯,这个大家都认可",
				"实际上免他是有这些原因的,绝不只是一个谷开来的原因。这是多因一果。");
		System.out.println(similarity);

文章摘要:

    	// Build the demo article and print a three-sentence summary of it.
    	String article = "被告人:对? 关于王立军,有几个基本事实.首先,1月28日我是初次听到此事.并不相信谷开来会杀人.我跟11·15杀人案无关.我不是谷开来11·15杀人罪的共犯.这个大家都认可.实际上谷开来3月14日她在北京被抓走!"
    			+ "在这之前她一直非常确切地跟我说她没杀人,是王立军诬陷她.我在1月28日和次听到这个事时我不相信她会杀人."
    			+ "第二个事实,免王立军的局长.是多个因素.一个,我确实认为他诬陷谷开来.但我并不是想掩盖11·15,我是觉得他人品不好."
    			+ "因为谷开来和他是如胶似漆,谷开来对他是言听计从,那王立军也通过与谷开来的交往中打入了我的家庭."
    			+ "那现在发生这么严重的事.作为一个起码的人,要讲人格的话,你干吗不找谷开来商量,而跑我这里来说这些话?"
    			+ "第二个免他的原因,是他想要挟我.他多次谈他身体不好,打黑压力大,得罪了人."
    			+ "其实这是在表功.第三,徐某某给我反映了他有五六条问题.有记录.实际上免他是有这些原因的,绝不只是一个谷开来的原因.这是多因一果.";
    	System.out.println(summarise(article, 3));

拼写纠正:

		System.out.println(correct("speling"));
		System.out.println(correct("love"));
		System.out.println(correct("korrecter"));
		System.out.println(correct("korrect"));
		System.out.println(correct("qove"));

TFIDF:

    	System.out.println("tf--------------------------------------");
    	// Term frequency per file: filePath -> (word -> tf).
    	// NOTE(review): generics were stripped by the blog engine ("Map>");
    	// the value type is assumed Double — confirm against TfIdfAlgorithm.
    	Map<String, Map<String, Double>> allTfMap = TfIdfAlgorithm.allTf("d://dir");
    	for (Map.Entry<String, Map<String, Double>> fileEntry : allTfMap.entrySet()) {
    		String filePath = fileEntry.getKey();
    		for (Map.Entry<String, Double> wordEntry : fileEntry.getValue().entrySet()) {
    			System.out.println("fileName:" + filePath + "     word:" + wordEntry.getKey()
    					+ "      tf:" + wordEntry.getValue());
    		}
    	}

    	System.out.println("idf--------------------------------------");
    	// BUG in the original snippet: allSegsMap is used here but never declared
    	// or initialised — it must come from the segmentation step before idf().
    	Map<String, Double> idfMap = TfIdfAlgorithm.idf(allSegsMap);
    	for (Map.Entry<String, Double> entry : idfMap.entrySet()) {
    		// Fixed label: the original printed "tf:" for idf values.
    		System.out.println("word:" + entry.getKey() + "     idf:" + entry.getValue());
    	}

    	System.out.println("tf-idf--------------------------------------");
    	// Final score per file: filePath -> (word -> tf*idf).
    	Map<String, Map<String, Double>> tfIdfMap = TfIdfAlgorithm.tfIdf(allTfMap, idfMap);
    	for (Map.Entry<String, Map<String, Double>> fileEntry : tfIdfMap.entrySet()) {
    		String filePath = fileEntry.getKey();
    		for (Map.Entry<String, Double> wordEntry : fileEntry.getValue().entrySet()) {
    			System.out.println("fileName:" + filePath + "     word:" + wordEntry.getKey()
    					+ "        tf-idf:" + wordEntry.getValue());
    		}
    	}

模糊查找:

		// BUG in the original: a first WordTrie instance ("w") was created and
		// never used — removed.
		System.out.println("--------");
		WordTrie trie = new WordTrie();
		// Dictionary for the fuzzy-search demo.
		String[] dictionary = {"we", "wei", "web", "weijie", "abi", "ai", "aqi",
				"biiiyou", "dqdi", "ji", "li", "liqing", "liqq", "liqqq",
				"qi", "qibi", "i", "ibiyzbi"};
		for (String word : dictionary) {
			trie.addWord(word);
		}
		// Prefix match: every stored word starting with "we".
		List<String> prefixMatches = trie.prefixSearchWord("we");
		for (String s : prefixMatches) {
			System.out.println(s);
		}
		// Fuzzy (substring) match for each query; the four copy-pasted
		// search-and-print sections are consolidated into one loop.
		for (String query : new String[] {"i", "bi", "q"}) {
			System.out.println("----------------------------------------");
			List<String> matches = trie.searchWord(query);
			for (String s : matches) {
				System.out.println(s);
			}
		}

由于Classifier4J包只能提取英文文章摘要,因此我们需要更改源码,
SimpleChineseSummariserAlgorithm.java:

package test;
 
import java.io.IOException;
import java.io.Reader;
import java.io.StringReader;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Comparator;
import java.util.Iterator;
import java.util.LinkedHashMap;
import java.util.LinkedHashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
 
import org.wltea.analyzer.core.IKSegmenter;
import org.wltea.analyzer.core.Lexeme;

import cn.edu.hfut.dmic.contentextractor.ContentExtractor;
import cn.edu.hfut.dmic.contentextractor.News;
 
 
 
/**
 * 
 * 

Title:

*

Description: SimpleSummariser *

* @createDate:2013-8-26 * @author xq * @version 1.0 */ public class SimpleChineseSummariserAlgorithm { /** * * @Title: summarise * @Description: 文章摘要实现 * @param @param input * @param @param numSentences * @param @return * @return String * @throws */ public static String summarise(String input, int numSentences, String split, String stop_sentence) { // get the frequency of each word in the input //得到文章的分词结果,并统计词在文章中出现的次数 Map wordFrequencies = segStr(input); // now create a set of the X most frequent words //统计高频词得到前一百个从大到小进行排序 Set mostFrequentWords = getMostFrequentWords(100, wordFrequencies).keySet(); // break the input up into sentences // workingSentences is used for the analysis, but // actualSentences is used in the results so that the // capitalisation will be correct. // 把段落按. ! ?分隔成句组 String[] workingSentences = getSentences(input.toLowerCase(), split); String[] actualSentences = getSentences(input, split); // iterate over the most frequent words, and add the first sentence // that includes each word to the result Set outputSentences = new LinkedHashSet(); // 遍历每个词 Iterator it = mostFrequentWords.iterator(); while (it.hasNext()) { String word = (String) it.next(); System.out.println("word: " + word); for (int i = 0; i < workingSentences.length; i++) { if (workingSentences[i].indexOf(word) >= 0) { outputSentences.add(actualSentences[i]); break; } if (outputSentences.size() >= numSentences) { break; } } if (outputSentences.size() >= numSentences) { break; } } List reorderedOutputSentences = reorderSentences(outputSentences, input); StringBuffer result = new StringBuffer(""); it = reorderedOutputSentences.iterator(); while (it.hasNext()) { String sentence = (String) it.next(); result.append(sentence); result.append(stop_sentence); // This isn't always correct - perhaps it should be whatever symbol the sentence finished with if (it.hasNext()) { result.append(" "); } } return result.toString(); } /** * * @Title: reorderSentences * @Description: 将句子按顺序输出 * @param 
@param outputSentences * @param @param input * @param @return * @return List * @throws */ private static List reorderSentences(Set outputSentences, final String input) { // reorder the sentences to the order they were in the // original text ArrayList result = new ArrayList(outputSentences); Collections.sort(result, new Comparator() { public int compare(String arg0, String arg1) { String sentence1 = (String) arg0; String sentence2 = (String) arg1; int indexOfSentence1 = input.indexOf(sentence1.trim()); int indexOfSentence2 = input.indexOf(sentence2.trim()); int result = indexOfSentence1 - indexOfSentence2; return result; } }); return result; } /** * * @Title: getMostFrequentWords * @Description: 对分词进行按数量排序,取出前num个 * @param @param num * @param @param words * @param @return * @return Map * @throws */ public static Map getMostFrequentWords(int num,Map words){ Map keywords = new LinkedHashMap(); int count=0; // 词频统计 List> info = new ArrayList>(words.entrySet()); Collections.sort(info, new Comparator>() { public int compare(Map.Entry obj1, Map.Entry obj2) { return obj2.getValue() - obj1.getValue(); } }); // 高频词输出 for (int j = 0; j < info.size(); j++) { // 词-->频 // 去掉长度为1 if(info.get(j).getKey().length()>1){ if(num>count){ keywords.put(info.get(j).getKey(), info.get(j).getValue()); count++; }else{ break; } } } return keywords; } /** * 统计出现每个关键字出现的次数 * @Title: segStr * @Description: 返回LinkedHashMap的分词 * @param @param content * @param @return * @return Map * @throws */ public static Map segStr(String content){ // 分词 Reader input = new StringReader(content); // 智能分词关闭(对分词的精度影响很大) IKSegmenter iks = new IKSegmenter(input, true); Lexeme lexeme = null; // LinkedHashMap:按照顺序存储到HashMap中 Map words = new LinkedHashMap(); try { while ((lexeme = iks.next()) != null) { if (words.containsKey(lexeme.getLexemeText())) { words.put(lexeme.getLexemeText(), words.get(lexeme.getLexemeText()) + 1); } else { words.put(lexeme.getLexemeText(), 1); } } }catch(IOException e) { e.printStackTrace(); 
} return words; } /** * 拆分段落 * @Title: getSentences * @Description: 把段落按. ! ?分隔成句组 * @param @param input * @param @return * @return String[] * @throws */ public static String[] getSentences(String input, String split) { if (input == null) { return new String[0]; } else { // split on a ".", a "!", a "?" followed by a space or EOL //"(\\.|!|\\?)+(\\s|\\z)" return input.split(split); } } public static void main(String[] args) throws Exception{ System.out.println("------"); String ssss="被告人:对? 关于王立军,有几个基本事实.首先,1月28日我是初次听到此事.并不相信谷开来会杀人.我跟11·15杀人案无关.我不是谷开来11·15杀人罪的共犯.这个大家都认可.实际上谷开来3月14日她在北京被抓走!" + "在这之前她一直非常确切地跟我说她没杀人,是王立军诬陷她.我在1月28日和次听到这个事时我不相信她会杀人." + "第二个事实,免王立军的局长.是多个因素.一个,我确实认为他诬陷谷开来.但我并不是想掩盖11·15,我是觉得他人品不好." + "因为谷开来和他是如胶似漆,谷开来对他是言听计从,那王立军也通过与谷开来的交往中打入了我的家庭." + "那现在发生这么严重的事.作为一个起码的人,要讲人格的话,你干吗不找谷开来商量,而跑我这里来说这些话?" + "第二个免他的原因,是他想要挟我.他多次谈他身体不好,打黑压力大,得罪了人." + "其实这是在表功.第三,徐某某给我反映了他有五六条问题.有记录.实际上免他是有这些原因的,绝不只是一个谷开来的原因.这是多因一果."; String split = "[.]|[?]"; String stop = "。"; System.out.println(summarise(ssss, 100, split, stop)); System.out.println("-------"); String url = "http://www.xinhuanet.com/politics/leaders/2018-12/14/c_1123856633.htm"; //获取正文 News news = ContentExtractor.getNewsByUrl(url); String s = news.getContent(); System.out.println("title: " + news.getTitle()); s = s.replace("。", "."); System.out.println("文章摘要: "); String[] ss = summarise(s, 100, split, stop).split("[.]"); // System.out.println(summarise(s, 1)); for(String str : ss) { System.out.println(str); } } }

你可能感兴趣的:(菜鸟的机器学习入门之路,java机器学习)