jar包下载:
https://download.csdn.net/download/dreamzuora/10853888
代码使用:
余弦相似度:
Double result=cosSimilarityByString("关于王立军,有几个基本事实。首先,1月28日我是初次听到此事,并不相信谷开来会杀人,我跟11·15杀人案无关,我不是谷开来11·15杀人罪的共犯,这个大家都认可",
"实际上免他是有这些原因的,绝不只是一个谷开来的原因。这是多因一果。");
System.out.println(result);
文章摘要:
String s="被告人:对? 关于王立军,有几个基本事实.首先,1月28日我是初次听到此事.并不相信谷开来会杀人.我跟11·15杀人案无关.我不是谷开来11·15杀人罪的共犯.这个大家都认可.实际上谷开来3月14日她在北京被抓走!" +
"在这之前她一直非常确切地跟我说她没杀人,是王立军诬陷她.我在1月28日和次听到这个事时我不相信她会杀人." +
"第二个事实,免王立军的局长.是多个因素.一个,我确实认为他诬陷谷开来.但我并不是想掩盖11·15,我是觉得他人品不好." +
"因为谷开来和他是如胶似漆,谷开来对他是言听计从,那王立军也通过与谷开来的交往中打入了我的家庭." +
"那现在发生这么严重的事.作为一个起码的人,要讲人格的话,你干吗不找谷开来商量,而跑我这里来说这些话?" +
"第二个免他的原因,是他想要挟我.他多次谈他身体不好,打黑压力大,得罪了人." +
"其实这是在表功.第三,徐某某给我反映了他有五六条问题.有记录.实际上免他是有这些原因的,绝不只是一个谷开来的原因.这是多因一果.";
System.out.println(summarise(s, 3));
拼写纠正:
System.out.println(correct("speling"));
System.out.println(correct("love"));
System.out.println(correct("korrecter"));
System.out.println(correct("korrect"));
System.out.println(correct("qove"));
TFIDF:
System.out.println("tf--------------------------------------");
Map<String, Map<String, Double>> allTfMap=TfIdfAlgorithm.allTf("d://dir");
Set<String> fileList=allTfMap.keySet();
for(String filePath : fileList){
Map<String, Double> tfMap=allTfMap.get(filePath);
Set<String> words=tfMap.keySet();
for(String word: words){
System.out.println("fileName:"+filePath+" word:"+word+" tf:"+tfMap.get(word));
}
}
System.out.println("idf--------------------------------------");
Map<String, Double> idfMap=TfIdfAlgorithm.idf(allSegsMap);
Set<String> words=idfMap.keySet();
for(String word : words){
System.out.println("word:"+word+" idf:"+idfMap.get(word));
}
System.out.println("tf-idf--------------------------------------");
Map<String, Map<String, Double>> tfIdfMap=TfIdfAlgorithm.tfIdf(allTfMap, idfMap);
Set<String> files=tfIdfMap.keySet();
for(String filePath : files){
Map<String, Double> tfIdf=tfIdfMap.get(filePath);
Set<String> segs=tfIdf.keySet();
for(String word: segs){
System.out.println("fileName:"+filePath+" word:"+word+" tf-idf:"+tfIdf.get(word));
}
}
模糊查找:
WordTrie w = new WordTrie();
System.out.println("--------");
WordTrie trie=new WordTrie();
trie.addWord("we");
trie.addWord("wei");
trie.addWord("web");
trie.addWord("weijie");
trie.addWord("abi");
trie.addWord("ai");
trie.addWord("aqi");
trie.addWord("biiiyou");
trie.addWord("dqdi");
trie.addWord("ji");
trie.addWord("li");
trie.addWord("liqing");
trie.addWord("liqq");
trie.addWord("liqqq");
trie.addWord("qi");
trie.addWord("qibi");
trie.addWord("i");
trie.addWord("ibiyzbi");
//全部模糊匹配
List<String> list=trie.prefixSearchWord("we");
for(String s: list){
System.out.println(s);
}
System.out.println("----------------------------------------");
List<String> li=trie.searchWord("i");
for(String s: li){
System.out.println(s);
}
System.out.println("----------------------------------------");
List<String> words=trie.searchWord("bi");
for(String s: words){
System.out.println(s);
}
System.out.println("----------------------------------------");
List<String> lst=trie.searchWord("q");
for(String s: lst){
System.out.println(s);
}
由于Class4j包只能提取英文文章摘要,因此我们需要更改源码:
SimpleChineseSummariserAlgorithm.java:
package test;
import java.io.IOException;
import java.io.Reader;
import java.io.StringReader;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Comparator;
import java.util.Iterator;
import java.util.LinkedHashMap;
import java.util.LinkedHashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
import org.wltea.analyzer.core.IKSegmenter;
import org.wltea.analyzer.core.Lexeme;
import cn.edu.hfut.dmic.contentextractor.ContentExtractor;
import cn.edu.hfut.dmic.contentextractor.News;
/**
*
* Title:
* Description: SimpleSummariser
*
* @createDate:2013-8-26
* @author xq
* @version 1.0
*/
public class SimpleChineseSummariserAlgorithm {
/**
*
* @Title: summarise
* @Description: 文章摘要实现
* @param @param input
* @param @param numSentences
* @param @return
* @return String
* @throws
*/
public static String summarise(String input, int numSentences, String split, String stop_sentence) {
// get the frequency of each word in the input
//得到文章的分词结果,并统计词在文章中出现的次数
Map wordFrequencies = segStr(input);
// now create a set of the X most frequent words
//统计高频词得到前一百个从大到小进行排序
Set mostFrequentWords = getMostFrequentWords(100, wordFrequencies).keySet();
// break the input up into sentences
// workingSentences is used for the analysis, but
// actualSentences is used in the results so that the
// capitalisation will be correct.
// 把段落按. ! ?分隔成句组
String[] workingSentences = getSentences(input.toLowerCase(), split);
String[] actualSentences = getSentences(input, split);
// iterate over the most frequent words, and add the first sentence
// that includes each word to the result
Set outputSentences = new LinkedHashSet();
// 遍历每个词
Iterator it = mostFrequentWords.iterator();
while (it.hasNext()) {
String word = (String) it.next();
System.out.println("word: " + word);
for (int i = 0; i < workingSentences.length; i++) {
if (workingSentences[i].indexOf(word) >= 0) {
outputSentences.add(actualSentences[i]);
break;
}
if (outputSentences.size() >= numSentences) {
break;
}
}
if (outputSentences.size() >= numSentences) {
break;
}
}
List reorderedOutputSentences = reorderSentences(outputSentences, input);
StringBuffer result = new StringBuffer("");
it = reorderedOutputSentences.iterator();
while (it.hasNext()) {
String sentence = (String) it.next();
result.append(sentence);
result.append(stop_sentence); // This isn't always correct - perhaps it should be whatever symbol the sentence finished with
if (it.hasNext()) {
result.append(" ");
}
}
return result.toString();
}
/**
*
* @Title: reorderSentences
* @Description: 将句子按顺序输出
* @param @param outputSentences
* @param @param input
* @param @return
* @return List
* @throws
*/
private static List reorderSentences(Set outputSentences, final String input) {
// reorder the sentences to the order they were in the
// original text
ArrayList result = new ArrayList(outputSentences);
Collections.sort(result, new Comparator() {
public int compare(String arg0, String arg1) {
String sentence1 = (String) arg0;
String sentence2 = (String) arg1;
int indexOfSentence1 = input.indexOf(sentence1.trim());
int indexOfSentence2 = input.indexOf(sentence2.trim());
int result = indexOfSentence1 - indexOfSentence2;
return result;
}
});
return result;
}
/**
*
* @Title: getMostFrequentWords
* @Description: 对分词进行按数量排序,取出前num个
* @param @param num
* @param @param words
* @param @return
* @return Map
* @throws
*/
public static Map getMostFrequentWords(int num,Map words){
Map keywords = new LinkedHashMap();
int count=0;
// 词频统计
List> info = new ArrayList>(words.entrySet());
Collections.sort(info, new Comparator>() {
public int compare(Map.Entry obj1, Map.Entry obj2) {
return obj2.getValue() - obj1.getValue();
}
});
// 高频词输出
for (int j = 0; j < info.size(); j++) {
// 词-->频
// 去掉长度为1
if(info.get(j).getKey().length()>1){
if(num>count){
keywords.put(info.get(j).getKey(), info.get(j).getValue());
count++;
}else{
break;
}
}
}
return keywords;
}
/**
* 统计出现每个关键字出现的次数
* @Title: segStr
* @Description: 返回LinkedHashMap的分词
* @param @param content
* @param @return
* @return Map
* @throws
*/
public static Map segStr(String content){
// 分词
Reader input = new StringReader(content);
// 智能分词关闭(对分词的精度影响很大)
IKSegmenter iks = new IKSegmenter(input, true);
Lexeme lexeme = null;
// LinkedHashMap:按照顺序存储到HashMap中
Map words = new LinkedHashMap();
try {
while ((lexeme = iks.next()) != null) {
if (words.containsKey(lexeme.getLexemeText())) {
words.put(lexeme.getLexemeText(), words.get(lexeme.getLexemeText()) + 1);
} else {
words.put(lexeme.getLexemeText(), 1);
}
}
}catch(IOException e) {
e.printStackTrace();
}
return words;
}
/**
* 拆分段落
* @Title: getSentences
* @Description: 把段落按. ! ?分隔成句组
* @param @param input
* @param @return
* @return String[]
* @throws
*/
public static String[] getSentences(String input, String split) {
if (input == null) {
return new String[0];
} else {
// split on a ".", a "!", a "?" followed by a space or EOL
//"(\\.|!|\\?)+(\\s|\\z)"
return input.split(split);
}
}
public static void main(String[] args) throws Exception{
System.out.println("------");
String ssss="被告人:对? 关于王立军,有几个基本事实.首先,1月28日我是初次听到此事.并不相信谷开来会杀人.我跟11·15杀人案无关.我不是谷开来11·15杀人罪的共犯.这个大家都认可.实际上谷开来3月14日她在北京被抓走!" +
"在这之前她一直非常确切地跟我说她没杀人,是王立军诬陷她.我在1月28日和次听到这个事时我不相信她会杀人." +
"第二个事实,免王立军的局长.是多个因素.一个,我确实认为他诬陷谷开来.但我并不是想掩盖11·15,我是觉得他人品不好." +
"因为谷开来和他是如胶似漆,谷开来对他是言听计从,那王立军也通过与谷开来的交往中打入了我的家庭." +
"那现在发生这么严重的事.作为一个起码的人,要讲人格的话,你干吗不找谷开来商量,而跑我这里来说这些话?" +
"第二个免他的原因,是他想要挟我.他多次谈他身体不好,打黑压力大,得罪了人." +
"其实这是在表功.第三,徐某某给我反映了他有五六条问题.有记录.实际上免他是有这些原因的,绝不只是一个谷开来的原因.这是多因一果.";
String split = "[.]|[?]";
String stop = "。";
System.out.println(summarise(ssss, 100, split, stop));
System.out.println("-------");
String url = "http://www.xinhuanet.com/politics/leaders/2018-12/14/c_1123856633.htm";
//获取正文
News news = ContentExtractor.getNewsByUrl(url);
String s = news.getContent();
System.out.println("title: " + news.getTitle());
s = s.replace("。", ".");
System.out.println("文章摘要: ");
String[] ss = summarise(s, 100, split, stop).split("[.]");
// System.out.println(summarise(s, 1));
for(String str : ss) {
System.out.println(str);
}
}
}