I mentioned before that I was using FudanNLP for news keyword extraction, but my team lead wasn't happy with it, so I switched to ICTCLAS. ICTCLAS ran fine on my Ubuntu 13.04 box, yet it kept throwing errors on other people's machines, so I had no choice but to switch tools once more. On Xiaoyang's recommendation I went with ANSJ this time, which is said to be a Java port of the earlier ICTCLAS.
The tool is on GitHub at https://github.com/ansjsun/ansj_seg; head there if you want to read the source. Below is how to use it to extract keywords.
1 - Download the JAR package
Downloading the JAR is obviously the least-effort route, though you can also import the GitHub project directly. I searched for a prebuilt JAR for a long time without luck and finally had to ask the author for one; it is now up on CSDN, so feel free to download it: http://download.csdn.net/detail/jj12345jj198999/6020541
2 - Customize the Keyword class
ANSJ already implements keyword extraction, but its output doesn't expose each keyword's weight, so the Keyword class has to be modified by hand. Fortunately the source on GitHub already defines the score members; all we need to add is a getter.
public class Keyword implements Comparable<Keyword> {
    private String name;
    private double score;
    private double idf;
    private int freq;

    public Keyword(String name, int docFreq, int weight) {
        this.name = name;
        this.idf = Math.log(10000 + 10000.0 / (docFreq + 1));
        this.score = idf * weight;
        freq++;
    }

    // each further occurrence adds weight * idf to the score
    public void updateWeight(int weight) {
        this.score += weight * idf;
        freq++;
    }

    // inverted on purpose: higher scores sort first in a TreeSet
    public int compareTo(Keyword o) {
        if (this.score < o.score) {
            return 1;
        } else {
            return -1;
        }
    }

    public boolean equals(Object obj) {
        if (obj instanceof Keyword) {
            Keyword k = (Keyword) obj;
            return k.name.equals(name);
        } else {
            return false;
        }
    }

    public String toString() {
        return name;
    }

    // look here: the added getters **************************************
    public double getScore() {
        return score;
    }

    public int getFreq() {
        return freq;
    }
}
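As a quick sanity check on the inverted compareTo, here is a minimal sketch (the docFreq and weight values are made up for illustration) showing that a TreeSet of Keyword objects iterates from highest score to lowest, which is what the KeyWordComputer below relies on:

import java.util.TreeSet;

// Minimal sketch: hypothetical docFreq/weight values, just to show ordering.
public class KeywordOrderDemo {
    public static void main(String[] args) {
        TreeSet<Keyword> set = new TreeSet<Keyword>();
        set.add(new Keyword("foo", 100, 5));
        set.add(new Keyword("bar", 100, 9)); // larger weight, larger score
        for (Keyword k : set) {
            // compareTo returns -1 when this.score >= o.score, so the
            // TreeSet iterates highest score first: "bar" before "foo".
            System.out.println(k + " " + k.getScore());
        }
    }
}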
The KeyWordComputer class that actually drives the extraction looks like this:

import java.util.ArrayList;
import java.util.Collection;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.TreeSet;

import org.ansj.app.newWord.LearnTool;
import org.ansj.domain.Term;
import org.ansj.splitWord.analysis.NlpAnalysis;

public class KeyWordComputer {
    private int nKeyword = 10;

    // default constructor: keyword number = 10
    public KeyWordComputer() {
        nKeyword = 10;
    }

    // constructor that sets the keyword number
    public KeyWordComputer(int nKeyword) {
        this.nKeyword = nKeyword;
    }

    // get the keyword object list for one article
    private List<Keyword> computeArticleTfidf(String content, int titleLength) {
        Map<String, Keyword> tm = new HashMap<String, Keyword>();
        LearnTool learn = new LearnTool();
        List<Term> parse = NlpAnalysis.parse(content, learn);
        // parse a second time, so words the LearnTool picked up on the
        // first pass can be recognized on this one
        parse = NlpAnalysis.parse(content, learn);
        for (Term term : parse) {
            int weight = getWeight(term, content.length(), titleLength);
            if (weight == 0)
                continue;
            Keyword keyword = tm.get(term.getName());
            if (keyword == null) {
                keyword = new Keyword(term.getName(), term.getNatrue().allFrequency, weight);
                tm.put(term.getName(), keyword);
            } else {
                keyword.updateWeight(1);
            }
        }
        TreeSet<Keyword> treeSet = new TreeSet<Keyword>(tm.values());
        ArrayList<Keyword> arrayList = new ArrayList<Keyword>(treeSet);
        if (treeSet.size() < nKeyword) {
            return arrayList;
        } else {
            return arrayList.subList(0, nKeyword);
        }
    }

    // get keywords from a title plus a body
    public Collection<Keyword> computeArticleTfidf(String title, String content) {
        return computeArticleTfidf(title + "\t" + content, title.length());
    }

    // get keywords from a body only
    public Collection<Keyword> computeArticleTfidf(String content) {
        return computeArticleTfidf(content, 0);
    }

    // weight of a single occurrence: filter out numbers, single characters,
    // and non-nouns, then reward terms in the title or near the start
    private int getWeight(Term term, int length, int titleLength) {
        if (term.getName().matches("(?s)\\d.*")) {
            return 0;
        }
        if (term.getName().trim().length() < 2) {
            return 0;
        }
        String pos = term.getNatrue().natureStr;
        if (!pos.startsWith("n") || "num".equals(pos)) {
            return 0;
        }
        int weight = 0;
        if (titleLength > term.getOffe()) {
            return 20; // the term appears in the title
        }
        // otherwise the weight decays with the term's position in the body
        double position = (term.getOffe() + 0.0) / length;
        if (position < 0.05)
            return 10;
        weight += (5 - 5 * position);
        return weight;
    }
}
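To make the scoring concrete, here is a small hand-worked sketch (all inputs are hypothetical) of how one occurrence is weighted and turned into a score. Note that, as the idf formula is written, the log argument only ranges from 10000 to 20000, so idf stays between roughly 9.2 and 9.9 and the final score is dominated by the position weight and the repeat count.

// A minimal arithmetic sketch mirroring getWeight and the Keyword
// constructor. All inputs are made up for illustration.
public class ScoreSketch {
    public static void main(String[] args) {
        int offe = 100;    // hypothetical term offset: 100 chars into the body
        int length = 1000; // hypothetical article length
        int docFreq = 50;  // hypothetical dictionary frequency

        // position-based weight, as in getWeight (title case excluded here)
        double position = (offe + 0.0) / length;  // 0.10
        int weight = 0;
        weight += (5 - 5 * position);             // 4.5, truncated to 4

        // first-occurrence score, as in the Keyword constructor
        double idf = Math.log(10000 + 10000.0 / (docFreq + 1)); // ~9.23
        double score = idf * weight;                            // ~36.9
        System.out.println("weight=" + weight + " idf=" + idf + " score=" + score);
    }
}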
3 - Test
Finally, write a class to test it all:
import java.io.BufferedReader;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.util.Iterator;

public class test {

    public static void main(String[] args) throws IOException {
        // first test file
        String filePath = "./test-utf8.TXT";
        String tt = "";
        BufferedReader in = new BufferedReader(
                new InputStreamReader(new FileInputStream(filePath), "UTF8"));
        String str;
        while ((str = in.readLine()) != null) {
            tt += str;
        }
        test1(tt);

        System.out.println("*************************");

        // second test file
        filePath = "./test1.txt";
        String tt2 = "";
        BufferedReader in2 = new BufferedReader(
                new InputStreamReader(new FileInputStream(filePath), "UTF8"));
        String str2;
        while ((str2 = in2.readLine()) != null) {
            tt2 += str2;
        }
        test1(tt2);
    }

    // extract the top 10 keywords and print each one with its score
    public static void test1(String content) {
        KeyWordComputer key = new KeyWordComputer(10);
        Iterator<Keyword> it = key.computeArticleTfidf(content).iterator();
        while (it.hasNext()) {
            Keyword key2 = it.next();
            System.out.println(key2.toString() + key2.getScore());
        }
    }
}
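One thing to keep in mind when running this: the two sample files test-utf8.TXT and test1.txt are read from the working directory and decoded as UTF-8 (the "UTF8" charset passed to InputStreamReader), so make sure they are actually saved in that encoding or the segmenter will see garbled text.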
Running the test produces:

init ambiguity waring :library/ambiguity.dic because : not find that file or can not to read !
init core library ok use time :3983
init ngram ok use time :2023
屌丝528.8693014046396
李毅202.62858475668492
网络174.9965471938941
球迷110.52413506982782
群体110.52413506982782
人人110.52413506982782
名号101.31379048067551
高富帅92.10390216212956
满屏92.10390216212954
网友92.1034458915232
*************************
社会主义1446.0241004969153
社会1326.289620837935
中国1096.0347881537828
人民1049.9792831633652
文化874.9827359694709
经济874.9827359694709
特色847.3517022020139
制度801.2999792562523
体系746.0379117213383
国家598.6723982949011
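The first log line is just ANSJ warning that the optional ambiguity dictionary library/ambiguity.dic was not found in the working directory; as the run shows, extraction still works without it. The first block of keywords comes from test-utf8.TXT and the second, after the separator, from test1.txt.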