利用OpenNLP进行人名命名实体识别,代码来源于《驾驭文本》第五章。
import java.io.File; import java.io.FileInputStream; import opennlp.tools.namefind.NameFinderME; import opennlp.tools.namefind.TokenNameFinderModel; import opennlp.tools.tokenize.SimpleTokenizer; import opennlp.tools.tokenize.Tokenizer; import opennlp.tools.util.Span; public class NamedEntityExtraction { public static void main(String[] args) throws Exception { String[] sentences = { "Former first lady Nancy Reagan was taken to a " + "suburban Los Angeles " + "hospital \"as a precaution\" Sunday after a fall at " + "her home, an " + "aide said. ", "The 86-year-old Reagan will remain overnight for " + "observation at a hospital in Santa Monica, California, " + "said Joanne " + "Drake, chief of staff for the Reagan Foundation." };//两句话 NameFinderME finder = new NameFinderME(new TokenNameFinderModel( new FileInputStream(new File(System.getProperty("model.dir"), "./nlpbin/en-ner-person.bin"))));//在http://opennlp.sourceforge.net/models-1.5/ 下载en-ner-person.bin Tokenizer tokenizer = SimpleTokenizer.INSTANCE;//初始化简单切词(也就是按空格切词) for (int si = 0; si < sentences.length; si++) { String[] tokens = tokenizer.tokenize(sentences[si]);//第一句话[Former, first, lady, Nancy, Reagan, was, taken, to, a, suburban, Los, Angeles, hospital, ", as, a, precaution, ", Sunday, after, a, fall, at, her, home, ,, an, aide, said, .] //第二句话[The, 86, -, year, -, old, Reagan, will, remain, overnight, for, observation, at, a, hospital, in, Santa, Monica, ,, California, ,, said, Joanne, Drake, ,, chief, of, staff, for, the, Reagan, Foundation, .] 
Span[] names = finder.find(tokens); displayNames(names, tokens); } finder.clearAdaptiveData(); } public static void displayNames(Span[] names, String[] tokens) { for (int si = 0; si < names.length; si++) { StringBuilder cb = new StringBuilder(); for (int ti = names[si].getStart(); ti < names[si].getEnd(); ti++) { cb.append(tokens[ti]).append(" "); } System.out.println(cb.substring(0, cb.length() - 1)); System.out.print("\tstart: " + names[si].getStart()); System.out.println("\tend: " + names[si].getEnd());//输出实体的起止位置 System.out.println("\ttype: " + names[si].getType());//输出实体的类型type System.out.println("\tprob: " + names[si].getProb());//输出实体的概率prob } } }
Span 包含以下几个属性:start(该实体的起始 token 下标,包含该位置)、end(该实体的终止 token 下标,不包含该位置,即实体对应 tokens[start] 到 tokens[end-1])、prob(该实体的概率)以及 type(实体的类型)。
输出结果如下:
Nancy Reagan
    start: 3    end: 5
    type: person
    prob: 0.9704748832886989
Reagan
    start: 6    end: 7
    type: person
    prob: 0.9996172457889334
Joanne Drake
    start: 22    end: 24
    type: person
    prob: 0.9929295961937021
Reagan
    start: 30    end: 31
    type: person
    prob: 0.9976318413669909