http://www.zmonster.me/2016/06/08/use-stanford-nlp-package-in-nltk.html
http://stanfordnlp.github.io/CoreNLP/
http://blog.csdn.net/ltbylc/article/details/8557965
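Notes on calling the Stanford NLP tools from Java, covering four tools in turn: the Chinese word segmenter, the POS tagger, the NER classifier, and the lexicalized parser.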
Give the JVM at least 1 GB of heap (-mx1g) when running the segmenter; the CRF models are large.
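For reference, a plain command-line run of the segmenter looks roughly like this (jar and data-file names are assumptions based on the standard stanford-segmenter distribution; the flags mirror the properties set in the Java code below):

java -mx1g -cp stanford-segmenter.jar edu.stanford.nlp.ie.crf.CRFClassifier \
  -sighanCorporaDict data -serDictionary data/dict-chris6.ser.gz \
  -inputEncoding UTF-8 -sighanPostProcessing true \
  -loadClassifier data/ctb.gz -textFile input.txt > segmented.txt

The same configuration, done programmatically: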
import java.util.List;
import java.util.Properties;
import edu.stanford.nlp.ie.crf.CRFClassifier;
import edu.stanford.nlp.ling.CoreLabel;

public class stanfordSeg {

    // Segment a raw string and join the resulting words with spaces.
    public static String doSegment(String data, CRFClassifier<CoreLabel> c) {
        List<String> words = c.segmentString(data); // segmentString returns List<String>
        StringBuilder buf = new StringBuilder();
        for (String s : words) {
            buf.append(s).append(' ');
        }
        return buf.toString();
    }

    public static void main(String[] args) throws Exception {
        Properties props = new Properties();
        props.setProperty("sighanCorporaDict", "data");
        props.setProperty("serDictionary", "data/dict-chris6.ser.gz");
        props.setProperty("inputEncoding", "UTF-8");
        props.setProperty("sighanPostProcessing", "true");

        CRFClassifier<CoreLabel> classifier = new CRFClassifier<>(props);
        classifier.loadClassifierNoExceptions("data/ctb.gz", props);
        classifier.flags.setProperties(props);

        String sentence = "某处女同志去吃饭。";
        String ret = doSegment(sentence, classifier);
        System.out.println(ret);
    }
}
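The test sentence is a classic segmentation ambiguity: 某处女同志去吃饭。 can split as 某处 / 女同志 ("a female comrade somewhere") or as 某 / 处女 / 同志, so it makes a handy sanity check for the model. To compile and run (classpath is an assumption, as above):

javac -cp stanford-segmenter.jar stanfordSeg.java
java -mx1g -cp .:stanford-segmenter.jar stanfordSeg

Next, POS tagging. The Chinese tagger can be run directly from the command line: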
java -mx300m -classpath stanford-postagger.jar edu.stanford.nlp.tagger.maxent.MaxentTagger \
  -model models/chinese-distsim.tagger -textFile inputFile > outputFile
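It can also be called from Java. The demo below is adapted from the TaggerDemo class shipped with the tagger; it takes the model file and the file to tag as arguments: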
import java.io.BufferedReader;
import java.io.FileReader;
import java.util.List;
import edu.stanford.nlp.ling.HasWord;
import edu.stanford.nlp.ling.Sentence;
import edu.stanford.nlp.ling.TaggedWord;
import edu.stanford.nlp.tagger.maxent.MaxentTagger;

class stanfordSeg {

    private stanfordSeg() {}

    public static void main(String[] args) throws Exception {
        if (args.length != 2) {
            System.err.println("usage: java stanfordSeg modelFile fileToTag");
            return;
        }
        MaxentTagger tagger = new MaxentTagger(args[0]);
        // Read the file and split it into sentences of HasWord tokens.
        List<List<HasWord>> sentences =
            MaxentTagger.tokenizeText(new BufferedReader(new FileReader(args[1])));
        for (List<HasWord> sentence : sentences) {
            List<TaggedWord> tSentence = tagger.tagSentence(sentence);
            System.out.println(Sentence.listToString(tSentence, false));
        }
    }
}
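Run it with the Chinese model, e.g. (classpath is an assumption):

java -mx300m -cp .:stanford-postagger.jar stanfordSeg models/chinese-distsim.tagger input.txt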
http://blog.csdn.net/sparkexpert/article/details/49497231
http://blog.csdn.net/shijiebei2009/article/details/42525091
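NER for Chinese runs on already-segmented text (note the spaces in the test string below) and loads the chinese.misc.distsim CRF model: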
import edu.stanford.nlp.ie.AbstractSequenceClassifier;
import edu.stanford.nlp.ie.crf.CRFClassifier;
import edu.stanford.nlp.ling.CoreLabel;

/**
 * ClassName: ExtractDemo
 * Description: loads the NER module
 *
 * @author wangxu [email protected]
 * Date: 2015-01-08 14:53:45
 * @version V1.0.0
 */
public class stanfordtest {

    private static AbstractSequenceClassifier<CoreLabel> ner;

    public stanfordtest() {
        initNer();
    }

    public void initNer() {
        String serializedClassifier = "classifiers/chinese.misc.distsim.crf.ser.gz";
        if (ner == null) {
            ner = CRFClassifier.getClassifierNoExceptions(serializedClassifier);
        }
    }

    public String doNer(String sent) {
        // Tag named entities and mark them with inline XML tags.
        return ner.classifyWithInlineXML(sent);
    }

    public static void main(String[] args) {
        String str = "我 去 吃饭 , 告诉 李强 一声 。";
        stanfordtest extractDemo = new stanfordtest();
        System.out.println(extractDemo.doNer(str));
        System.out.println("Complete!");
    }
}
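classifyWithInlineXML wraps each recognized entity in XML-style tags, so the output should look something like 我 去 吃饭 , 告诉 <PERSON>李强</PERSON> 一声 。 (the exact tag set depends on the model).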
http://blog.sina.com.cn/s/blog_8af106960101abvu.html
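Dependency parsing with the Chinese PCFG model from the CoreNLP models jar; the input sentence is pre-segmented, with tokens separated by spaces: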
// package com.parser;
import java.io.StringReader;
import java.util.List;
import edu.stanford.nlp.ling.CoreLabel;
import edu.stanford.nlp.parser.lexparser.LexicalizedParser;
import edu.stanford.nlp.process.CoreLabelTokenFactory;
import edu.stanford.nlp.process.PTBTokenizer;
import edu.stanford.nlp.process.Tokenizer;
import edu.stanford.nlp.process.TokenizerFactory;
import edu.stanford.nlp.trees.*;

public class stanfordtest {

    private stanfordtest() {} // static methods only

    public static void main(String[] args) {
        String parserModel = "edu/stanford/nlp/models/lexparser/chinesePCFG.ser.gz";
        LexicalizedParser lp = LexicalizedParser.loadModel(parserModel);
        String sent2 = "他 和 我 在 学校 里 常 打 台球.";
        demoAPI(lp, sent2);
    }

    public static void demoAPI(LexicalizedParser lp, String str) {
        // The input is pre-segmented; the PTB tokenizer splits it into tokens.
        TokenizerFactory<CoreLabel> tokenizerFactory =
            PTBTokenizer.factory(new CoreLabelTokenFactory(), "");
        Tokenizer<CoreLabel> tok = tokenizerFactory.getTokenizer(new StringReader(str));
        List<CoreLabel> rawWords2 = tok.tokenize();
        Tree parse = lp.apply(rawWords2);

        TreebankLanguagePack tlp = lp.treebankLanguagePack(); // ChineseTreebankLanguagePack here
        GrammaticalStructureFactory gsf = tlp.grammaticalStructureFactory();
        GrammaticalStructure gs = gsf.newGrammaticalStructure(parse);
        List<TypedDependency> tdl = gs.typedDependenciesCCprocessed();
        for (TypedDependency td : tdl) {
            System.out.println(td);
        }
        // You can also use a TreePrint object to print trees and dependencies:
        // TreePrint tp = new TreePrint("penn,typedDependenciesCollapsed");
        // tp.printTree(parse);
    }
}
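Each TypedDependency prints as relation(governor-index, dependent-index), e.g. something like nsubj(打-8, 他-1) for the sentence above (exact relation names depend on the model and CoreNLP version).

A fuller version, adapted from the ParserDemo class in the parser distribution, which also shows reading and sentence-splitting a file with DocumentPreprocessor: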
import java.util.Collection;
import java.util.List;
import java.io.StringReader;
import edu.stanford.nlp.process.Tokenizer;
import edu.stanford.nlp.process.TokenizerFactory;
import edu.stanford.nlp.process.CoreLabelTokenFactory;
import edu.stanford.nlp.process.DocumentPreprocessor;
import edu.stanford.nlp.process.PTBTokenizer;
import edu.stanford.nlp.ling.CoreLabel;
import edu.stanford.nlp.ling.HasWord;
import edu.stanford.nlp.ling.Sentence;
import edu.stanford.nlp.trees.*;
import edu.stanford.nlp.parser.lexparser.LexicalizedParser;
class stanfordtest {
    /**
     * The main method demonstrates the easiest way to load a parser.
     * Simply call loadModel and specify the path of a serialized grammar
     * model, which can be a file, a resource on the classpath, or even a URL.
     * For example, this demonstrates loading a grammar from the models jar
     * file, which you therefore need to include on the classpath for ParserDemo
     * to work.
     *
     * Usage: {@code java ParserDemo [[model] textFile]}
     * e.g.: java ParserDemo edu/stanford/nlp/models/lexparser/chineseFactored.ser.gz data/chinese-onesent-utf8.txt
     */
    public static void main(String[] args) {
        String parserModel = "edu/stanford/nlp/models/lexparser/chinesePCFG.ser.gz";
        if (args.length > 0) {
            parserModel = args[0];
        }
        LexicalizedParser lp = LexicalizedParser.loadModel(parserModel);
        if (args.length == 0) {
            demoAPI(lp);
        } else {
            String textFile = (args.length > 1) ? args[1] : args[0];
            demoDP(lp, textFile);
        }
    }
    /**
     * demoDP demonstrates turning a file into tokens and then parse
     * trees. Note that the trees are printed by calling pennPrint on
     * the Tree object. It is also possible to pass a PrintWriter to
     * pennPrint if you want to capture the output.
     * This code will work with any supported language.
     */
    public static void demoDP(LexicalizedParser lp, String filename) {
        // This option shows loading, sentence-segmenting and tokenizing
        // a file using DocumentPreprocessor.
        TreebankLanguagePack tlp = lp.treebankLanguagePack();
        GrammaticalStructureFactory gsf = null;
        if (tlp.supportsGrammaticalStructures()) {
            gsf = tlp.grammaticalStructureFactory();
        }
        // You could also create a tokenizer here (as below) and pass it
        // to DocumentPreprocessor.
        for (List<HasWord> sentence : new DocumentPreprocessor(filename)) {
            Tree parse = lp.apply(sentence);
            parse.pennPrint();
            System.out.println();
            if (gsf != null) {
                GrammaticalStructure gs = gsf.newGrammaticalStructure(parse);
                Collection<TypedDependency> tdl = gs.typedDependenciesCCprocessed();
                System.out.println(tdl);
                System.out.println();
            }
        }
    }
    /**
     * demoAPI demonstrates other ways of calling the parser with
     * already tokenized text, or in some cases, raw text that needs to
     * be tokenized as a single sentence. Output is handled with a
     * TreePrint object. Note that the options used when creating the
     * TreePrint can determine what results to print out. Once again,
     * one can capture the output by passing a PrintWriter to
     * TreePrint.printTree.
     */
    public static void demoAPI(LexicalizedParser lp) {
        // This option shows parsing a list of correctly tokenized words.
        String[] sent = { "他", "和", "我", "经常", "打", "台球", "." };
        List<CoreLabel> rawWords = Sentence.toCoreLabelList(sent);
        Tree parse = lp.apply(rawWords);
        parse.pennPrint();
        System.out.println();

        // This option shows loading and using an explicit tokenizer.
        // (This sentence comes from the original English ParserDemo; with the
        // Chinese model loaded above, substitute a Chinese sentence.)
        String sent2 = "This is another sentence.";
        TokenizerFactory<CoreLabel> tokenizerFactory =
            PTBTokenizer.factory(new CoreLabelTokenFactory(), "");
        Tokenizer<CoreLabel> tok =
            tokenizerFactory.getTokenizer(new StringReader(sent2));
        List<CoreLabel> rawWords2 = tok.tokenize();
        parse = lp.apply(rawWords2);

        TreebankLanguagePack tlp = lp.treebankLanguagePack();
        GrammaticalStructureFactory gsf = tlp.grammaticalStructureFactory();
        GrammaticalStructure gs = gsf.newGrammaticalStructure(parse);
        List<TypedDependency> tdl = gs.typedDependenciesCCprocessed();
        System.out.println(tdl);
        System.out.println();

        // You can also use a TreePrint object to print trees and dependencies.
        TreePrint tp = new TreePrint("penn,typedDependenciesCollapsed");
        tp.printTree(parse);
    }

    private stanfordtest() {} // static methods only
}
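For either parser demo, both the parser jar and a models jar containing chinesePCFG.ser.gz must be on the classpath, since the model path is resolved as a classpath resource. A possible invocation (jar names are assumptions and vary by release):

java -cp .:stanford-parser.jar:stanford-chinese-corenlp-models.jar stanfordtest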