TextExtract(2)NLP Basic
1. Basic Introduction
NLP - Natural Language Processing
Typical preprocessing steps: remove noise, strip the HTML tags, remove the stop words, and stem.
The tools include a sentence detector, a part-of-speech (POS) tagger (verbs, nouns, etc.), and a treebank parser.
Sentence Detector - return the sentences
Tokenizer - usually a word is one token, but sometimes one word becomes 2 tokens. For example, "don't" becomes "do" and "n't".
POS Tagger - assigns a part-of-speech tag to each token (verb, adverb, personal pronoun, etc.)
Treebank Chunker - groups tokens into verb phrases and noun phrases
Treebank Parser -
2. Basic Code Example
Download the file apache-opennlp-1.6.0-bin.tar.gz and place it in the working directory.
> opennlp
OpenNLP 1.6.0. Usage: opennlp TOOL
>opennlp ToolName lang-model-name.bin
>opennlp ToolName lang-model-name.bin < input.txt > output.txt
General Pattern
Build the model from the xxx.bin file, build the tool from the model, execute the task with the tool, and get back an array of strings.
The place where we can download the models: http://opennlp.sourceforge.net/models-1.5/
The pattern is as follows, but I did not see any real examples there.
package com.sillycat.resumeparse;
import java.io.IOException;
import java.io.InputStream;
import opennlp.tools.cmdline.parser.ParserTool;
import opennlp.tools.parser.Parse;
import opennlp.tools.parser.Parser;
import opennlp.tools.parser.ParserFactory;
import opennlp.tools.parser.ParserModel;
/**
 * Parser example: builds a {@code ParserModel} from an input stream, creates a
 * {@code Parser} from that model, and prints every parse returned by
 * {@code ParserTool.parseLine} for a hard-coded sample text.
 *
 * NOTE(review): this listing is incomplete as shown — the resource-loading
 * statement is cut off, the stream-closing try body is empty, and the closing
 * braces are missing, so it does not compile as-is.
 */
public class OpenNLPMain {
public static void main(String[] args) {
// NOTE(review): statement is truncated — the call that opens the model
// resource (and the model file name) is not visible here.
InputStream modelIn = OpenNLPMain.class.getClassLoader()
ParserModel model = null;
try {
model = new ParserModel(modelIn);
} catch (IOException e) {
// NOTE(review): catch body is empty as shown; if loading fails, model stays
// null and is still passed to ParserFactory.create below.
} finally {
if (modelIn != null) {
try {
// NOTE(review): try body is empty as shown; presumably modelIn.close()
// belongs here — confirm against the original listing.
} catch (IOException e) {
Parser parser = ParserFactory.create(model);
String sentence = "I am carl. I worked in US for about 3 years. Before that I was working in China for 8 years.";
// The last argument is 1; presumably the number of parses requested — confirm
// against the ParserTool API.
Parse topParses[] = ParserTool.parseLine(sentence, parser, 1);
// Print each returned parse prefixed with its index.
for (int i = 0 ; i< topParses.length;i++){
System.out.println(i + " " + topParses[i]);
A later dependency version is available, but I am using the version embedded in Tika, so mine is still version 1.5.3.
Tokenizer Example
package com.sillycat.resumeparse;
import java.io.IOException;
import java.io.InputStream;
import opennlp.tools.tokenize.Tokenizer;
import opennlp.tools.tokenize.TokenizerME;
import opennlp.tools.tokenize.TokenizerModel;
/**
 * Tokenizer example: builds a {@code TokenizerModel} from an input stream,
 * wraps it in a {@code TokenizerME}, tokenizes {@link #SAMPLE_STR}, and prints
 * each token prefixed with its index.
 *
 * NOTE(review): this listing is incomplete as shown — the resource-loading
 * statement is cut off, the stream-closing try body is empty, and the closing
 * braces are missing, so it does not compile as-is.
 */
public class OpenNLPTokenizerMain {
// Sample text fed to the tokenizer.
static final String SAMPLE_STR = "I am Carl. I am a software engineer. Totally I worked 12 years. About 9 years in China, 3 years in US.";
public static void main(String[] args) {
// NOTE(review): statement is truncated — the call that opens the model
// resource (and the model file name) is not visible here. It also goes through
// OpenNLPParserMain's class loader; that class is not defined in this listing.
InputStream modelIn = OpenNLPParserMain.class.getClassLoader()
TokenizerModel model = null;
try {
model = new TokenizerModel(modelIn);
} catch (IOException e) {
// NOTE(review): catch body is empty as shown; if loading fails, model stays
// null and is still passed to the TokenizerME constructor below.
} finally {
if (modelIn != null) {
try {
// NOTE(review): try body is empty as shown; presumably modelIn.close()
// belongs here — confirm against the original listing.
} catch (IOException e) {
Tokenizer tokenizer = new TokenizerME(model);
String tokens[] = tokenizer.tokenize(SAMPLE_STR);
// Print each token prefixed with its index.
for (int i = 0 ; i< tokens.length;i++){
System.out.println(i + " " + tokens[i]);
3. Some Useful NLP Tools and Models
package com.sillycat.resumeparse;
import java.io.IOException;
import java.io.InputStream;
import opennlp.tools.sentdetect.SentenceDetectorME;
import opennlp.tools.sentdetect.SentenceModel;
import opennlp.tools.util.Span;
/**
 * Sentence-detector example: builds a {@code SentenceModel} from an input
 * stream, runs {@code SentenceDetectorME.sentPosDetect} on {@link #SAMPLE_STR},
 * and prints each detected span's text together with a per-sentence
 * probability value.
 *
 * NOTE(review): this listing is incomplete as shown — two statements are cut
 * off mid-expression, the stream-closing try body is empty, and the closing
 * braces are missing, so it does not compile as-is.
 */
public class OpenNLPSentenceMain {
// Sample text fed to the sentence detector.
static final String SAMPLE_STR = "Carl is a Chinese. He worked in China for 9 years. Then he relocated to Austin, Texas, USA. And he spends 3 years there till now.";
public static void main(String[] args) {
// NOTE(review): statement is truncated — the call that opens the model
// resource (and the model file name) is not visible here. It also goes through
// OpenNLPParserMain's class loader; that class is not defined in this listing.
InputStream modelIn = OpenNLPParserMain.class.getClassLoader()
SentenceModel model = null;
try {
model = new SentenceModel(modelIn);
} catch (IOException e) {
// NOTE(review): catch body is empty as shown; if loading fails, model stays
// null and is still passed to the SentenceDetectorME constructor below.
} finally {
if (modelIn != null) {
try {
// NOTE(review): try body is empty as shown; presumably modelIn.close()
// belongs here — confirm against the original listing.
} catch (IOException e) {
SentenceDetectorME sentenceDetector = new SentenceDetectorME(model);
Span[] spans = sentenceDetector.sentPosDetect(SAMPLE_STR);
// NOTE(review): statement is truncated — the method call that supplies the
// probabilities is missing after "sentenceDetector".
double[] sentenceProbabilities = sentenceDetector
// For each span, cut the matching substring out of SAMPLE_STR and print it
// with the probability stored at the same index.
for(int i = 0;i<spans.length; i++){
int start = spans[i].getStart();
int end = spans[i].getEnd();
String value = SAMPLE_STR.substring( start, end );
System.out.println( i + " possibility: " + sentenceProbabilities[i] + " string:" + value);
package com.sillycat.resumeparse;
import java.io.IOException;
import java.io.InputStream;
import opennlp.tools.tokenize.TokenizerME;
import opennlp.tools.tokenize.TokenizerModel;
import opennlp.tools.util.Span;
/**
 * Tokenizer example (span variant): builds a {@code TokenizerModel} from an
 * input stream, calls {@code TokenizerME.tokenizePos} on {@link #SAMPLE_STR}
 * to get token spans, and prints each token's text together with the value
 * from {@code getTokenProbabilities()} at the same index.
 *
 * NOTE(review): this listing is incomplete as shown — the resource-loading
 * statement is cut off, the stream-closing try body is empty, and the closing
 * braces are missing, so it does not compile as-is. A class with this same
 * name and package also appears earlier in this document.
 */
public class OpenNLPTokenizerMain {
// Sample text fed to the tokenizer.
static final String SAMPLE_STR = "I am Carl. I am a software engineer. Totally I worked 12 years. About 9 years in China, 3 years in US.";
public static void main(String[] args) {
// NOTE(review): statement is truncated — the call that opens the model
// resource (and the model file name) is not visible here. It also goes through
// OpenNLPParserMain's class loader; that class is not defined in this listing.
InputStream modelIn = OpenNLPParserMain.class.getClassLoader()
TokenizerModel model = null;
try {
model = new TokenizerModel(modelIn);
} catch (IOException e) {
// NOTE(review): catch body is empty as shown; if loading fails, model stays
// null and is still passed to the TokenizerME constructor below.
} finally {
if (modelIn != null) {
try {
// NOTE(review): try body is empty as shown; presumably modelIn.close()
// belongs here — confirm against the original listing.
} catch (IOException e) {
TokenizerME tokenizer = new TokenizerME(model);
Span[] spans = tokenizer.tokenizePos(SAMPLE_STR);
// Read after tokenizePos; the loop below indexes this array in step with spans.
double[] tokenProbabilities = tokenizer.getTokenProbabilities();
// For each span, cut the matching substring out of SAMPLE_STR and print it
// with the probability stored at the same index.
for (int i = 0; i < spans.length; i++) {
int start = spans[i].getStart();
int end = spans[i].getEnd();
String value = SAMPLE_STR.substring(start, end);
System.out.println(i + " possibility: " + tokenProbabilities[i]
+ " string:" + value);
here is the list of the links
( ) [ ] { }
become, in parsed files: -LRB- -RRB- -LSB- -RSB- -LCB- -RCB-
(The acronyms stand for (Left|Right) (Round|Square|Curly) Bracket.)
Here is the code
package com.sillycat.resumeparse;
import java.io.IOException;
import java.io.InputStream;
import opennlp.tools.postag.POSModel;
import opennlp.tools.postag.POSTaggerME;
/**
 * POS-tagger example: builds a {@code POSModel} from an input stream, tags a
 * fixed array of tokens with {@code POSTaggerME.tag}, and prints each token
 * with the matching entry from {@code probs()} and its tag.
 *
 * NOTE(review): this listing is incomplete as shown — the resource-loading
 * statement is cut off, the stream-closing try body is empty, and the closing
 * braces are missing, so it does not compile as-is.
 */
public class OpenNLPPOSMain {
public static void main(String[] args) {
// Tokens to tag; these are passed to the tagger as one sequence.
String[] data = new String[]{"Carl","engineer","am","a","totally","worked"};
// NOTE(review): statement is truncated — the call that opens the model
// resource (and the model file name) is not visible here. It also goes through
// OpenNLPParserMain's class loader; that class is not defined in this listing.
InputStream modelIn = OpenNLPParserMain.class.getClassLoader()
POSModel model = null;
try {
model = new POSModel(modelIn);
} catch (IOException e) {
// NOTE(review): catch body is empty as shown; if loading fails, model stays
// null and is still passed to the POSTaggerME constructor below.
} finally {
if (modelIn != null) {
try {
// NOTE(review): try body is empty as shown; presumably modelIn.close()
// belongs here — confirm against the original listing.
} catch (IOException e) {
POSTaggerME posTagger = new POSTaggerME( model );
String[] tags = posTagger.tag( data );
// Read after tag(); the loop below indexes this array in step with tags.
double[] probs = posTagger.probs();
// Print "token probability tag" for every position.
for ( int i = 0; i < tags.length; i++ )
System.out.println(data[i] + " " + probs[i] + " " + tags[i] );
package com.sillycat.resumeparse;
import java.io.IOException;
import java.io.InputStream;
import opennlp.tools.chunker.ChunkerME;
import opennlp.tools.chunker.ChunkerModel;
import opennlp.tools.util.Span;
/**
 * Chunker example: builds a {@code ChunkerModel} from an input stream, passes
 * a fixed token array and a matching POS-tag array to
 * {@code ChunkerME.chunkAsSpans}, and prints one line per returned span with
 * the value from {@code probs()} at the same index.
 *
 * NOTE(review): this listing is incomplete as shown — the resource-loading
 * statement is cut off, the stream-closing try body is empty, the inner loop
 * body looks partial, and the closing braces are missing, so it does not
 * compile as-is.
 */
public class OpenNLPChunkMain {
public static void main(String[] args) {
// NOTE(review): statement is truncated — the call that opens the model
// resource (and the model file name) is not visible here. It also goes through
// OpenNLPParserMain's class loader; that class is not defined in this listing.
InputStream modelIn = OpenNLPParserMain.class.getClassLoader()
ChunkerModel model = null;
try {
model = new ChunkerModel(modelIn);
} catch (IOException e) {
// NOTE(review): catch body is empty as shown; if loading fails, model stays
// null and is still passed to the ChunkerME constructor below.
} finally {
if (modelIn != null) {
try {
// NOTE(review): try body is empty as shown; presumably modelIn.close()
// belongs here — confirm against the original listing.
} catch (IOException e) {
// Reference table of "token probability tag" rows; the tokens and tags match
// the data and tags2 arrays below (presumably captured from a POS-tagger run).
// I 0.9732879282256719 PRP
// am 0.964606681960317 VBP
// Carl 0.9816758912754017 NNP
// . 0.3823051156140692 .
// I 0.95524464076097 PRP
// am 0.9801383116579873 VBP
// a 0.9863774195781929 DT
// software 0.9071380751356256 NN
// engineer 0.9836540552245981 NN
// . 0.985789375461335 .
// Tokens of the two sample sentences, including the sentence-final periods.
String[] data = new String[] { "I", "am", "Carl", ".",
"I", "am", "a", "software","engineer", "." };
// One POS tag per entry of data, in the same order.
String[] tags2 = new String[] { "PRP", "VBP", "NNP", ".", "PRP", "VBP", "DT", "NN", "NN", "." };
ChunkerME chunker = new ChunkerME(model);
Span[] spans = chunker.chunkAsSpans(data, tags2);
// Read after chunkAsSpans; the loop below indexes this array by span index.
double[] probs = chunker.probs();
for (int i = 0; i < spans.length; i++) {
// The span bounds are used below as indices into data, end exclusive.
int start = spans[i].getStart();
int end = spans[i].getEnd();
StringBuilder buffer = new StringBuilder();
// NOTE(review): as shown, this loop appends only the separating space; the
// statement that appends data[j] itself appears to be missing — confirm.
for (int j = start; j < end; j++) {
// Skip the separator after the last token of the span.
if (j != (end - 1)) {
buffer.append(' ');
String value = buffer.toString();
System.out.println(probs[i] + " " + value);
The Result is amazing
0.9818474273481409 I
0.9839139471783958 am
0.9503687937291497 Carl
0.6471572589002946 I
0.6740306961591902 am
0.9328973760592183 a software engineer