哈工大 LTP 的 Python 版本教程很多,这里用 Java 版本(ltp4j)实现一下。网上很多方法都需要自己编译,我这边已经编译好了。
model下载:
链接: https://pan.baidu.com/s/1HDaZpsrPHDcu8P15ho41VQ 提取码: dw9x
ltp编译后的文件:
https://download.csdn.net/download/qq_16613311/12489534
如果不想花积分下载,也可以参考下面这篇文章自行编译:
http://codepub.cn/2015/05/07/Compile-the-Language-Technology-Platform(C++)-and-LTP4J(Java)source-code/
下边是我写的相关类:
package com.dbapp.database.scanning.util;
import edu.hit.ir.ltp4j.NER;
import edu.hit.ir.ltp4j.Postagger;
import edu.hit.ir.ltp4j.Segmentor;
import lombok.extern.slf4j.Slf4j;
import java.io.File;
import java.math.BigDecimal;
import java.math.RoundingMode;
import java.util.ArrayList;
import java.util.List;
/**
* @ClassName: ltp4jUtil
* 哈工大ltp机器学习引入
* @Author: yongtao.ding on 2020/5/25 16:41
*/
@Slf4j
public class Ltp4jUtil {
//中文地名
public static String ADDRESS_C="ns";
//地名
public static String ADDRESS_L="nl";
//人名
public static String PERSONNAME_C="nh";
//机构团体名
public static String GROUPS_G="ni";
//公检法
public static String GOVERNMENT="j";
/**
* 判断一句话中是否存在地名,先使用segmentor将内容分词,使用postagger判断是否为地名
* @param postagger
* @param segmentor
* @param content 内容
* @return
*/
public static Boolean isAddress(Postagger postagger, Segmentor segmentor,String content){
List words = new ArrayList();
segmentor.segment(content, words);
List postags = new ArrayList();
int size = postagger.postag(words,postags);
boolean isAddress=false;
for (int i = 0; i < size; i++) {
if (postags.get(i).equals(ADDRESS_C)||postags.get(i).equals(ADDRESS_L)){
isAddress=true;
break;
}
}
return isAddress;
}
/**
* 判断一句话中是否存在企业团体,先使用segmentor将内容分词,使用postagger判断是否为企业团体
* @param postagger
* @param segmentor
* @param content 内容
* @return
*/
public static Boolean isCorporateGroups(Postagger postagger, Segmentor segmentor,NER ner,String content){
Boolean isGroups=false;
List words = new ArrayList();
List tags = new ArrayList();
List ners = new ArrayList();
segmentor.segment(content, words);
List postags = new ArrayList();
int size = postagger.postag(words,postags);
for (int i = 0; i < size; i++) {
tags.add(postags.get(i));
}
ner.recognize(words, tags, ners);
for (int i = 0; i < words.size(); i++) {
System.out.println(words.get(i)+"/"+ners.get(i));
}
if (ners.contains("E-Ni")){
isGroups=true;
return isGroups;
}
return isGroups;
}
/**
* 判断一句话中是否存在公检法,先使用segmentor将内容分词,使用postagger判断是否为公检法
* @param postagger
* @param segmentor
* @param content 内容
* @return
*/
public static Boolean isAbbreviation(Postagger postagger, Segmentor segmentor,String content){
// List words = new ArrayList();
// segmentor.segment(content, words);
List list = new ArrayList<>();
list.add(content);
List postags = new ArrayList();
int size = postagger.postag(list,postags);
boolean isGovernment=false;
for (int i = 0; i < size; i++) {
if (postags.get(i).equals(GOVERNMENT)){
isGovernment=true;
break;
}
}
return isGovernment;
}
/**
* 判断一句话中是否存在人名,先使用segmentor将内容分词,使用postagger判断是否为人名
* @param postagger
* @param segmentor
* @param content 内容
* @return
*/
public static Boolean isPersonName(Postagger postagger, Segmentor segmentor,String content){
List words = new ArrayList();
segmentor.segment(content, words);
List postags = new ArrayList();
int size = postagger.postag(words,postags);
boolean isPersonName=false;
for (int i = 0; i < size; i++) {
if (postags.get(i).equals(PERSONNAME_C)){
isPersonName=true;
break;
}
}
return isPersonName;
}
/**
* 识别地名,返回地名所占比例
* @param words 字段内容
* @return
*/
public static double identificationAddress( Postagger postagger,Segmentor segmentor,List words){
int n=0;
for (int i = 0; i < words.size(); i++) {
String s = words.get(i);
Boolean address = isAddress(postagger, segmentor, s);
if (address){
n+=1;
}
}
double proportion = new BigDecimal((float)n/words.size()).setScale(2, BigDecimal.ROUND_HALF_UP).doubleValue();
return proportion;
}
/**
* 识别人名,返回人名所占比例
* @param words 字段内容
* @return
*/
public static double identificationPersonName( Postagger postagger,Segmentor segmentor,List words){
int n=0;
for (int i = 0; i < words.size(); i++) {
String s = words.get(i);
Boolean address = isPersonName(postagger, segmentor, s);
if (address){
n+=1;
}
}
double proportion = new BigDecimal((float)n/words.size()).setScale(2, BigDecimal.ROUND_HALF_UP).doubleValue();
return proportion;
}
/**
* 识别企业团体,返回企业团体所占比例
* @param words 字段内容
* @return
*/
public static double identificationGroups( Postagger postagger,Segmentor segmentor,NER ner,List words){
int n=0;
for (int i = 0; i < words.size(); i++) {
String s = words.get(i);
Boolean groups = isCorporateGroups(postagger, segmentor, ner,s);
if (groups){
n+=1;
}
}
double proportion = new BigDecimal((float)n/words.size()).setScale(2, BigDecimal.ROUND_HALF_UP).doubleValue();
return proportion;
}
/**
* 识别公检法,返回公检法所占比例
* @param words 字段内容
* @return
*/
public static double identificationGovernment( Postagger postagger,Segmentor segmentor,List words){
int n=0;
for (int i = 0; i < words.size(); i++) {
String s = words.get(i);
Boolean groups = isAbbreviation(postagger, segmentor, s);
if (groups){
n+=1;
}
}
double proportion = new BigDecimal((float)n/words.size()).setScale(2, BigDecimal.ROUND_HALF_UP).doubleValue();
return proportion;
}
public static void main(String[] args) {
/*System.load("D:\\test\\ltp\\ltp4j-0.1.0-SNAPSHOT.dll");
Postagger postagger = new Postagger();
Segmentor segmentor = new Segmentor();
if (postagger.create("D:\\test\\ltp\\pos.model") < 0) {
log.error("pos.model加载失败");
}
if (segmentor.create("D:\\test\\ltp\\cws.model") < 0) {
log.error("load cws.model failed");
}
List words = new ArrayList();
words.add("上海澜海实业有限公司");
words.add("上海丰临进出口有限公司");
words.add("上海宝钢国际经济贸易有限公司");
words.add("上海诚齐机械制造有限公司");
double identificationName = Ltp4jUtil.identificationPersonName(postagger,segmentor,words);
double identificationAddress = Ltp4jUtil.identificationAddress(postagger,segmentor,words);
double identificationGroups = Ltp4jUtil.identificationGroups(postagger,segmentor,words);
double identificationGovernment = Ltp4jUtil.identificationGovernment(postagger,segmentor,words);
postagger.release();
segmentor.release();
System.out.println("地点:"+identificationAddress);
System.out.println("人名:"+identificationName);
System.out.println("企业:"+identificationGroups);
System.out.println("公检法:"+identificationGovernment);
*/
String content="上海丰临进出口有限公司";
LoadLibsUtil.addDirToPath("libs");
NER ner = new NER();
Postagger postagger = new Postagger();
Segmentor segmentor = new Segmentor();
if (ner.create("D:\\test\\ltp\\ner.model") < 0) {
log.error("pos.model加载失败");
}
if (postagger.create("D:\\test\\ltp\\pos.model") < 0) {
log.error("pos.model加载失败");
}
if (segmentor.create("D:\\test\\ltp\\cws.model") < 0) {
log.error("load cws.model failed");
}
Boolean s=isCorporateGroups(postagger,segmentor,ner,content);
System.out.println(s);
postagger.release();
segmentor.release();
ner.release();
}
}
package com.dbapp.database.scanning.util;
import java.lang.reflect.Field;
/**
 * Adds a directory to the JVM's native-library search path at runtime, so that a
 * later {@code System.loadLibrary} call can find libraries placed there.
 *
 * @author yongtao.ding on 2020/5/26 9:09
 */
public class LoadLibsUtil {

    private static final java.util.logging.Logger LOG =
            java.util.logging.Logger.getLogger(LoadLibsUtil.class.getName());

    /**
     * Appends {@code s} to the private static {@code sys_paths} array of
     * {@link ClassLoader} through reflection.
     *
     * <p>This is best effort: it relies on a JDK-internal field, so when that field is
     * missing or not accessible the call logs a warning and changes nothing. A
     * {@code null} or empty argument, or a directory that is already listed, is ignored.
     *
     * @param s directory to add
     */
    public static void addDirToPath(String s) {
        if (s == null || s.isEmpty()) {
            LOG.warning("Ignoring null or empty native library directory");
            return;
        }
        try {
            // Look up the private field that holds the search path.
            Field field = ClassLoader.class.getDeclaredField("sys_paths");
            field.setAccessible(true);
            String[] current = (String[]) field.get(null);
            if (current == null) {
                // Nothing to extend; leave the field untouched rather than install a one-entry path.
                LOG.warning("ClassLoader.sys_paths is not initialized; cannot add " + s);
                return;
            }
            for (String existing : current) {
                if (s.equals(existing)) {
                    // Already present: repeated calls must not grow the array.
                    return;
                }
            }
            // Copy the old entries into an array one slot longer and put the new directory last.
            String[] extended = new String[current.length + 1];
            System.arraycopy(current, 0, extended, 0, current.length);
            extended[current.length] = s;
            field.set(null, extended);
        } catch (ReflectiveOperationException | RuntimeException e) {
            LOG.log(java.util.logging.Level.WARNING,
                    "Could not add " + s + " to the native library path via ClassLoader.sys_paths;"
                            + " consider starting the JVM with -Djava.library.path instead", e);
        }
    }
}
关于加载dll:
代码中
LoadLibsUtil.addDirToPath("libs");