在使用HanLP框架的过程中,发现其内置的CoreStopWordDictionary类只有删除、添加方法,没有动态重载的方法。而项目的需求恰恰是要能够动态加载停用词;用它内置的方法先全部删掉、再一个个添加,直观上感觉不够优美……所以准备重新写一个服务来做停用词的加载。
新建服务类:
CoreStopwordService
首先,HanLP的停用词是从其资源包中的stopword.txt读取并加载的,加载逻辑位于CoreStopWordDictionary的static静态块中:
// Static initializer quoted from HanLP's CoreStopWordDictionary: it populates `dictionary`
// and builds FILTER once, when the class is initialized.
static {
// Try the binary cache ("<path>.bin") first.
ByteArray byteArray = ByteArray.createByteArray(Config.CoreStopWordDictionaryPath + ".bin");
if (byteArray == null) {
// createByteArray returned null (presumably the cache is missing or unreadable):
// build the dictionary from the text file, then write the ".bin" cache for next time.
try {
dictionary = new StopWordDictionary(Config.CoreStopWordDictionaryPath);
DataOutputStream out = new DataOutputStream(IOUtil.newOutputStream(Config.CoreStopWordDictionaryPath + ".bin"));
dictionary.save(out);
out.close();
} catch (Exception var2) {
// Log the failure and abort class initialization. The original exception is only
// written to the log; it is not chained to the RuntimeException.
Predefine.logger.severe("载入停用词词典" + Config.CoreStopWordDictionaryPath + "失败" + TextUtility.exceptionToString(var2));
throw new RuntimeException("载入停用词词典" + Config.CoreStopWordDictionaryPath + "失败");
}
} else {
// Cache available: deserialize the dictionary from it.
dictionary = new StopWordDictionary();
dictionary.load(byteArray);
}
// FILTER decides whether a term is kept, keyed on the first character of its nature tag.
FILTER = new Filter() {
public boolean shouldInclude(Term term) {
// A term without a nature gets the placeholder "空", which falls into the default branch.
String nature = term.nature != null ? term.nature.toString() : "空";
char firstChar = nature.charAt(0);
switch(firstChar) {
// Natures starting with one of these letters are always dropped.
case 'b':
case 'c':
case 'e':
case 'm':
case 'o':
case 'p':
case 'q':
case 'r':
case 'u':
case 'w':
case 'y':
case 'z':
return false;
// Every other nature is kept unless the word itself is in the stop-word dictionary.
case 'd':
case 'f':
case 'g':
case 'h':
case 'i':
case 'j':
case 'k':
case 'l':
case 'n':
case 's':
case 't':
case 'v':
case 'x':
default:
return !CoreStopWordDictionary.contains(term.word);
}
}
};
}
将其改成在新服务的PostConstruct中加载:
/**
 * Loads the stop-word dictionary; runs once after construction and again from {@code reload()}.
 *
 * <p>The binary cache ({@code <path>.bin}) is preferred. When it is unavailable the dictionary
 * is built from the text file and the cache is rewritten. An empty text file is skipped, which
 * leaves the current dictionary untouched.
 *
 * @throws RuntimeException if the text file cannot be read or the cache cannot be written;
 *                          the underlying exception is attached as the cause
 */
@PostConstruct
public void load() {
    String binPath = Config.CoreStopWordDictionaryPath + ".bin";
    ByteArray byteArray = ByteArray.createByteArray(binPath);
    if (byteArray != null) {
        // Fill a local instance first so readers never observe a half-loaded dictionary.
        StopWordDictionary cached = new StopWordDictionary();
        cached.load(byteArray);
        dictionary = cached;
        return;
    }
    try {
        if (Files.size(Paths.get(Config.CoreStopWordDictionaryPath)) == 0) {
            return;
        }
        StopWordDictionary parsed = new StopWordDictionary(Config.CoreStopWordDictionaryPath);
        dictionary = parsed;
        // try-with-resources closes the cache file even when save() throws.
        try (DataOutputStream out = new DataOutputStream(IOUtil.newOutputStream(binPath))) {
            parsed.save(out);
        }
    } catch (Exception e) {
        Predefine.logger.severe("载入停用词词典" + Config.CoreStopWordDictionaryPath + "失败" + TextUtility.exceptionToString(e));
        throw new RuntimeException("载入停用词词典" + Config.CoreStopWordDictionaryPath + "失败", e);
    }
}
其他方法不变,再添加一个reload方法:
/**
 * Rebuilds the stop-word dictionary from the stop words stored for the effective project.
 *
 * <p>The words are written to the stop-word text file (one per line, UTF-8), the stale
 * {@code .bin} cache is removed, and {@code load()} is called to rebuild the dictionary.
 * If no words are found, or the files cannot be updated, the reload is abandoned and the
 * current dictionary stays in effect.
 */
@Transactional(readOnly = true)
public void reload() {
    log.info("开始重新加载停用词库...");
    // The repository stream is closed by try-with-resources once the text has been collected.
    try (Stream<Stopword> stream = stopwordRepository.findByProject_Id(customDictionaryService.getEffectiveProjectId())) {
        String text = stream.map(Stopword::getWord).collect(Collectors.joining("\n"));
        if (StringUtils.isEmpty(text)) {
            log.info("停用词库为空,停止加载");
            return;
        }
        // Encode explicitly: relying on the platform default charset can corrupt the words
        // on non-UTF-8 systems (HanLP is understood to read its dictionaries as UTF-8).
        Files.write(Paths.get(Config.CoreStopWordDictionaryPath),
                text.getBytes(java.nio.charset.StandardCharsets.UTF_8));
        // The cache may not exist yet (e.g. first reload); that is not an error.
        Files.deleteIfExists(Paths.get(Config.CoreStopWordDictionaryPath + ".bin"));
    } catch (IOException e) {
        // Reloading now would pick up stale data, so keep the current dictionary instead.
        log.error("重新加载停用词库失败,保留当前词库", e);
        return;
    }
    load();
    log.info("停用词库加载完成...");
}
stopwordRepository.findByProject_Id(customDictionaryService.getEffectiveProjectId())
这个是我用JPA加载数据的方法,这里可以替换成你项目中的数据查询方法。
这样就可以在项目中任何位置重载停用词了。
CoreStopwordService的完整代码如下:
/**
* @description: 自定义停用词服务
* @author: chenyang
* @create: 2018-12-14
**/
@Slf4j
@Service
public class CoreStopwordService {
static StopWordDictionary dictionary;
public static Filter FILTER;
public final StopwordRepository stopwordRepository;
private final CustomDictionaryService customDictionaryService;
public CoreStopwordService(StopwordRepository stopwordRepository, CustomDictionaryService customDictionaryService) {
this.stopwordRepository = stopwordRepository;
this.customDictionaryService = customDictionaryService;
}
static {
FILTER = new Filter() {
public boolean shouldInclude(Term term) {
String nature = term.nature != null ? term.nature.toString() : "空";
char firstChar = nature.charAt(0);
switch(firstChar) {
case 'b':
case 'c':
case 'e':
case 'm':
case 'o':
case 'p':
case 'q':
case 'r':
case 'u':
case 'w':
case 'y':
case 'z':
return false;
case 'd':
case 'f':
case 'g':
case 'h':
case 'i':
case 'j':
case 'k':
case 'l':
case 'n':
case 's':
case 't':
case 'v':
case 'x':
default:
return !CoreStopWordDictionary.contains(term.word);
}
}
};
}
@Transactional(readOnly = true)
public void reload() {
log.info("开始重新加载停用词库...");
try (Stream stream = stopwordRepository.findByProject_Id(customDictionaryService.getEffectiveProjectId())){
String text = stream.map(Stopword::getWord).collect(Collectors.joining("\n"));
if (StringUtils.isEmpty(text)){
log.info("停用词库为空,停止加载");
return ;
}
try {
Files.write(Paths.get(Config.CoreStopWordDictionaryPath),text.getBytes());
} catch (IOException e) {
e.printStackTrace();
}
}
try {
Files.delete(Paths.get(Config.CoreStopWordDictionaryPath + ".bin"));
} catch (IOException e) {
e.printStackTrace();
}
load();
log.info("停用词库加载完成...");
}
public static boolean contains(String key) {
return dictionary.contains(key);
}
public static boolean shouldInclude(Term term) {
return FILTER.shouldInclude(term);
}
public static boolean shouldRemove(Term term) {
return !shouldInclude(term);
}
public static boolean add(String stopWord) {
return dictionary.add(stopWord);
}
public static boolean remove(String stopWord) {
return dictionary.remove(stopWord);
}
public static void apply(List termList) {
ListIterator listIterator = termList.listIterator();
while(listIterator.hasNext()) {
if (shouldRemove((Term)listIterator.next())) {
listIterator.remove();
}
}
}
@PostConstruct
public void load() {
ByteArray byteArray = ByteArray.createByteArray(Config.CoreStopWordDictionaryPath + ".bin");
if (byteArray == null) {
try {
if (Files.size(Paths.get(Config.CoreStopWordDictionaryPath)) == 0){
return ;
}
dictionary = new StopWordDictionary(Config.CoreStopWordDictionaryPath);
DataOutputStream out = new DataOutputStream(IOUtil.newOutputStream(Config.CoreStopWordDictionaryPath + ".bin"));
dictionary.save(out);
out.close();
} catch (Exception var2) {
Predefine.logger.severe("载入停用词词典" + Config.CoreStopWordDictionaryPath + "失败" + TextUtility.exceptionToString(var2));
throw new RuntimeException("载入停用词词典" + Config.CoreStopWordDictionaryPath + "失败");
}
} else {
dictionary = new StopWordDictionary();
dictionary.load(byteArray);
}
}
}