1. Extract all document data under a folder and extract keywords (the framework here is based on JFinal)
/**
 * Recursively collect the names of all files under a path.
 *
 * @param path         root directory (or a single file) to scan
 * @param fileNameList accumulator for the file names found so far
 * @return the accumulator, now containing every file name under path
 */
public static ArrayList<String> readFiles1(String path, ArrayList<String> fileNameList) {
    File file = new File(path);
    if (file.isDirectory()) {
        File[] files = file.listFiles();
        if (files == null) {  // listFiles() returns null on I/O error or denied access
            return fileNameList;
        }
        for (int i = 0; i < files.length; i++) {
            if (files[i].isDirectory()) {
                // descend into subdirectories
                readFiles1(files[i].getPath(), fileNameList);
            } else {
                // getName() is a portable replacement for the original
                // substring-after-last-backslash trick
                fileNameList.add(files[i].getName());
            }
        }
    } else {
        fileNameList.add(file.getName());
    }
    return fileNameList;
}
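For reference, a minimal call site could look like this (the folder path is a placeholder, not one from the original project):

// Hypothetical usage: list every file name under a sample folder
ArrayList<String> names = readFiles1("D:\\docs\\", new ArrayList<String>());
for (String name : names) {
    System.out.println(name);
}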
/**
 * Read a Word document and return the text fragment that follows the
 * "补充侦查提纲" (supplementary investigation outline) marker phrase.
 *
 * @param path path to the .doc/.docx file
 * @return the extracted fragment, or "" if the marker is not found
 */
public static String readWord(String path) {
    String buffer = "";
    try {
        if (path.endsWith(".doc")) {
            // binary .doc: use the HWPF extractor
            InputStream is = new FileInputStream(new File(path));
            WordExtractor ex = new WordExtractor(is);
            buffer = ex.getText();
            ex.close();
            is.close();
        } else if (path.endsWith(".docx")) {  // original checked "docx" without the dot
            // OOXML .docx: use the XWPF extractor
            OPCPackage opcPackage = POIXMLDocument.openPackage(path);
            POIXMLTextExtractor extractor = new XWPFWordExtractor(opcPackage);
            buffer = extractor.getText();
            extractor.close();
        } else {
            System.out.println("This file is not a Word document!");
        }
    } catch (Exception e) {
        e.printStackTrace();
    }
    // Keep only the text starting at "关于" ("regarding") ...
    int index = buffer.indexOf("关于");
    String a = "";
    if (index != -1) {
        String words = buffer.substring(index);
        // ... and within that, the text after the marker phrase
        // "补充侦查提纲" (6 characters long, hence the +6 offset)
        int indexNew = words.indexOf("补充侦查提纲");
        if (indexNew != -1) {
            a = words.substring(indexNew + 6);
        }
    }
    return a;
}
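The extractor assumes the Apache POI classes below are on the classpath (a POI 3.x version, where POIXMLDocument.openPackage and the OPCPackage-based XWPFWordExtractor constructor still exist):

import java.io.File;
import java.io.FileInputStream;
import java.io.InputStream;
import org.apache.poi.POIXMLDocument;
import org.apache.poi.POIXMLTextExtractor;
import org.apache.poi.hwpf.extractor.WordExtractor;     // .doc (binary format)
import org.apache.poi.openxml4j.opc.OPCPackage;
import org.apache.poi.xwpf.extractor.XWPFWordExtractor; // .docx (OOXML format)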
/**
 * Sort a HashMap by value in descending order. Typed String→Float to match
 * the keyword-score map produced by the extraction step below.
 *
 * @param map the map to sort
 * @return a LinkedHashMap holding the entries in sorted order
 */
public HashMap<String, Float> hashMapSort(HashMap<String, Float> map) {
    // 1. Copy the entries into a LinkedList so they can be reordered
    List<Entry<String, Float>> keyList = new LinkedList<>(map.entrySet());
    // 2. Sort with a custom comparator (descending by value)
    Collections.sort(keyList, new Comparator<Entry<String, Float>>() {
        @Override
        public int compare(Entry<String, Float> o1, Entry<String, Float> o2) {
            return o2.getValue().compareTo(o1.getValue());
        }
    });
    // 3. Copy the sorted entries into a LinkedHashMap, which preserves
    //    insertion order (a plain HashMap would lose the ordering)
    HashMap<String, Float> result = new LinkedHashMap<>();
    for (Entry<String, Float> entry : keyList) {
        result.put(entry.getKey(), entry.getValue());
    }
    return result;
}
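A quick illustration with invented scores:

// Hypothetical usage: entries come back ordered by descending value
HashMap<String, Float> scores = new HashMap<String, Float>();
scores.put("alpha", 0.3f);
scores.put("beta", 0.9f);
scores.put("gamma", 0.5f);
System.out.println(hashMapSort(scores)); // {beta=0.9, gamma=0.5, alpha=0.3}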
/**
 * Read every Word document under the folder and insert its cleaned text
 * into the hanlp_word table.
 */
@RequiresPermissions("admin:hanlpTest:insertWord")
public void insertWord() {
    Db.update("truncate table hanlp_word");
    String filePath = "D:\\xxxxxxxxxxx\\";
    ArrayList<String> fileNameList = readFiles1(filePath, new ArrayList<String>());
    for (int i = 0; i < fileNameList.size(); i++) {
        String uuid = UUID.randomUUID().toString().replaceAll("-", "");
        String trainFile = readWord(filePath + fileNameList.get(i));
        // strip punctuation and line breaks before saving
        String readline = trainFile.replaceAll("[\\pP\\p{Punct}]", "").replaceAll("(\r\n|\r|\n|\n\r)", "").trim();
        if (StrKit.notBlank(fileNameList.get(i))) {
            // a fresh model instance per row, rather than reusing one across saves
            HanlpWord hw = new HanlpWord();
            hw.set("id", uuid).set("title", fileNameList.get(i)).set("text", readline).set("cjsj", new Date());
            hw.save();
        }
    }
    renderJson(new Result().setStatus(true).setInfo("Word documents imported successfully, " + fileNameList.size() + " rows in total."));
}
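HanlpWord itself is not shown in the excerpt. Under JFinal's ActiveRecord pattern it is typically just a thin Model subclass with a shared dao instance, and the getWordList() helper used in the next step would be a one-line query. A hedged sketch (the SQL and column names are inferred from the fields set above, not taken from the original):

import java.util.List;
import com.jfinal.plugin.activerecord.Db;
import com.jfinal.plugin.activerecord.Model;
import com.jfinal.plugin.activerecord.Record;

// Hypothetical model class, following the usual JFinal ActiveRecord pattern
public class HanlpWord extends Model<HanlpWord> {
    public static final HanlpWord dao = new HanlpWord();

    // inferred helper used by keywords(): one row of title/text per document
    public List<Record> getWordList() {
        return Db.find("select title, text from hanlp_word");
    }
}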
/**
 * Extract keywords from every stored document and insert them into the
 * hanlp_keywords table.
 */
@RequiresPermissions("")
public void keywords() {
    Db.update("truncate table hanlp_keywords");
    List<Record> keywordList = HanlpWord.dao.getWordList();
    for (Record keyword : keywordList) {
        // segment the document text, then score the terms with TextRank
        List<Term> termList = NLPTokenizer.segment(keyword.getStr("text"));
        Map<String, Float> keywordMap = new TextRankKeyword().getTermAndRank(termList);
        for (String key : keywordMap.keySet()) {
            if (StrKit.isBlank(key)) {
                continue;
            }
            Float value = keywordMap.get(key);
            String uuid = UUID.randomUUID().toString().replaceAll("-", "");
            // a fresh model instance per row, as in insertWord()
            HanlpKeywords hk = new HanlpKeywords();
            hk.set("id", uuid).set("keyword", key).set("times", value).set("title", keyword.getStr("title")).set("cjsj", new Date()).save();
        }
    }
    renderJson(new Result().setStatus(true).setInfo("Keywords extracted successfully."));
}
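The HanLP pipeline used above can be tried outside the controller. A minimal standalone sketch (the sample sentence is made up; the getTermAndRank(List<Term>) overload is the same one the controller calls):

import java.util.List;
import java.util.Map;
import com.hankcs.hanlp.seg.common.Term;
import com.hankcs.hanlp.summary.TextRankKeyword;
import com.hankcs.hanlp.tokenizer.NLPTokenizer;

public class KeywordDemo {
    public static void main(String[] args) {
        // segment first, then score each term with TextRank
        List<Term> terms = NLPTokenizer.segment("程序员是从事程序开发、维护的专业人员");
        Map<String, Float> ranks = new TextRankKeyword().getTermAndRank(terms);
        System.out.println(ranks);
    }
}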
/**
 * Add a filter keyword (stop word).
 */
@RequiresPermissions("")
public void update() {
    boolean flag = false;
    HanlpTest hanlpTest = new HanlpTest();
    String uuid = UUID.randomUUID().toString().replaceAll("-", "");
    // add to HanLP's in-memory stop-word dictionary; trim so that the
    // stored value and the dictionary entry match when removed later
    CoreStopWordDictionary.add(getPara("describe").trim());
    // persist it so it can be restored after a restart
    flag = hanlpTest.set("remkeyword", getPara("describe").trim()).set("id", uuid).set("cjsj", new Date()).save();
    Result result = new Result().setStatus(flag).setInfo("Added successfully.");
    renderJson(result);
}
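CoreStopWordDictionary.add only updates HanLP's in-memory stop-word list, which is presumably why the word is also persisted: after a restart the stored words must be re-added. A minimal reload sketch, assuming the HanlpTest model maps to a hanlp_test table with the remkeyword column used above (both names are assumptions):

// Hypothetical startup hook: restore the persisted stop words into HanLP
for (Record row : Db.find("select remkeyword from hanlp_test")) {
    CoreStopWordDictionary.add(row.getStr("remkeyword"));
}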
/**
 * Delete a filter keyword (stop word).
 */
@RequiresPermissions("")
@Before({Tx.class})
public void delete() {
    Result result = new Result();
    String id = getPara("id");
    HanlpTest hanlpTest = HanlpTest.dao.findById(id);
    if (null != hanlpTest) {
        // remove from HanLP's in-memory stop-word dictionary first
        CoreStopWordDictionary.remove(hanlpTest.getStr("remkeyword").trim());
        boolean flag = HanlpTest.dao.deleteById(hanlpTest.getStr("id"));
        if (flag) {
            saveLog(logType.DELETE, "Deleted data source [" + hanlpTest.getStr("remkeyword") + "] successfully");
            result.setStatus(true).setInfo("Deleted successfully.");
        }
    }
    renderJson(result);
}
/**
 * Get the keyword-frequency data for the ECharts display.
 */
@RequiresPermissions("admin:hanlpTest:show")
public void getData() {
    List<Record> echatsList = HanlpKeywords.dao.getEchatsData();
    HashMap<String, Integer> pplHashMap = new HashMap<String, Integer>();
    for (Record keyWordNum : echatsList) {
        pplHashMap.put(keyWordNum.getStr("keyword"), keyWordNum.getInt("num"));
    }
    if (echatsList.size() <= 0) {
        // placeholder entry so the chart is never empty
        pplHashMap.put("No data", 10000);
    }
    renderJson(pplHashMap);
}
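Since the handler renders the map directly, the front end receives a flat keyword-to-count JSON object, e.g. {"keywordA": 42, "keywordB": 17} (values invented), which plugs straight into an ECharts word-cloud or bar series.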
2. Keyword analysis of Sina Weibo data (done the same way as the document analysis above; the Weibo data can simply be crawled with webcontroller)