Java + HanLP semantic analysis: extracting keywords

1. Read every document under a folder and extract keywords (the project is built on JFinal)

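The snippets below are methods of a single JFinal controller and omit the class skeleton and imports. As a sketch, assuming Apache POI 3.x (where POIXMLDocument.openPackage still exists), HanLP's portable release, and Apache Shiro for the permission annotations, the imports would look roughly like:

    import java.io.*;
    import java.util.*;
    import java.util.Map.Entry;

    import org.apache.poi.POIXMLDocument;
    import org.apache.poi.POIXMLTextExtractor;
    import org.apache.poi.hwpf.extractor.WordExtractor;
    import org.apache.poi.openxml4j.opc.OPCPackage;
    import org.apache.poi.xwpf.extractor.XWPFWordExtractor;

    import org.apache.shiro.authz.annotation.RequiresPermissions;

    import com.hankcs.hanlp.dictionary.stopword.CoreStopWordDictionary;
    import com.hankcs.hanlp.seg.common.Term;
    import com.hankcs.hanlp.summary.TextRankKeyword;
    import com.hankcs.hanlp.tokenizer.NLPTokenizer;

    import com.jfinal.aop.Before;
    import com.jfinal.kit.StrKit;
    import com.jfinal.plugin.activerecord.Db;
    import com.jfinal.plugin.activerecord.Record;
    import com.jfinal.plugin.activerecord.tx.Tx;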
    /**
     * Recursively collect the names of all files under a path.
     *
     * @param path         root directory (or a single file)
     * @param fileNameList accumulator for the collected file names
     * @return the accumulated list of file names
     */
    public static ArrayList<String> readFiles1(String path, ArrayList<String> fileNameList) {
        File file = new File(path);
        if (file.isDirectory()) {
            File[] files = file.listFiles();
            if (files == null) { // listFiles() returns null on I/O error
                return fileNameList;
            }
            for (File f : files) {
                if (f.isDirectory()) {
                    readFiles1(f.getPath(), fileNameList);
                } else {
                    // getName() is portable; the original substring on "\\" only worked on Windows
                    fileNameList.add(f.getName());
                }
            }
        } else {
            fileNameList.add(file.getName());
        }
        return fileNameList;
    }
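Note that readFiles1 returns bare file names, so files found in subdirectories cannot later be joined back onto the root path. A minimal alternative that collects full paths instead, using standard java.nio (a sketch, not part of the original project):

    // Sketch: collect full paths instead of bare names (Java 7+ NIO).
    // Requires: import java.io.IOException; import java.nio.file.*;
    //           import java.nio.file.attribute.BasicFileAttributes;
    public static List<String> listFilePaths(String root) throws IOException {
        final List<String> paths = new ArrayList<>();
        Files.walkFileTree(Paths.get(root), new SimpleFileVisitor<Path>() {
            @Override
            public FileVisitResult visitFile(Path file, BasicFileAttributes attrs) {
                paths.add(file.toString()); // full path, safe for nested directories
                return FileVisitResult.CONTINUE;
            }
        });
        return paths;
    }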
    /**
     * Read the text of a Word document and return only the body after the
     * "补充侦查提纲" heading, which is the part this project cares about.
     *
     * @param path path to a .doc or .docx file
     * @return the extracted text, or an empty string if the markers are absent
     */
    public static String readWord(String path) {
        String buffer = "";
        try {
            if (path.endsWith(".doc")) {
                InputStream is = new FileInputStream(new File(path));
                WordExtractor ex = new WordExtractor(is);
                buffer = ex.getText();
                ex.close();
            } else if (path.endsWith(".docx")) {
                OPCPackage opcPackage = POIXMLDocument.openPackage(path);
                POIXMLTextExtractor extractor = new XWPFWordExtractor(opcPackage);
                buffer = extractor.getText();
                extractor.close();
            } else {
                System.out.println("Not a Word file: " + path);
            }
        } catch (Exception e) {
            e.printStackTrace();
        }
        // Keep only the text after the "补充侦查提纲" heading that follows "关于";
        // "+ 6" skips the six characters of the heading itself.
        int index = buffer.indexOf("关于");
        String a = "";
        if (index != -1) {
            String words = buffer.substring(index);
            int indexNew = words.indexOf("补充侦查提纲");
            if (indexNew != -1) {
                a = words.substring(indexNew + 6);
            }
        }
        return a;
    }
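A quick illustration of the marker logic with a hypothetical sample string (not from the project's data):

    String buffer = "关于张某案补充侦查提纲一、查明资金流向";          // hypothetical sample
    String words = buffer.substring(buffer.indexOf("关于"));           // "关于张某案补充侦查提纲一、查明资金流向"
    String body = words.substring(words.indexOf("补充侦查提纲") + 6);  // "一、查明资金流向"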
    /**
     * Sort a HashMap by value in descending order.
     *
     * @param map the map to sort
     * @return a LinkedHashMap holding the entries in sorted order
     */
    public HashMap<String, Float> hashMapSort(HashMap<String, Float> map) {
        // 1. Copy the entries into a LinkedList so they can be reordered
        List<Entry<String, Float>> keyList = new LinkedList<>(map.entrySet());
        // 2. Sort descending by value (comparing o2 to o1 reverses the natural order)
        Collections.sort(keyList, new Comparator<Entry<String, Float>>() {
            @Override
            public int compare(Entry<String, Float> o1, Entry<String, Float> o2) {
                return o2.getValue().compareTo(o1.getValue());
            }
        });
        // 3. Put the sorted entries into a LinkedHashMap, which preserves insertion order
        HashMap<String, Float> result = new LinkedHashMap<>();
        for (Entry<String, Float> entry : keyList) {
            result.put(entry.getKey(), entry.getValue());
        }
        return result;
    }
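On Java 8+, the same descending sort-by-value can be written more compactly with streams (a sketch; the behavior matches hashMapSort above):

    // Sketch: Java 8 stream version of sort-by-value, descending
    public static LinkedHashMap<String, Float> sortByValueDesc(Map<String, Float> map) {
        LinkedHashMap<String, Float> result = new LinkedHashMap<>();
        map.entrySet().stream()
           .sorted(Map.Entry.<String, Float>comparingByValue(Comparator.reverseOrder()))
           .forEachOrdered(e -> result.put(e.getKey(), e.getValue()));
        return result;
    }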
    /**
     * Import all Word documents under a folder into the hanlp_word table,
     * then render a JSON Result.
     */
    @RequiresPermissions("admin:hanlpTest:insertWord")
    public void insertWord() {
        HanlpWord hw = new HanlpWord();
        Db.update("truncate table hanlp_word");
        String filePath = "D:\\xxxxxxxxxxx\\";
        ArrayList<String> fileNameList = readFiles1(filePath, new ArrayList<String>());
        for (int i = 0; i < fileNameList.size(); i++) {
            String uuid = UUID.randomUUID().toString().replaceAll("-", "");
            // Only bare file names were collected, so files in subdirectories
            // will not resolve against filePath here.
            String trainFile = readWord(filePath + fileNameList.get(i));
            // Strip punctuation and line breaks before storing
            String readline = trainFile.replaceAll("[\\pP\\p{Punct}]", "").replaceAll("(\r\n|\r|\n|\n\r)", "").trim();
            if (StrKit.notBlank(fileNameList.get(i))) {
                hw.set("id", uuid).set("title", fileNameList.get(i)).set("text", readline).set("cjsj", new Date());
                hw.save();
            }
        }
        renderJson(new Result().setStatus(true).setInfo("Word documents imported: " + fileNameList.size() + " in total."));
    }
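Saving row by row issues one INSERT per document. For larger folders, JFinal's Db.batchSave can group the inserts; a sketch, assuming the same model and helpers shown above:

    // Sketch: collect the models first, then insert them in batches of 500
    List<HanlpWord> batch = new ArrayList<>();
    for (String name : fileNameList) {
        if (StrKit.isBlank(name)) continue;
        HanlpWord w = new HanlpWord();
        w.set("id", UUID.randomUUID().toString().replaceAll("-", ""))
         .set("title", name)
         .set("text", readWord(filePath + name))   // punctuation stripping omitted for brevity
         .set("cjsj", new Date());
        batch.add(w);
    }
    Db.batchSave(batch, 500);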
    /**
     * Extract keywords from every stored document and save them
     * into the hanlp_keywords table.
     */
    @RequiresPermissions("")
    public void keywords() {
        HanlpKeywords hk = new HanlpKeywords();
        Db.update("truncate table hanlp_keywords");
        List<Record> keywordList = HanlpWord.dao.getWordList();
        for (Record keyword : keywordList) {
            // Segment the text, then score the terms with TextRank
            List<Term> termList = NLPTokenizer.segment(keyword.getStr("text"));
            Map<String, Float> keywordMap = new TextRankKeyword().getTermAndRank(termList);
            for (String key : keywordMap.keySet()) {
                if (StrKit.isBlank(key)) {
                    continue;
                }
                Float value = keywordMap.get(key);
                String uuid = UUID.randomUUID().toString().replaceAll("-", "");
                hk.set("id", uuid).set("keyword", key).set("times", value).set("title", keyword.getStr("title")).set("cjsj", new Date()).save();
            }
        }
        renderJson(new Result().setStatus(true).setInfo("Keyword extraction finished."));
    }
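When only the top-N keywords are needed and the TextRank scores can be discarded, HanLP also offers a one-call convenience method (text is a hypothetical variable holding the document body):

    // Top 10 keywords of one document, without scores
    List<String> top = HanLP.extractKeyword(text, 10);   // import com.hankcs.hanlp.HanLP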
    /**
     * Add a filter keyword (stop word) so it is excluded from future extractions.
     */
    @RequiresPermissions("")
    public void update() {
        boolean flag = false;
        HanlpTest hanlpTest = new HanlpTest();
        String uuid = UUID.randomUUID().toString().replaceAll("-", "");
        // Add to HanLP's in-memory stop-word dictionary and persist to our own table
        CoreStopWordDictionary.add(getPara("describe"));
        flag = hanlpTest.set("remkeyword", getPara("describe").trim()).set("id", uuid).set("cjsj", new Date()).save();
        Result result = new Result().setStatus(flag).setInfo("Stop word added.");
        renderJson(result);
    }
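Note that CoreStopWordDictionary.add only updates HanLP's in-memory stop-word list; after a restart the additions are gone, which is presumably why each word is also persisted to the hanlp_test table. A sketch of re-applying them at application startup:

    // Sketch: re-apply persisted stop words when the application starts
    public static void reloadStopWords() {
        for (Record r : Db.find("select remkeyword from hanlp_test")) {
            CoreStopWordDictionary.add(r.getStr("remkeyword"));
        }
    }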
    /**
     * Remove a filter keyword (stop word).
     */
    @RequiresPermissions("")
    @Before({Tx.class})
    public void delete() {
        Result result = new Result();
        String id = getPara("id");
        HanlpTest hanlpTest = HanlpTest.dao.findById(id);
        if (null != hanlpTest) {
            // Remove from HanLP's in-memory stop-word dictionary as well
            CoreStopWordDictionary.remove(hanlpTest.getStr("remkeyword").trim());
            boolean flag = HanlpTest.dao.deleteById(hanlpTest.getStr("id"));
            if (flag) {
                saveLog(logType.DELETE, "Deleted filter keyword [" + hanlpTest.getStr("remkeyword") + "]");
                result.setStatus(true).setInfo("Deleted.");
            }
        }
        renderJson(result);
    }
    /**
     * Return keyword/frequency data for the ECharts front end.
     */
    @RequiresPermissions("admin:hanlpTest:show")
    public void getData() {
        List<Record> echatsList = HanlpKeywords.dao.getEchatsData();
        HashMap<String, Integer> pplHashMap = new HashMap<>();
        for (Record keyWordNum : echatsList) {
            pplHashMap.put(keyWordNum.getStr("keyword"), keyWordNum.getInt("num"));
        }
        if (echatsList.size() <= 0) {
            pplHashMap.put("No data", 10000); // placeholder entry when nothing has been extracted yet
        }
        renderJson(pplHashMap);
    }
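ECharts word-cloud and bar series typically consume a list of {name, value} objects rather than a flat map; if the front end expects that shape, the map can be converted before rendering (a sketch):

    // Sketch: convert the keyword map to ECharts-style {name, value} items
    List<Map<String, Object>> series = new ArrayList<>();
    for (Map.Entry<String, Integer> e : pplHashMap.entrySet()) {
        Map<String, Object> item = new HashMap<>();
        item.put("name", e.getKey());
        item.put("value", e.getValue());
        series.add(item);
    }
    renderJson(series);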
2. Keyword analysis of Sina Weibo data works the same way as the document analysis above (the Weibo data itself can be crawled with webcontroller).
