TFIDF算法,java代码实现

TFIDF算法,java代码实现

TF-IDF算法全称为term frequency–inverse document frequency。TF就是term frequency的缩写,意为词频。IDF则是inverse document frequency的缩写,意为逆文档频率。
TFIDF算法,java代码实现_第1张图片

package com.aawant.nlp.featureExtraction.tfidf;

import java.io.BufferedReader;
import java.io.File;
import java.util.HashMap;
import java.util.Iterator;
import java.util.Map;
import java.util.Set;


public class TFIDF {
    /**
     * 
     * @param p_path1 文章路径,文件格式  
     * @param p_path2 文档路径
     * @return 词,TFIDF值
     * @throws Exception
     */
    public static Map getTFIDT(String p_path1, String p_path2)
            throws Exception {
        File file1 = new File(p_path1); 
        File file2 = new File(p_path2); 
        Map term_freq_map = new HashMap();// 词,该词在该文档中出现的频率
        Map doc_freq_map = new HashMap(); // 词,文档频率
        int doc_numbers_total = 0; // 总的文档数目
        Double doc_numbers_term = 0.0;// 包含该词条的文档数目
        //词频
        BufferedReader br_doc = null;
        br_doc = StringUtils.getBr(file1, "UTF-8");
        String line = "";
        while ((line = br_doc.readLine()) != null) {
            String[] temp_words = line.split(" ");
            for (String w : temp_words) {
                if (term_freq_map.containsKey(w)) {
                    Double value = term_freq_map.get(w) + 1.0;
                    term_freq_map.put(w, value);
                } else {
                    term_freq_map.put(w, 1.0);
                }
            }
        }
        br_doc.close();
        // 文档频率
        Set set = term_freq_map.keySet();
        Iterator iter = set.iterator();
        File[] files = file2.listFiles();
        doc_numbers_total = files.length;
        BufferedReader br_file = null;
        String temp2 = "";
        while (iter.hasNext()) {
            temp2 = (String) iter.next();
            String temp = "";
            for (File f : files) {
                StringBuilder sb = new StringBuilder();
                br_file = StringUtils.getBr(f, "utf-8");
                while ((temp = br_file.readLine()) != null) {
                    String words = temp;
                    sb.append(words);
                }
                String words_adoc = sb.toString();
                if (words_adoc.contains(temp2)) {
                    doc_numbers_term += 1.0;
                    doc_freq_map.put(temp2, doc_numbers_term);
                }
                br_file.close();
            }
            doc_numbers_term = 0.0;
        }
        /*
         * 计算TFIDF值
         * w = 词频*log(总的文档数目/文档频率)
         */
        Map re_map = new HashMap();
        String term = "";
        Set set_t = term_freq_map.keySet();
        Iterator iter_t = set_t.iterator();
        while(iter_t.hasNext()){
            term = (String) iter_t.next();
            Double A = term_freq_map.get(term);
            Double B = doc_freq_map.get(term);
            Double result = A*Math.log(doc_numbers_total/B)/Math.log(2.0);
            re_map.put(term, result);
        }   
        return re_map;
    }

}

你可能感兴趣的:(文字处理,java算法)