TF-IDF

参考资料:
http://www.ruanyifeng.com/blog/2013/03/tf-idf.html （讲解得很明了）
package com.data.text.tfidf;



import java.io.BufferedReader;

import java.io.File;

import java.io.FileReader;

import java.io.IOException;

import java.util.ArrayList;

import java.util.Collections;

import java.util.HashMap;

import java.util.List;

import java.util.Map;

import java.util.Map.Entry;



public class TF_IDF {

    /** Total number of documents in the corpus (read from the first line of the model file). */
    private double NUM_DOCS;

    /** term -> document frequency (number of corpus documents containing the term). */
    private Map<String, Integer> idf_map;

    /**
     * Loads the IDF model from a file.
     * Expected format: first line is the total document count; every subsequent
     * line is {@code term : documentFrequency}.
     *
     * @param fileName path to the model file
     */
    public TF_IDF(String fileName) {
        idf_map = new HashMap<String, Integer>();
        // try-with-resources closes the reader exactly once, even on error
        // (the original closed it twice and swallowed close() failures).
        // NOTE(review): FileReader uses the platform default charset — confirm
        // the model file encoding matches the runtime default.
        try (BufferedReader reader = new BufferedReader(new FileReader(fileName))) {
            // first line holds the total number of documents
            String tempString = reader.readLine();
            NUM_DOCS = (double) Integer.parseInt(tempString.trim());

            // read one "term : frequency" entry per line until EOF
            while ((tempString = reader.readLine()) != null) {
                String[] arr = tempString.split(" : ");
                if (arr.length < 2) {
                    continue; // skip malformed lines instead of crashing
                }
                idf_map.put(arr[0], Integer.parseInt(arr[1].trim()));
            }
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

    /**
     * Computes tf-idf weights for one document and returns its features
     * sorted by descending weight (via {@link Feature#compareTo}).
     *
     * @param tf_map term -> raw frequency within the document
     * @return features sorted by descending tf-idf weight
     */
    public List<Feature> cacu(Map<String, Integer> tf_map) {
        // total token count of the document
        int word_num_sum = 0;
        for (Integer count : tf_map.values()) {
            word_num_sum += count;
        }

        List<Feature> list_fea = new ArrayList<Feature>();
        for (Entry<String, Integer> entry : tf_map.entrySet()) {
            String word = entry.getKey();
            Integer num = entry.getValue();
            double tf = (double) num / word_num_sum;
            // Document frequency; the original called idf_map.get(word) directly
            // and threw NullPointerException for any term unseen in the corpus.
            Integer dfBoxed = idf_map.get(word);
            // Unseen terms are treated as df = 1 (rarest possible), giving them
            // the maximum idf instead of crashing.
            int df = (dfBoxed == null || dfBoxed == 0) ? 1 : dfBoxed;
            // idf = log(N/df + 1); the +1 smoothing keeps the weight positive
            // (same formula the original used for terms present in the map).
            double idf = Math.log(NUM_DOCS / df + 1);
            list_fea.add(new Feature(word, num, tf * idf));
        }

        // sort by descending weight
        Collections.sort(list_fea);

        return list_fea;
    }

    public static void main(String[] args) {
        // no CLI entry point implemented
    }
}





package com.data.text.tfidf;



import java.io.BufferedReader;

import java.io.File;

import java.io.FileReader;

import java.io.IOException;

import java.util.HashSet;

import java.util.Set;



public class StopWord {

    /**
     * Loads the stop-word set from {@code stopwords.txt} in the working directory.
     *
     * @return the set of stop words; empty if the file cannot be read
     */
    public static Set<String> GetStopWords() {
        String fileName = "stopwords.txt";
        return readwords(fileName);
    }

    /**
     * Reads a stop-word file, one word per line; each line is trimmed.
     * On I/O error the partially-read (possibly empty) set is returned.
     * NOTE(review): FileReader uses the platform default charset — confirm the
     * stop-word file encoding matches the runtime default.
     *
     * @param fileName path to the stop-word file
     * @return set of trimmed stop words
     */
    private static Set<String> readwords(String fileName) {
        Set<String> set = new HashSet<String>();
        // try-with-resources closes the reader exactly once (the original
        // closed it twice and swallowed close() failures in an empty catch)
        try (BufferedReader reader = new BufferedReader(new FileReader(fileName))) {
            String line;
            // read one word per line until EOF
            while ((line = reader.readLine()) != null) {
                set.add(line.trim());
            }
        } catch (IOException e) {
            e.printStackTrace();
        }
        return set;
    }
}





package com.data.text.tfidf;



/**

 * 特征词

 * @author root

 *

 */

public class Feature implements Comparable<Feature> {

    private String word;

    private Integer num;

    private double weight;



    public Feature(String word, Integer num, double weight) {

        this.word = word;

        this.num = num;

        this.weight = weight;

    }



    public String getWord() {

        return word;

    }



    public Integer getNum() {

        return num;

    }



    public double getWeight() {

        return weight;

    }



    @Override

    public int compareTo(Feature o) {

        if(this.getWeight() == o.getWeight()){

            return 0;

        }else if(this.getWeight() > o.getWeight()){

            return -1;

        }else{

            return 1;

        }

    }

    

    public String toString(){

        return this.word + " freq: " + num + " weight: " + weight;

    }

}

 

__author__ = 'dell'



import math

import re

from operator import itemgetter





class TfIdf:
    """Computes tf-idf keyword weights for documents against a corpus model.

    The corpus model maps each term to the number of documents containing it
    (document frequency). It can be loaded from a file or built incrementally
    with add_input_document().
    """

    def __init__(self, corpus_filename=None, stopword_filename=None, DEFAULT_IDF=1.5):
        """Optionally load a corpus model and/or a stop-word list.

        Corpus file format: first line is the document count; each subsequent
        line is "term:frequency". Stop-word file: one word per line.
        """
        self.num_docs = 0
        self.term_num_docs = {}   # term -> document frequency
        self.stopwords = []
        self.idf_default = DEFAULT_IDF  # idf for terms unseen in the corpus

        if corpus_filename:
            # 'with' closes the file even on parse errors (original leaked it)
            with open(corpus_filename, 'r') as corpus_file:
                # first line holds the number of documents
                self.num_docs = int(corpus_file.readline())
                # read term:frequency from each subsequent line in the file
                for line in corpus_file:
                    tokens = line.split(':')
                    term = tokens[0].strip()
                    frequency = int(tokens[1].strip())
                    self.term_num_docs[term] = frequency

        if stopword_filename:
            with open(stopword_filename) as stopword_file:
                self.stopwords = [line.strip() for line in stopword_file]

    def get_tokens(self, str):
        """Tokenize lowercased text; keeps <a>...</a> runs, tags, and word chars.

        (Parameter is named ``str``, shadowing the builtin, to preserve the
        original call signature.)
        """
        return re.findall(r"<a.*?/a>|<[^\>]*>|[\w'@#]+", str.lower())

    def add_input_document(self, input):
        """Add one document's text to the corpus document-frequency counts."""
        self.num_docs += 1
        # a term counts once per document, hence the set
        for word in set(self.get_tokens(input)):
            self.term_num_docs[word] = self.term_num_docs.get(word, 0) + 1

    def get_num_docs(self):
        """Return the number of documents in the corpus."""
        return self.num_docs

    def get_idf(self, term):
        """Smoothed inverse document frequency for a term.

        Stop words get 0; terms unseen in the corpus get the default idf.
        """
        if term in self.stopwords:
            return 0
        if term not in self.term_num_docs:
            return self.idf_default
        return math.log(float(1 + self.get_num_docs()) / (1 + self.term_num_docs[term]))

    def get_doc_keywords(self, curr_doc):
        """Return (term, tf-idf) pairs for a document, best keywords first."""
        tfidf = {}
        tokens = self.get_tokens(curr_doc)
        total = len(tokens)
        # single counting pass instead of tokens.count() per unique word (O(n^2))
        counts = {}
        for word in tokens:
            counts[word] = counts.get(word, 0) + 1
        for word, count in counts.items():
            # Original wrote float(count / total): under Python 2 the division
            # is integer division, so every tf was truncated to 0.0. Cast first.
            tf = float(count) / total
            tfidf[word] = tf * self.get_idf(word)
        return sorted(tfidf.items(), key=itemgetter(1), reverse=True)

 

你可能感兴趣的:(id)