CODE:
#!/usr/bin/python
# -*- coding: utf-8 -*-
'''
Created on 2014-9-8

@author: guaguastd
@name: tf_idf_sample.py

Demonstration script: prints a small sample corpus, then for every query
term prints the per-document TF value, the corpus-wide IDF value and the
per-document TF-IDF value, and finally prints the cumulative TF-IDF score
of each document for the whole query.

The tf, idf and tf_idf helpers are imported from the project-local tfIdf
module; their formulas are not defined in this file.
'''
# The print function produces the same output as the original print
# statements on Python 2 and keeps this file's syntax valid on Python 3.
# NOTE(review): tfIdf is not visible from here -- confirm it is Python 3
# compatible before running this script under Python 3.
from __future__ import print_function

from tfIdf import tf, tf_idf, idf

# Enter in a query term from the corpus variable.
# Terms are lower-cased before being handed to the scoring helpers.
QUERY_TERMS = ['mr.', 'green']

# Sample corpus: document id -> document text.
# Adjacent string literals are concatenated by the parser, so the text no
# longer depends on how the continuation lines are indented in the source.
corpus = {
    'a': ('Mr. Green killed Colonel Mustard in the study with the '
          'candlestick. Mr. Green is not a very nice fellow.'),
    'b': 'Professor Plum has a green plant in his study.',
    'c': ("Miss Scarlett watered Professor Plum's green plant while he was "
          'away from his office last week.'),
}

# Show the corpus in a stable (sorted by id) order.
for (doc, text) in sorted(corpus.items()):
    print(doc, ':', text)
print()

# Build the list of document texts once; every idf / tf_idf call below
# receives this same collection.
documents = list(corpus.values())

# Score queries by calculating cumulative tf_idf score for each term in query.
# The accumulator is derived from the corpus keys so that adding or renaming
# a document cannot leave the two out of sync.
query_scores = dict.fromkeys(corpus, 0)

for term in [t.lower() for t in QUERY_TERMS]:
    # Term frequency of this term in each document.
    for doc in sorted(corpus):
        print('TF(%s): %s' % (doc, term), tf(term, corpus[doc]))

    # Inverse document frequency of this term over the whole corpus.
    print('IDF: %s' % (term, ), idf(term, documents))
    print()

    # Per-document TF-IDF, accumulated into the overall query score.
    for doc in sorted(corpus):
        score = tf_idf(term, corpus[doc], documents)
        print('TF-IDF(%s): %s' % (doc, term), score)
        query_scores[doc] += score
    print()

print("Overall TF-IDF scores for query '%s'" % (' '.join(QUERY_TERMS), ))
for (doc, score) in sorted(query_scores.items()):
    print(doc, score)
a : Mr. Green killed Colonel Mustard in the study with the candlestick. Mr. Green is not a very nice fellow. b : Professor Plum has a green plant in his study. c : Miss Scarlett watered Professor Plum's green plant while he was away from his office last week. TF(a): mr. 0.105263157895 TF(b): mr. 0.0 TF(c): mr. 0.0 IDF: mr. 2.09861228867 TF-IDF(a): mr. 0.220906556702 TF-IDF(b): mr. 0.0 TF-IDF(c): mr. 0.0 TF(a): green 0.105263157895 TF(b): green 0.111111111111 TF(c): green 0.0625 IDF: green 1.0 TF-IDF(a): green 0.105263157895 TF-IDF(b): green 0.111111111111 TF-IDF(c): green 0.0625 Overall TF-IDF scores for query 'mr. green' a 0.326169714597 b 0.111111111111 c 0.0625