TF-IDF 值是由 TF（词频）和 IDF（逆文档频率）两部分相乘得到的。
# Demo: fit a TF-IDF model on a tiny 4-document corpus and inspect
# the learned vocabulary, the weighted document-term matrix, and its shape.
from sklearn.feature_extraction.text import TfidfVectorizer
corpus = [
'This is the first document.',
'This document is the second document.',
'And this is the third one.',
'Is this the first document?',
]
vectorizer = TfidfVectorizer()
# fit_transform learns the vocabulary and returns a sparse (n_docs, n_terms) matrix.
X = vectorizer.fit_transform(corpus)
# get_feature_names() was removed in scikit-learn 1.2; use get_feature_names_out().
# Wrapped in list() so the printed output stays a plain Python list.
print(list(vectorizer.get_feature_names_out()))
print(X.toarray())
print(X.shape)
结果
['and', 'document', 'first', 'is', 'one', 'second', 'the', 'third', 'this']
[[0. 0.46979139 0.58028582 0.38408524 0. 0.
0.38408524 0. 0.38408524]
[0. 0.6876236 0. 0.28108867 0. 0.53864762
0.28108867 0. 0.28108867]
[0.51184851 0. 0. 0.26710379 0.51184851 0.
0.26710379 0.51184851 0.26710379]
[0. 0.46979139 0.58028582 0.38408524 0. 0.
0.38408524 0. 0.38408524]]
(4, 9)
# Demo: word n-grams. ngram_range=(1,5) makes every contiguous word sequence
# of length 1..5 a feature, greatly enlarging the vocabulary.
from sklearn.feature_extraction.text import TfidfVectorizer
corpus = [
'This is the first document.',
'This document is the second document.',
'And this is the third one.',
'Is this the first document?',
]
vectorizer = TfidfVectorizer(ngram_range=(1, 5))
X = vectorizer.fit_transform(corpus)
# get_feature_names() was removed in scikit-learn 1.2; use get_feature_names_out().
# Wrapped in list() so the printed output stays a plain Python list.
print(list(vectorizer.get_feature_names_out()))
结果
['and', 'and this', 'and this is', 'and this is the', 'and this is the third', 'document', 'document is', 'document is the', 'document is the second', 'document is the second document', 'first', 'first document', 'is', 'is the', 'is the first', 'is the first document', 'is the second', 'is the second document', 'is the third', 'is the third one', 'is this', 'is this the', 'is this the first', 'is this the first document', 'one', 'second', 'second document', 'the', 'the first', 'the first document', 'the second', 'the second document', 'the third', 'the third one', 'third', 'third one', 'this', 'this document', 'this document is', 'this document is the', 'this document is the second', 'this is', 'this is the', 'this is the first', 'this is the first document', 'this is the third', 'this is the third one', 'this the', 'this the first', 'this the first document']
# Demo: character analyzer. analyzer='char_wb' builds features from characters
# inside word boundaries; with the default ngram_range=(1,1) the vocabulary
# is simply the set of individual characters.
from sklearn.feature_extraction.text import TfidfVectorizer
corpus = [
'This is the first document.',
'This document is the second document.',
'And this is the third one.',
'Is this the first document?',
]
vectorizer = TfidfVectorizer(analyzer='char_wb')
X = vectorizer.fit_transform(corpus)
# get_feature_names() was removed in scikit-learn 1.2; use get_feature_names_out().
# Wrapped in list() so the printed output stays a plain Python list.
print(list(vectorizer.get_feature_names_out()))
结果
[' ', '.', '?', 'a', 'c', 'd', 'e', 'f', 'h', 'i', 'm', 'n', 'o', 'r', 's', 't', 'u']
# Demo: character n-grams within word boundaries. ngram_range=(2,5) with
# analyzer='char_wb' produces every 2- to 5-character slice of each word
# (padded with spaces at the boundaries) as a feature.
from sklearn.feature_extraction.text import TfidfVectorizer
corpus = [
'This is the first document.',
'This document is the second document.',
'And this is the third one.',
'Is this the first document?',
]
vectorizer = TfidfVectorizer(analyzer='char_wb', ngram_range=(2, 5))
X = vectorizer.fit_transform(corpus)
# get_feature_names() was removed in scikit-learn 1.2; use get_feature_names_out().
# Wrapped in list() so the printed output stays a plain Python list.
print(list(vectorizer.get_feature_names_out()), X.toarray().shape)
结果
[' a', ' an', ' and', ' and ', ' d', ' do', ' doc', ' docu', ' f', ' fi', ' fir', ' firs', ' i', ' is', ' is ', ' o', ' on', ' one', ' one.', ' s', ' se', ' sec', ' seco', ' t', ' th', ' the', ' the ', ' thi', ' thir', ' this', '. ', '? ', 'an', 'and', 'and ', 'co', 'con', 'cond', 'cond ', 'cu', 'cum', 'cume', 'cumen', 'd ', 'do', 'doc', 'docu', 'docum', 'e ', 'e.', 'e. ', 'ec', 'eco', 'econ', 'econd', 'en', 'ent', 'ent ', 'ent.', 'ent. ', 'ent?', 'ent? ', 'fi', 'fir', 'firs', 'first', 'he', 'he ', 'hi', 'hir', 'hird', 'hird ', 'his', 'his ', 'ir', 'ird', 'ird ', 'irs', 'irst', 'irst ', 'is', 'is ', 'me', 'men', 'ment', 'ment ', 'ment.', 'ment?', 'nd', 'nd ', 'ne', 'ne.', 'ne. ', 'nt', 'nt ', 'nt.', 'nt. ', 'nt?', 'nt? ', 'oc', 'ocu', 'ocum', 'ocume', 'on', 'ond', 'ond ', 'one', 'one.', 'one. ', 'rd', 'rd ', 'rs', 'rst', 'rst ', 's ', 'se', 'sec', 'seco', 'secon', 'st', 'st ', 't ', 't.', 't. ', 't?', 't? ', 'th', 'the', 'the ', 'thi', 'thir', 'third', 'this', 'this ', 'um', 'ume', 'umen', 'ument'] (4, 138)