Implementing n-grams with sklearn's CountVectorizer

#coding=utf-8
'''
Created on 2018-1-25

'''

from sklearn.feature_extraction.text import CountVectorizer

text = ["A smile is the most charming part of a person forever.","A smile is"]

# ngram_range=(2, 2) extracts 2-grams only; decode_error="ignore" skips
# characters that cannot be decoded; token_pattern=r'\b\w+\b' tokenizes on
# word boundaries, keeping one-letter words such as "a"
ngram_vectorizer = CountVectorizer(ngram_range=(2, 2), decode_error="ignore",
                                   token_pattern=r'\b\w+\b', min_df=1)

x1 = ngram_vectorizer.fit_transform(text)
print(x1)
# (0, 7)    1
# (0, 0)    1
# (0, 5)    1
# (0, 6)    1
# (0, 2)    1
# (0, 4)    1
# (0, 9)    1
# (0, 3)    1
# (0, 8)    1
# (0, 1)    1
# (1, 8)    1
# (1, 1)    1
print(x1.toarray())
# [[1 1 1 1 1 1 1 1 1 1]
#  [0 1 0 0 0 0 0 0 1 0]]
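# Each row of the dense array is one document; each column counts the 2-gram
# whose index in vocabulary_ (printed below) matches that column. Document 1
# ("A smile is") contains only "a smile" (index 1) and "smile is" (index 8)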
# Inspect the generated vocabulary
print(ngram_vectorizer.vocabulary_)
# {'person forever': 7, 'part of': 6, 'smile is': 8, 'a smile': 1, 'of a': 5, 'the most': 9, 'is the': 3, 'charming part': 2, 'a person': 0, 'most charming': 4}
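
# A quick check of why the custom token_pattern matters (a minimal sketch;
# build_analyzer() is the standard CountVectorizer hook that returns the
# tokenize-and-n-gram function): sklearn's default pattern r"(?u)\b\w\w+\b"
# drops one-letter tokens such as "a", so bigrams like "a smile" would be lost
default_vectorizer = CountVectorizer(ngram_range=(2, 2))
print(default_vectorizer.build_analyzer()("A smile is"))
# ['smile is']
print(ngram_vectorizer.build_analyzer()("A smile is"))
# ['a smile', 'smile is']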

# With ngram_range=(2, 4), 2-, 3-, and 4-grams are all extracted
ngram_vectorizer = CountVectorizer(ngram_range=(2, 4), decode_error="ignore",
                                   token_pattern=r'\b\w+\b', min_df=1)
x1 = ngram_vectorizer.fit_transform(text)
print(x1)
# (0, 16)    1
# (0, 19)    1
# (0, 7)    1
# (0, 13)    1
# (0, 26)    1
# (0, 10)    1
# (0, 23)    1
# (0, 4)    1
# (0, 1)    1
# (0, 15)    1
# (0, 18)    1
# (0, 6)    1
# (0, 12)    1
# (0, 25)    1
# (0, 9)    1
# (0, 22)    1
# (0, 3)    1
# (0, 20)    1
# (0, 0)    1
# (0, 14)    1
# (0, 17)    1
# (0, 5)    1
# (0, 11)    1
# (0, 24)    1
# (0, 8)    1
# (0, 21)    1
# (0, 2)    1
# (1, 3)    1
# (1, 21)    1
# (1, 2)    1
print(ngram_vectorizer.vocabulary_)
# {'smile is': 21, 'charming part of a': 7, 'a smile': 2, 'part of': 17, 'is the most charming': 10, 'the most': 24, 'of a person forever': 16, 'the most charming': 25, 'most charming part': 12, 'is the': 8, 'charming part': 5, 'most charming': 11, 'part of a': 18, 'smile is the most': 23, 'person forever': 20, 'is the most': 9, 'most charming part of': 13, 'of a': 14, 'smile is the': 22, 'charming part of': 6, 'a person forever': 1, 'the most charming part': 26, 'a smile is the': 4, 'part of a person': 19, 'a smile is': 3, 'a person': 0, 'of a person': 15}
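
# To map column indices back to n-grams, invert vocabulary_ (newer
# scikit-learn versions also offer get_feature_names_out()); the fitted
# vectorizer can then be reused on unseen text via transform(), which
# counts only n-grams already in the learned vocabulary
inv_vocabulary = {index: ngram for ngram, index in ngram_vectorizer.vocabulary_.items()}
print(inv_vocabulary[3])
# a smile is
x2 = ngram_vectorizer.transform(["a smile is the best gift"])
print(x2.toarray()[0].sum())
# 6  -- "a smile", "smile is", "is the", "a smile is", "smile is the", "a smile is the"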
