文本特征抽取

字典特征抽取

#文本特征抽取
import numpy as np
import pandas as pd
from sklearn.feature_extraction import DictVectorizer

# Sample records mixing string-valued fields ('name', 'hobby') with a
# numeric field ('age').
dict_ = [
    {'name': 'name1', 'hobby': 'h1', 'age': 18},
    {'name': 'name2', 'hobby': 'h2', 'age': 30},
    {'name': 'name1', 'hobby': 'h3', 'age': 28},
]
dictv = DictVectorizer()
# fit_transform() returns a sparse matrix by default; .toarray() converts it
# to a dense ndarray. Per the scikit-learn docs, string-valued features are
# one-hot encoded (one "key=value" column each) and numeric values are kept.
data = dictv.fit_transform(dict_).toarray()

# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# get_feature_names_out() is its replacement (requires scikit-learn >= 1.0).
col = dictv.get_feature_names_out()
# Label each column of the encoded matrix with its feature name.
df = pd.DataFrame(data=data, columns=col)
image.png

文本内容抽取

# Install the tokenizer first (run in a shell, not in Python): pip install jieba
from sklearn.feature_extraction.text import CountVectorizer
import jieba

# Raw Chinese sentences used as the corpus.
li = ['饿了吗,我下面给你吃,美国',
'容我插一下嘴,美国',
'做我的女朋友一定要喜欢吃海鲜,美国',
'每天只想与你做四件事,一日三餐,美国',
'我有一个大的荷尔蒙想要放,美国']

# Chinese text has no spaces between words, so each sentence is segmented
# with jieba and re-joined with spaces so CountVectorizer can tokenize it.
jieba_data = [' '.join(jieba.lcut(sentence)) for sentence in li]

cv = CountVectorizer()
# Term-count matrix (documents x vocabulary), densified from the sparse result.
# NOTE: with CountVectorizer's default token_pattern, tokens shorter than two
# characters are dropped from the vocabulary.
data = cv.fit_transform(jieba_data).toarray()

# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# get_feature_names_out() is its replacement (requires scikit-learn >= 1.0).
col = cv.get_feature_names_out()
# Label each column of the count matrix with its vocabulary term.
df = pd.DataFrame(data=data, columns=col)
image.png

重要程度分析 Tfidf

from sklearn.feature_extraction.text import TfidfVectorizer

tfidf = TfidfVectorizer()
# TF-IDF weight matrix over the space-separated, jieba-segmented sentences
# held in jieba_data; .toarray() densifies the sparse result.
tf_data = tfidf.fit_transform(jieba_data).toarray()
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# get_feature_names_out() is its replacement (requires scikit-learn >= 1.0).
col = tfidf.get_feature_names_out()

# Label each column of the TF-IDF matrix with its vocabulary term.
df2 = pd.DataFrame(data=tf_data, columns=col)
image.png

你可能感兴趣的:(文本特征抽取)