当提取文本的特征时,通常需要将文本转换为数值表示,以便将其输入到机器学习模型中。以下是一个使用scikit-learn库的文本特征提取示例:
以下是从文本中提取特征的一些建议:
{"我": 1, "要": 1, "从": 1, "文本": 1, "提取": 1, "如下": 1, "特征": 1}
2-gram: ["我要", "要从", "从文本", "文本提取", "提取如下", "如下特征"]
[("我", "代词"), ("要", "动词"), ("从", "介词"), ("文本", "名词"), ("提取", "动词"), ("如下", "形容词"), ("特征", "名词")]
情感分析结果: 中立
示例: "OpenAI是一家位于美国的人工智能研究实验室。"
实体识别结果: [("OpenAI", "组织"), ("美国", "地名"), ("人工智能", "领域"), ("研究实验室", "机构")]
主题建模结果: ["人工智能", "自然语言处理"]
from sklearn.feature_extraction.text import CountVectorizer

# Build a bag-of-words representation of a small demo corpus:
# CountVectorizer tokenizes each document and counts term occurrences.
vectorizer = CountVectorizer()

# A toy corpus of four short documents.
text = ["This is the first document.", "This is the second document.", "And this is the third one.", "Is this the first document?"]

# Learn the vocabulary and transform the corpus into a sparse
# document-term count matrix.
X = vectorizer.fit_transform(text)

# Show the learned vocabulary.
# NOTE: get_feature_names() was deprecated in scikit-learn 1.0 and
# removed in 1.2; get_feature_names_out() is the supported replacement.
print(vectorizer.get_feature_names_out())

# Show the dense document-term matrix (one row per document).
print(X.toarray())
国内镜像ChatMind AI
群体分析和异常分析。让我们回顾一下,本书中所谓异常分析,指的是辨别与正常或通常秩序、形式或规则发生偏离的情况的含义;或者辨别某些应该出现但却没有出现的问题的含义。当群体进行移动或变化时(无论是变成更大,还是变小和变得不重要),异常分析的结果都是可以识别的。异常分析包含三种基线,分别是文化基线、技术基线和功能基线。异常分析依靠这些基线来判断哪些是“正常的”。一般来说,群体的组成、发展、影响和最终消亡,就是基线出现扰动的原因。群体的内容包括群体的构成、发展、与其他群体的关联、运动和影响等等,每一方面都涉及基线的变化。这些变化有时很容易发现,但有时却是人的意识难以发现的。因此,异常分析及其对基线的依赖,就是通过这一交集与群体分析产生了互动。
异常分析 辨别 与正常或通常秩序、形式或规则发生偏离的情况
异常分析 可以用于辨别 某些应该出现但却没有出现的问题
异常分析 基于 文化基线、技术基线和功能基线
群体的组成、发展、影响和最终消亡 是 基线出现扰动的原因
群体内容 包括 群体的构成、发展、与其他群体的关联、运动和影响等
对应输入就可以
主谓宾提取_nlp 主谓宾_Penno_彭儒的博客-CSDN博客
# Copyright 2017 Peter de Vocht
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import en_core_web_sm
from collections.abc import Iterable
# Load the small English spaCy pipeline once at module import; all
# parsing below goes through this shared `nlp` object.
nlp = en_core_web_sm.load()
# Dependency labels that mark a token as a subject (incl. passive/clausal).
SUBJECTS = {"nsubj", "nsubjpass", "csubj", "csubjpass", "agent", "expl"}
# Dependency labels that mark a token as an object (direct/indirect/predicative).
OBJECTS = {"dobj", "dative", "attr", "oprd"}
# POS tags that will break adjoining items when expanding a noun phrase.
BREAKER_POS = {"CCONJ", "VERB"}
# Lower-cased word forms that are treated as negations.
NEGATIONS = {"no", "not", "n't", "never", "none"}
# does the dependency set contain any coordinating conjunction?
def contains_conj(depSet):
    conjunctions = ("and", "or", "nor", "but", "yet", "so", "for")
    return any(conj in depSet for conj in conjunctions)
# get subs joined by conjunctions
def _get_subs_from_conjunctions(subs):
more_subs = []
for sub in subs:
# rights is a generator
rights = list(sub.rights)
rightDeps = {tok.lower_ for tok in rights}
if contains_conj(rightDeps):
more_subs.extend([tok for tok in rights if tok.dep_ in SUBJECTS or tok.pos_ == "NOUN"])
if len(more_subs) > 0:
more_subs.extend(_get_subs_from_conjunctions(more_subs))
return more_subs
# get objects joined by conjunctions
def _get_objs_from_conjunctions(objs):
more_objs = []
for obj in objs:
# rights is a generator
rights = list(obj.rights)
rightDeps = {tok.lower_ for tok in rights}
if contains_conj(rightDeps):
more_objs.extend([tok for tok in rights if tok.dep_ in OBJECTS or tok.pos_ == "NOUN"])
if len(more_objs) > 0:
more_objs.extend(_get_objs_from_conjunctions(more_objs))
return more_objs
# find sub dependencies
def _find_subs(tok):
head = tok.head
while head.pos_ != "VERB" and head.pos_ != "NOUN" and head.head != head:
head = head.head
if head.pos_ == "VERB":
subs = [tok for tok in head.lefts if tok.dep_ == "SUB"]
if len(subs) > 0:
verb_negated = _is_negated(head)
subs.extend(_get_subs_from_conjunctions(subs))
return subs, verb_negated
elif head.head != head:
return _find_subs(head)
elif head.pos_ == "NOUN":
return [head], _is_negated(tok)
return [], False
# is the tok set's left or right negated?
def _is_negated(tok):
parts = list(tok.lefts) + list(tok.rights)
for dep in parts:
if dep.lower_ in NEGATIONS:
return True
return False
# get all the verbs on tokens with negation marker
def _find_svs(tokens):
svs = []
verbs = [tok for tok in tokens if tok.pos_ == "VERB"]
for v in verbs:
subs, verbNegated = _get_all_subs(v)
if len(subs) > 0:
for sub in subs:
svs.append((sub.orth_, "!" + v.orth_ if verbNegated else v.orth_))
return svs
# get grammatical objects for a given set of dependencies (including passive sentences)
def _get_objs_from_prepositions(deps, is_pas):
objs = []
for dep in deps:
if dep.pos_ == "ADP" and (dep.dep_ == "prep" or (is_pas and dep.dep_ == "agent")):
objs.extend([tok for tok in dep.rights if tok.dep_ in OBJECTS or
(tok.pos_ == "PRON" and tok.lower_ == "me") or
(is_pas and tok.dep_ == 'pobj')])
return objs
# get objects from the dependencies using the attribute dependency
def _get_objs_from_attrs(deps, is_pas):
for dep in deps:
if dep.pos_ == "NOUN" and dep.dep_ == "attr":
verbs = [tok for tok in dep.rights if tok.pos_ == "VERB"]
if len(verbs) > 0:
for v in verbs:
rights = list(v.rights)
objs = [tok for tok in rights if tok.dep_ in OBJECTS]
objs.extend(_get_objs_from_prepositions(rights, is_pas))
if len(objs) > 0:
return v, objs
return None, None
# xcomp; open complement - verb has no suject
def _get_obj_from_xcomp(deps, is_pas):
for dep in deps:
if dep.pos_ == "VERB" and dep.dep_ == "xcomp":
v = dep
rights = list(v.rights)
objs = [tok for tok in rights if tok.dep_ in OBJECTS]
objs.extend(_get_objs_from_prepositions(rights, is_pas))
if len(objs) > 0:
return v, objs
return None, None
# collect the subjects attached to verb `v` together with its negation flag
def _get_all_subs(v):
    negated = _is_negated(v)
    subs = [tok for tok in v.lefts if tok.dep_ in SUBJECTS and tok.pos_ != "DET"]
    if subs:
        # pull in coordinated subjects ("A and B ...")
        subs.extend(_get_subs_from_conjunctions(subs))
    else:
        # nothing directly to the left: search up the dependency tree
        found, negated = _find_subs(v)
        subs.extend(found)
    return subs, negated
# find the main verb - or any aux verb if we can't find it
def _find_verbs(tokens):
verbs = [tok for tok in tokens if _is_non_aux_verb(tok)]
if len(verbs) == 0:
verbs = [tok for tok in tokens if _is_verb(tok)]
return verbs
# is the token a verb? (excluding auxiliary verbs)
def _is_non_aux_verb(tok):
return tok.pos_ == "VERB" and (tok.dep_ != "aux" and tok.dep_ != "auxpass")
# is the token a verb? (excluding auxiliary verbs)
def _is_verb(tok):
return tok.pos_ == "VERB" or tok.pos_ == "AUX"
# return the verb to the right of this verb in a CCONJ relationship if applicable
# returns a tuple, first part True|False and second part the modified verb if True
def _right_of_verb_is_conj_verb(v):
# rights is a generator
rights = list(v.rights)
# VERB CCONJ VERB (e.g. he beat and hurt me)
if len(rights) > 1 and rights[0].pos_ == 'CCONJ':
for tok in rights[1:]:
if _is_non_aux_verb(tok):
return True, tok
return False, v
# collect the objects of verb `v` for an active/passive sentence; may return
# a different verb when the objects hang off an open complement (xcomp)
def _get_all_objs(v, is_pas):
    # `rights` is a generator; materialise it
    right_children = list(v.rights)
    objs = [tok for tok in right_children
            if tok.dep_ in OBJECTS or (is_pas and tok.dep_ == 'pobj')]
    objs.extend(_get_objs_from_prepositions(right_children, is_pas))
    # follow an xcomp chain ("wanted to buy X"): the complement verb carries
    # the objects, so it replaces `v` in the result
    xcomp_verb, xcomp_objs = _get_obj_from_xcomp(right_children, is_pas)
    if xcomp_verb is not None and xcomp_objs is not None and len(xcomp_objs) > 0:
        objs.extend(xcomp_objs)
        v = xcomp_verb
    if objs:
        objs.extend(_get_objs_from_conjunctions(objs))
    return v, objs
# return true if the sentence is passive - at he moment a sentence is assumed passive if it has an auxpass verb
def _is_passive(tokens):
for tok in tokens:
if tok.dep_ == "auxpass":
return True
return False
# resolve a 'that' where/if appropriate
def _get_that_resolution(toks):
for tok in toks:
if 'that' in [t.orth_ for t in tok.lefts]:
return tok.head
return None
# lemma-based "stemming": lemmatise words that parse to a single token,
# pass everything else through unchanged
def _get_lemma(word: str):
    parsed = nlp(word)
    if len(parsed) != 1:
        return word
    return parsed[0].lemma_
# debugging aid: dump dependency label, POS, head and children for each token
def printDeps(toks):
    for tok in toks:
        left_words = [t.orth_ for t in tok.lefts]
        right_words = [t.orth_ for t in tok.rights]
        print(tok.orth_, tok.dep_, tok.pos_, tok.head.orth_, left_words, right_words)
# expand an object / subject token into its surrounding noun-phrase chunk
def expand(item, tokens, visited):
    # a bare 'that' may stand in for another token; try to resolve it first
    if item.lower_ == 'that':
        resolved = _get_that_resolution(tokens)
        if resolved is not None:
            item = resolved

    parts = []
    # take left modifiers, stopping at the first breaker (CCONJ / VERB)
    # and skipping negation words
    if hasattr(item, 'lefts'):
        for part in item.lefts:
            if part.pos_ in BREAKER_POS:
                break
            if part.lower_ not in NEGATIONS:
                parts.append(part)
    parts.append(item)
    # take right modifiers under the same stopping / skipping rules
    if hasattr(item, 'rights'):
        for part in item.rights:
            if part.pos_ in BREAKER_POS:
                break
            if part.lower_ not in NEGATIONS:
                parts.append(part)
    # recurse into the first right child of the final part when it is an
    # unvisited determiner or noun; `visited` guards against cycles
    if hasattr(parts[-1], 'rights'):
        for child in parts[-1].rights:
            if child.pos_ in ("DET", "NOUN") and child.i not in visited:
                visited.add(child.i)
                parts.extend(expand(child, tokens, visited))
            break
    return parts
# join the `.text` of each token with single spaces; anything that is not
# iterable becomes the empty string
def to_str(tokens):
    if not isinstance(tokens, Iterable):
        return ''
    return ' '.join(item.text for item in tokens)
# find verbs and their subjects / objects to create SVOs, detect passive/active sentences
def findSVOs(tokens):
    """Extract (subject, verb, object) triples from a parsed sentence.

    Negated verbs/objects get a "!" prefix on the verb.  For passive
    sentences the subject and object positions are swapped and the verb's
    lemma is used; active sentences use the verb's lower-cased form.
    Subject-only pairs (subject, verb) are emitted when no object is found.
    """
    svos = []
    is_pas = _is_passive(tokens)
    verbs = _find_verbs(tokens)
    visited = set()  # recursion detection
    for v in verbs:
        subs, verbNegated = _get_all_subs(v)
        # hopefully there are subs, if not, don't examine this verb any longer
        if len(subs) > 0:
            isConjVerb, conjV = _right_of_verb_is_conj_verb(v)
            if isConjVerb:
                # coordinated verbs ("beat and hurt"): emit one triple per verb
                v2, objs = _get_all_objs(conjV, is_pas)
                for sub in subs:
                    for obj in objs:
                        objNegated = _is_negated(obj)
                        if is_pas:  # reverse object / subject for passive
                            svos.append((to_str(expand(obj, tokens, visited)),
                                         "!" + v.lemma_ if verbNegated or objNegated else v.lemma_, to_str(expand(sub, tokens, visited))))
                            svos.append((to_str(expand(obj, tokens, visited)),
                                         "!" + v2.lemma_ if verbNegated or objNegated else v2.lemma_, to_str(expand(sub, tokens, visited))))
                        else:
                            svos.append((to_str(expand(sub, tokens, visited)),
                                         "!" + v.lower_ if verbNegated or objNegated else v.lower_, to_str(expand(obj, tokens, visited))))
                            svos.append((to_str(expand(sub, tokens, visited)),
                                         "!" + v2.lower_ if verbNegated or objNegated else v2.lower_, to_str(expand(obj, tokens, visited))))
            else:
                # single verb: _get_all_objs may substitute an xcomp verb for v
                v, objs = _get_all_objs(v, is_pas)
                for sub in subs:
                    if len(objs) > 0:
                        for obj in objs:
                            objNegated = _is_negated(obj)
                            if is_pas:  # reverse object / subject for passive
                                svos.append((to_str(expand(obj, tokens, visited)),
                                             "!" + v.lemma_ if verbNegated or objNegated else v.lemma_, to_str(expand(sub, tokens, visited))))
                            else:
                                svos.append((to_str(expand(sub, tokens, visited)),
                                             "!" + v.lower_ if verbNegated or objNegated else v.lower_, to_str(expand(obj, tokens, visited))))
                    else:
                        # no obj - just return the SV parts
                        svos.append((to_str(expand(sub, tokens, visited)),
                                     "!" + v.lower_ if verbNegated else v.lower_,))
    return svos