文章目录
- 绘制带条件的频率分布表
- 评估词汇多样性
- 分词
  - 自定义函数实现分词
  - 使用模拟退火算法的非确定搜索
- 常用的基础函数
绘制带条件的频率分布表
#布朗语料库
import nltk
from nltk.corpus import brown
# List the genre labels available in the Brown corpus.
brown.categories()
genres = brown.categories()
# Conditional frequency distribution: one word-frequency table per genre.
cfd = nltk.ConditionalFreqDist(
    (genre, word)
    for genre in genres
    for word in brown.words(categories=genre)
)
# Modal verbs used as the table columns. The sample list has to exist before
# tabulate() is called; this particular selection is an editorial choice.
modals = ["can", "could", "may", "might", "must", "will"]
# Print a genre-by-modal table of counts.
cfd.tabulate(conditions=genres, samples=modals)
评估词汇多样性
# Lexical diversity
def lexical_diversity(text):
    """Return the average number of occurrences per distinct token in *text*.

    Args:
        text: A sized iterable of hashable tokens (for example a list of words).

    Returns:
        ``len(text)`` divided by the number of distinct tokens.

    Raises:
        ZeroDivisionError: If *text* is empty.
    """
    return len(text) / len(set(text))
def percentage(count, total):
    """Return *count* expressed as a percentage of *total*.

    Args:
        count: The part.
        total: The whole; must be non-zero.

    Returns:
        ``100 * count / total``.

    Raises:
        ZeroDivisionError: If *total* is zero.
    """
    return 100 * count / total
# NOTE(review): text4 is not defined in this snippet; presumably it comes from
# `from nltk.book import *` further down in the file -- confirm before running.
# Tokens per distinct token in text4.
lexical_diversity(text4)
# Share, in percent, of the tokens in text4 that are exactly "a".
percentage(text4.count("a"),len(text4))
分词
自定义函数实现分词
def segment(text, segs):
    """Split *text* into words at the boundaries marked in *segs*.

    Args:
        text: The unsegmented string.
        segs: A string of flags; a "1" at index ``i`` means a word ends after
            ``text[i]``. Any other character means no boundary there.

    Returns:
        A list of the words in order. Whatever follows the last boundary is
        always appended as the final word, so that entry is an empty string
        when the last boundary falls at the end of *text*.
    """
    words = []
    start = 0
    for index, flag in enumerate(segs):
        if flag == "1":
            end = index + 1
            words.append(text[start:end])
            start = end
    # The tail after the last boundary has no closing "1" of its own.
    words.append(text[start:])
    return words
# Sample input: words run together with no separators.
text = "doyouseethekittyseethedoggydoyoulikethekittylikethedoggy"
# Two candidate segmentations; each "1" marks a word boundary after the
# character at the same index in text (see segment).
seg1 = "000000000000000100000000001000000000000000010000000000001"
seg2 = "01001001001000010010010000101001000100100001000100100001"
# Apply both segmentations to the same text.
seg1_out = segment(text,seg1)
seg2_out = segment(text,seg2)
def evaluate(text, segs):
    """Score the segmentation of *text* described by *segs*.

    The score is the number of words produced by ``segment`` plus the length
    of the distinct words joined by single spaces. ``anneal`` treats a lower
    score as better.

    Args:
        text: The unsegmented string.
        segs: Boundary flags in the format accepted by ``segment``.

    Returns:
        The integer score.
    """
    words = segment(text, segs)
    text_size = len(words)
    # Only the joined length matters, so the set's iteration order is irrelevant.
    lexicon_size = len(" ".join(set(words)))
    return text_size + lexicon_size
使用模拟退火算法的非确定搜索
from random import randint
def flip(segs, pos):
    """Return a copy of *segs* with the binary digit at index *pos* inverted.

    Args:
        segs: A string made of the characters "0" and "1".
        pos: Index of the digit to toggle.

    Returns:
        A new string of the same length with that one digit flipped.
    """
    toggled = str(1 - int(segs[pos]))
    return segs[:pos] + toggled + segs[pos + 1:]
def flip_n(segs, n):
    """Return *segs* after toggling *n* randomly chosen positions.

    Positions are drawn independently, so the same index may be picked more
    than once.

    Args:
        segs: A string made of the characters "0" and "1".
        n: Number of flips to apply; the input is returned unchanged if this
            is zero or negative.

    Returns:
        The resulting string.
    """
    for _ in range(n):
        segs = flip(segs, randint(0, len(segs) - 1))
    return segs
def anneal(text, segs, iterations, cooling_rate):
    """Search for a low-scoring segmentation of *text* by simulated annealing.

    The temperature starts at ``len(segs)`` and is divided by *cooling_rate*
    after every round until it is no greater than 0.5. In each round,
    *iterations* candidates are generated by randomly flipping
    ``round(temperature)`` flags of the current segmentation, and the
    candidate with the lowest ``evaluate`` score (if any beats the current
    one) becomes the new current segmentation. The score and the resulting
    words are printed after every round, and the final flags once at the end.

    Args:
        text: The unsegmented string.
        segs: Initial boundary flags in the format accepted by ``segment``.
        iterations: Number of random candidates tried per round.
        cooling_rate: Divisor applied to the temperature after each round;
            must be greater than 1 so that the temperature decreases.

    Returns:
        The best segmentation string found.

    Raises:
        ValueError: If *cooling_rate* is not greater than 1.
    """
    # A divisor of 1 or less never cools the system and the loop cannot end.
    if cooling_rate <= 1:
        raise ValueError("cooling_rate must be greater than 1")
    temperature = float(len(segs))
    while temperature > 0.5:
        best_segs, best = segs, evaluate(text, segs)
        # The number of flips depends only on the temperature of this round.
        n_flips = int(round(temperature))
        for _ in range(iterations):
            guess = flip_n(segs, n_flips)
            score = evaluate(text, guess)
            if score < best:
                best, best_segs = score, guess
        segs = best_segs
        temperature /= cooling_rate
        # `best` is already the score of the segmentation just selected.
        print(best, segment(text, segs))
    print(segs)
    return segs
# Same unsegmented sample text as in the segmentation example.
text = "doyouseethekittyseethedoggydoyoulikethekittylikethedoggy"
# Starting segmentation handed to the annealing search.
seg1 = "00000000000000010000000000100000000000000001000000000000"
# 5000 candidates per round; the temperature is divided by 1.2 after each round.
anneal(text, seg1, 5000, 1.2)
常用的基础函数
import nltk
from nltk.book import *
# concordance() searches for the given word and shows each occurrence together with its context
text1.concordance("monstrous")
# similar() finds words that appear in contexts similar to those of "monstrous"
text1.similar("monstrous")
# common_contexts() finds the contexts shared by two or more words
text2.common_contexts(["monstrous","very"])