Deepti Chopra (India)
Translated by Wang Wei
import nltk
text=" Welcome readers. I hope you find it interesting. Please do reply."
from nltk.tokenize import sent_tokenize  # split the text into sentences
print(sent_tokenize(text))  # in the interactive shell print() is optional, but a script needs it
Load PunktSentenceTokenizer and call its tokenize() function to split the text into sentences:
import nltk
tokenizer=nltk.data.load('tokenizers/punkt/english.pickle')
text=" Hello everyone. Hope all are fine and doing well. Hope you find the book interesting"
print(tokenizer.tokenize(text))
import nltk
french_tokenizer=nltk.data.load('tokenizers/punkt/french.pickle')
print(french_tokenizer.tokenize('Deux agressions en quelques jours, voilà ce qui a motivé hier matin le débrayage collège franco-britannique de Levallois-Perret. Deux agressions en quelques jours, voilà ce qui a motivé hier matin le débrayage Levallois. L’équipe pédagogique de ce collège de 750 élèves avait déjà été choquée par l’agression, janvier, d’un professeur d’histoire. L’équipe pédagogique de ce collège de 750 élèves avait déjà été choquée par l’agression, mercredi, d’un professeur d’histoire'))
The word_tokenize function uses an instance of NLTK's TreebankWordTokenizer class to perform word tokenization.
# split the text into words; punctuation becomes separate tokens
import nltk
text = nltk.word_tokenize("Pierre Vinken , 59 years old , will join as a nonexecutive director on Nov. 29 .")
print(text)
# count the number of words in a piece of text
import nltk
from nltk import word_tokenize
r = input("Please write a text: ")  # raw_input() in Python 2
print("The length of text is", len(word_tokenize(r)), "words")
# split the text into words without splitting off punctuation
import nltk
from nltk.tokenize import TreebankWordTokenizer
tokenizer = TreebankWordTokenizer()
print(tokenizer.tokenize("Have a nice day. I hope you find the book interesting"))
TreebankWordTokenizer tokenizes according to the conventions of the Penn Treebank corpus, splitting off contractions.
PunktWordTokenizer tokenizes by splitting on punctuation, but keeps the punctuation with the word instead of creating new tokens.
WordPunctTokenizer tokenizes by turning punctuation into entirely new tokens (commonly used).
# split the text into words; don't is split into do and n't
import nltk
text=nltk.word_tokenize(" Don't hesitate to ask questions")
print(text)
# split the text into words; don't is split into don, ' and t
from nltk.tokenize import WordPunctTokenizer
tokenizer=WordPunctTokenizer()
print(tokenizer.tokenize(" Don't hesitate to ask questions"))
Import the RegexpTokenizer module from NLTK and build a regular expression that matches the tokens in the text:
# split the text into words; don't is kept as a single token
import nltk
from nltk.tokenize import RegexpTokenizer
tokenizer = RegexpTokenizer(r"[\w']+")
print(tokenizer.tokenize("Don't hesitate to ask questions"))
# split the text into words; Don't is split into Don and 't
import nltk
from nltk.tokenize import regexp_tokenize
sent="Don't hesitate to ask questions"
print(regexp_tokenize(sent, pattern=r'\w+|\$[\d\.]+|\S+'))
# tokenize on whitespace; same result as the first example in section 1.1.5
import nltk
from nltk.tokenize import RegexpTokenizer
tokenizer = RegexpTokenizer(r'\s+', gaps=True)
print(tokenizer.tokenize("Don't hesitate to ask questions"))
# select words that start with a capital letter; a lone capital letter is not matched
sent="SHe secured 90.56% in class XXXX.She is a meritorious student"
capt=RegexpTokenizer('[A-Z]\w+')
print(capt.tokenize(sent))
# a predefined regular-expression tokenizer; there are no blank lines, so the output is the whole of sent unchanged
import nltk
sent=" She secured 90.56 % in class X . She is a meritorious student"
from nltk.tokenize import BlanklineTokenizer
print(BlanklineTokenizer().tokenize(sent))
# split the text on whitespace (spaces, newlines, etc.); 90.56 % becomes 90.56 and %, while don't and super-men are not split
import nltk
sent=" She secured 90.56 % in class X . She is a meritorious student"
from nltk.tokenize import WhitespaceTokenizer
print(WhitespaceTokenizer().tokenize(sent))
import nltk
sent= "She secured 90.56 % in class X. She is a meritorious student"
print(sent.split())  # same tokens as WhitespaceTokenizer above
print(sent.split(' '))  # same result as above
sent=" She secured 90.56 % in class X \n. She is a meritorious student\n"  # the newline split below yields three strings
print(sent.split('\n'))
import nltk
from nltk.tokenize import BlanklineTokenizer
sent=" She secured 90.56 % in class X \n. She is a meritorious student\n"
print(BlanklineTokenizer().tokenize(sent))
from nltk.tokenize import LineTokenizer
print(LineTokenizer(blanklines='keep').tokenize(sent))
print(LineTokenizer(blanklines='discard').tokenize(sent))
import nltk
sent=" She secured 90.56 % in class X \n. She is a meritorious student\n"
from nltk.tokenize import SpaceTokenizer
print(SpaceTokenizer().tokenize(sent))  # the leading space produces an empty '' token at the start
print(sent.split(' '))  # same result as SpaceTokenizer
# span_tokenize() returns a sequence of (start, end) tuples, one per token (words, punctuation, %).
# start is the index of the token's first character, and end - start gives the token's length;
# e.g. 'She' is preceded by a space, so its span is (1, 4). The \n counts as one character and
# is not included in the span of the neighbouring token.
import nltk
from nltk.tokenize import WhitespaceTokenizer
sent=" She secured 90.56 % in class X \n. She is a meritorious student\n"
print(list(WhitespaceTokenizer().span_tokenize(sent)))
# spans_to_relative() converts the spans into relative form: each tuple is
# (distance from the end of the previous token to the start of this one, length of the token).
import nltk
from nltk.tokenize import WhitespaceTokenizer
from nltk.tokenize.util import spans_to_relative
sent=" She secured 90.56 % in class X \n. She is a meritorious student\n"
print(list(spans_to_relative(WhitespaceTokenizer().span_tokenize(sent))))
# string_span_tokenize(sent, " ") returns (start, end) spans for the pieces between the given
# separator (a single space); since \n is not the separator, it is included in the span of the
# neighbouring token.
import nltk
from nltk.tokenize.util import string_span_tokenize
sent=" She secured 90.56 % in class X \n. She is a meritorious student\n"
print(list(string_span_tokenize(sent, " ")))
# remove punctuation from every tokenized sentence
import re
import string
text=[" It is a pleasant evening.","Guests, who came from US arrived at the venue","Food was tasty."]
from nltk.tokenize import word_tokenize
tokenized_docs=[word_tokenize(doc) for doc in text]
x=re.compile('[%s]' % re.escape(string.punctuation))
tokenized_docs_no_punctuation = []
for review in tokenized_docs:
    new_review = []
    for token in review:
        new_token = x.sub(u'', token)
        if not new_token == u'':
            new_review.append(new_token)
    tokenized_docs_no_punctuation.append(new_review)
print(tokenized_docs_no_punctuation)
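An equivalent, more compact variant (a sketch that assumes the same tokenized_docs as above) uses str.translate with a table that deletes every character in string.punctuation:
import string
table = str.maketrans('', '', string.punctuation)  # alternative: delete punctuation characters via a translation table
tokenized_docs_no_punctuation = [
    [token.translate(table) for token in review if token.translate(table) != '']
    for review in tokenized_docs
]
print(tokenized_docs_no_punctuation)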
#### 1.2.2 Case conversion of text
text='HARdWork IS KEy to SUCCESS'
print(text.lower())  # convert all of the text to lowercase
print(text.upper())  # convert all of the text to uppercase
Stop words are words that need to be filtered out when performing information retrieval or other natural language processing tasks.
import nltk  # the stopwords corpus must be downloaded first: nltk.download('stopwords')
from nltk.corpus import stopwords
stops=set(stopwords.words('english'))  # build the set of stop words to filter out (stop words are removed in information retrieval to narrow the search space; widely used by search engines)
words=["Don't", 'hesitate','to','ask','questions']
print([word for word in words if word not in stops])
print(stopwords.fileids())  # list the languages for which stop word lists are available
import nltk
from nltk.corpus import stopwords
print(stopwords.words('english'))  # show the English stop words
def para_fraction(text):
    stopwords = nltk.corpus.stopwords.words('english')
    para = [w for w in text if w.lower() not in stopwords]
    return len(para) / len(text)
print(para_fraction(nltk.corpus.reuters.words()))
print(para_fraction(nltk.corpus.inaugural.words()))
# replacers.py (regular expressions for expanding contractions, removing repeated characters, and replacing synonyms)
import re
replacement_patterns = [
    (r'won\'t', 'will not'),
    (r'can\'t', 'cannot'),
    (r'i\'m', 'i am'),
    (r'ain\'t', 'is not'),
    (r'(\w+)\'ll', r'\g<1> will'),
    (r'(\w+)n\'t', r'\g<1> not'),
    (r'(\w+)\'ve', r'\g<1> have'),
    (r'(\w+)\'s', r'\g<1> is'),
    (r'(\w+)\'re', r'\g<1> are'),
    (r'(\w+)\'d', r'\g<1> would')
]
Perform the contraction replacement:
class RegexpReplacer(object):
    def __init__(self, patterns=replacement_patterns):
        self.patterns = [(re.compile(regex), repl) for (regex, repl) in patterns]

    def replace(self, text):
        s = text
        for (pattern, repl) in self.patterns:
            (s, count) = re.subn(pattern, repl, s)
        return s
from nltk.corpus import wordnet
Remove repeated characters:
class RepeatReplacer(object):
    def __init__(self):
        self.repeat_regexp = re.compile(r'(\w*)(\w)\2(\w*)')
        self.repl = r'\1\2\3'

    def replace(self, word):
        if wordnet.synsets(word):  # keep words that are already correct: 'happpy' is reduced to 'happy', but 'happy' itself is left alone
            return word
        repl_word = self.repeat_regexp.sub(self.repl, word)
        if repl_word != word:
            return self.replace(repl_word)
        else:
            return repl_word
# use the RegexpReplacer class from replacers to expand contractions
import nltk
from replacers import RegexpReplacer
replacer= RegexpReplacer()
replacer.replace("Don't hesitate to ask questions")
print(replacer.replace("She must've gone to the market but she didn't go"))
# expand contractions before tokenizing the text
import nltk
from nltk.tokenize import word_tokenize
from replacers import RegexpReplacer
replacer=RegexpReplacer()
word_tokenize("Don't hesitate to ask questions")
print(word_tokenize(replacer.replace("Don't hesitate to ask questions")))
# use the RepeatReplacer class from replacers to remove repeated characters
import nltk
from replacers import RepeatReplacer
replacer=RepeatReplacer()
print(replacer.replace('happy'))
# use the WordReplacer class from replacers to perform synonym replacement
import nltk
from replacers import WordReplacer
replacer=WordReplacer({'congrats':'congratulations'})
# 'congrats' is replaced with 'congratulations'; a word not in the mapping ('maths') is returned unchanged
print(replacer.replace('congrats'))
print(replacer.replace('maths'))
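WordReplacer is defined in replacers.py rather than in this file; a minimal sketch that matches the behaviour above (an assumption, not necessarily the book's exact code) would be:
class WordReplacer(object):
    def __init__(self, word_map):
        self.word_map = word_map

    def replace(self, word):
        # return the mapped word if it is in the dictionary, otherwise the word itself
        return self.word_map.get(word, word)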
import nltk
from nltk.corpus import gutenberg
from nltk.probability import FreqDist
import matplotlib
import matplotlib.pyplot as plt
matplotlib.use('TkAgg')
fd = FreqDist()
for text in gutenberg.fileids():
    for word in gutenberg.words(text):
        fd.inc(word)  # FreqDist.inc() was removed in NLTK 3; the corrected version below uses fd[word] += 1
ranks = []
freqs = []
for rank, word in enumerate(fd):
    ranks.append(rank + 1)
    freqs.append(fd[word])
plt.loglog(ranks, freqs)
plt.xlabel('rank (r)', fontsize=14, fontweight='bold')
plt.ylabel('frequency (f)', fontsize=14, fontweight='bold')
plt.grid(True)
plt.show()
# improved version found online: counts are incremented directly (FreqDist.inc() no longer exists in NLTK 3) and sorted by frequency before plotting
import nltk
from nltk.corpus import gutenberg
from nltk.probability import FreqDist
import matplotlib
import matplotlib.pyplot as plt
matplotlib.use('TkAgg')
fd = FreqDist()
for text in gutenberg.fileids():
    # every word of every text
    for word in gutenberg.words(text):
        # increment the count for this word
        fd[word] += 1
print(list(fd.items())[:5])
fd_sorted = sorted(fd.items(), key=lambda x: x[1], reverse=True)
print(fd_sorted[:5])
fd_sorted_dict = dict(fd_sorted)
ranks = []
freqs = []
for rank, word in enumerate(fd_sorted_dict):
    ranks.append(rank + 1)
    freqs.append(fd_sorted_dict[word])
plt.loglog(ranks, freqs)
plt.xlabel('rank (r)', fontsize=14, fontweight='bold')
plt.ylabel('frequency (f)', fontsize=14, fontweight='bold')
plt.grid(True)
plt.show()
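A more compact way to build the rank/frequency lists (a sketch assuming the fd built above) is FreqDist.most_common(), which already returns (word, count) pairs sorted by descending count:
# ranks 1..N paired with the sorted counts
freqs = [count for word, count in fd.most_common()]
ranks = list(range(1, len(freqs) + 1))
plt.loglog(ranks, freqs)
plt.xlabel('rank (r)')
plt.ylabel('frequency (f)')
plt.show()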
Use standard scores to analyze the output of a named entity recognizer:
from __future__ import print_function
from nltk.metrics import *
training='PERSON OTHER PERSON OTHER OTHER ORGANIZATION'.split()
testing='PERSON OTHER OTHER OTHER OTHER OTHER'.split()
print(accuracy(training,testing))
trainset=set(training)
testset=set(testing)
print(precision(trainset,testset))
print(recall(trainset,testset))
print(f_measure(trainset,testset))
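For reference, the set-based scores above follow the usual definitions: precision = |ref ∩ test| / |test|, recall = |ref ∩ test| / |ref|, and the F-measure is their harmonic mean. A small sketch recomputing them by hand for the same trainset and testset:
# manual check of the set-based metrics (assumes trainset and testset from above)
common = trainset & testset
p = len(common) / len(testset)   # precision: how much of the test output is correct
r = len(common) / len(trainset)  # recall: how much of the reference is recovered
f = 2 * p * r / (p + r)          # F-measure: harmonic mean of precision and recall
print(p, r, f)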
The Edit Distance algorithm from the nltk.metrics package consists of the following:
from __future__ import print_function
def _edit_dist_init(len1, len2):
    lev = []
    for i in range(len1):
        lev.append([0] * len2)  # initialize 2D array to zero
    for i in range(len1):
        lev[i][0] = i           # column 0: 0,1,2,3,4,...
    for j in range(len2):
        lev[0][j] = j           # row 0: 0,1,2,3,4,...
    return lev

def _edit_dist_step(lev, i, j, s1, s2, transpositions=False):
    c1 = s1[i - 1]
    c2 = s2[j - 1]
    a = lev[i - 1][j] + 1                # skipping a character in s1
    b = lev[i][j - 1] + 1                # skipping a character in s2
    c = lev[i - 1][j - 1] + (c1 != c2)   # substitution
    d = c + 1                            # transposition; never picked by default
    if transpositions and i > 1 and j > 1:
        if s1[i - 2] == c2 and s2[j - 2] == c1:
            d = lev[i - 2][j - 2] + 1
    lev[i][j] = min(a, b, c, d)          # pick the cheapest

def edit_distance(s1, s2, transpositions=False):
    len1 = len(s1)  # set up a 2-D array
    len2 = len(s2)
    lev = _edit_dist_init(len1 + 1, len2 + 1)
    for i in range(len1):  # iterate over the array
        for j in range(len2):
            _edit_dist_step(lev, i + 1, j + 1, s1, s2, transpositions=transpositions)
    return lev[len1][len2]
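A quick check of the function just defined (a usage sketch, not part of the original notes): with transpositions enabled, swapping two adjacent characters counts as a single edit.
print(edit_distance("ab", "ba"))                       # 2: two substitutions
print(edit_distance("ab", "ba", transpositions=True))  # 1: one transposition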
Use the nltk.metrics package to compute edit distance:
import nltk
from nltk.metrics import *
print(edit_distance("relate","relation"))
print(edit_distance("suggestion","calculation"))
For two sets X and Y, the similarity of their overlap can be defined as follows:
Jaccard(X, Y) = |X ∩ Y| / |X ∪ Y|
Jaccard(X, X) = 1
Jaccard(X, Y) = 0 if X ∩ Y = ∅
Implementation of the Jaccard similarity coefficient (NLTK's jaccard_distance returns 1 minus this similarity):
import nltk
from nltk.metrics import *
X=set([10,20,30,40])
Y=set([20,30,60])
print(jaccard_distance(X,Y))
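As a sanity check (a sketch using the same X and Y as above): the intersection is {20, 30} and the union has 5 elements, so the similarity is 2/5 and jaccard_distance returns 1 - 2/5 = 0.6.
# manual computation of the Jaccard distance for the same sets
print(1 - len(X & Y) / len(X | Y))  # 0.6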
Definitions related to the Smith-Waterman distance algorithm:
Binary distance metric in NLTK:
import nltk
from nltk.metrics import *
X = set([10,20,30,40])
Y= set([30,50,70])
print(binary_distance(X, Y))
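binary_distance simply reports whether the two label sets are identical: it returns 0.0 for equal sets and 1.0 otherwise, so the call above prints 1.0. For example:
print(binary_distance(X, X))  # 0.0: identical sets
print(binary_distance(X, Y))  # 1.0: any difference at all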
MASI distance metric in NLTK:
import nltk
from nltk.metrics import *
X = set([10,20,30,40])
Y= set([30,50,70])
print(masi_distance(X, Y))
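MASI (Measuring Agreement on Set-valued Items, Passonneau 2006) scales the Jaccard-style overlap by a factor that reflects how the two sets relate (equal, one a subset of the other, overlapping, or disjoint). A rough sketch of the computation for the sets above, under the assumption that NLTK follows this formulation:
# assumed re-computation of MASI for the sets above
inter, union = len(X & Y), len(X | Y)
if X == Y:
    m = 1.0    # identical sets
elif X <= Y or Y <= X:
    m = 0.67   # one set contains the other
elif inter > 0:
    m = 0.33   # sets overlap but neither contains the other
else:
    m = 0.0    # disjoint sets
print(1 - (inter / union) * m)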