Python自然语言处理——nltk库入门之文本分词(英文)

# -*- coding: utf-8 -*-
from __future__ import unicode_literals
import nltk.tokenize as tk

# Sample English text to be tokenized. Adjacent string literals are
# concatenated by the parser, so this is a single one-line string.
doc = (
    "Are you ok? "
    "I'm fun,and you? "
    "I'm ok."
)

# Sentence tokenization: split the text into sentences and print each
# one prefixed with its 1-based position, right-aligned to width 2.
tokens = tk.sent_tokenize(doc)
for number, sentence in enumerate(tokens, start=1):
    print('{:2d}'.format(number), sentence)
# Visual separator printed after the sentence listing.
print('-' * 10)

# Word tokenization: run the text through WordPunctTokenizer and print
# each resulting token prefixed with its 1-based position, right-aligned
# to width 2.
tokenizer = tk.WordPunctTokenizer()
tokens = tokenizer.tokenize(doc)
for number, word in enumerate(tokens, start=1):
    print('{:2d}'.format(number), word)

你可能感兴趣的:(科技,程序员)