import nltk
from nltk.corpus import names
import random
# Labeled data set: every name from the corpus paired with its gender label.
# The right-hand side is evaluated before `names` is rebound, so the corpus
# reader imported above is still reachable here; afterwards `names` refers to
# this list of (name, label) tuples.
names = ([(name, 'male') for name in names.words('male.txt')] +
         [(name, 'female') for name in names.words('female.txt')])
# Shuffle in place so positional slices are not ordered male-then-female.
random.shuffle(names)
def gender_features(word):
    """Return the feature dict for *word*: its final character.

    Args:
        word: A non-empty string (or other indexable sequence).

    Returns:
        A dict with the single key ``'lastword'`` mapped to ``word[-1]``.

    Raises:
        IndexError: If *word* is empty.
    """
    return {'lastword': word[-1]}
# Partition the labeled names by position: the first 500 entries form the
# test data, the next 500 the development-test data, the rest the training data.
test_names = names[:500]
devtest_names = names[500:1000]
train_names = names[1000:]

# Turn each (name, gender) pair into a (feature dict, gender) pair.
test_set = [(gender_features(n), g) for (n, g) in test_names]
dev_test = [(gender_features(n), g) for (n, g) in devtest_names]
train_set = [(gender_features(n), g) for (n, g) in train_names]
# Train a naive Bayes classifier on the training feature sets.
classifier = nltk.NaiveBayesClassifier.train(train_set)

# Spot-check a single name. The returned label is not stored or printed, so
# this line only shows output in an interactive session.
classifier.classify(gender_features('Neo'))

# Report accuracy on the development-test set, then on the held-out test set.
# The single-argument print(...) form works on both Python 2 and Python 3.
print(nltk.classify.accuracy(classifier, dev_test))
print(nltk.classify.accuracy(classifier, test_set))
from nltk.corpus import senseval
# Load the senseval instances for 'hard.pos' and hold out the first 10% as
# test data; the remaining 90% is used for training.
# NOTE: this rebinds `train_set` and `test_set`, replacing the name-gender
# feature sets built earlier in the script.
instances = senseval.instances('hard.pos')
size = int(len(instances) * 0.1)
train_set, test_set = instances[size:], instances[:size]