6 学习分类文章

本章学习应该回答的三个问题:

1. 怎样识别用于分类文章的显著特征?

2. 怎样构建语言模型,用于自动语言处理?

3. 通过这些模型我们可以了解语言的什么方面?

import random

import nltk
from nltk.corpus import brown, movie_reviews, names

 def gender_features(word):

     return {'last_letter':word[-1]}

#

# labled_names=([(name,'male') for name in names.words('male.txt')]+

#              [(name,'female') for name in names.words('female.txt')])

#

# random.shuffle(labled_names)  # shuffle 原地打乱列表并返回 None,不需要接收返回值

#

# featuresets=[(gender_features(n),gender)for (n,gender)in labled_names]

#

# train_set, test_set=featuresets[:500],featuresets[500:]

#

# classifier=nltk.NaiveBayesClassifier.train(train_set)

#

# print(classifier.classify(gender_features('Shasha')))

# print(nltk.classify.accuracy(classifier, test_set))

# print(classifier.show_most_informative_features(6))

# def gender_features2(name): #特征集,找出所有可能的特征,效果不一定好

#    features={}

#    features['first_letter']=name[0].lower()

#    features['last_letter']=name[-1].lower()

#    for letter in 'abcdefghijklmnopqrstuvwxyz':

#        features['count({})'.format(letter)]=name.lower().count(letter)

#        features['has({})'.format(letter)]=(letter in name.lower())

#

#    return features

#

# print(gender_features2('John'))

#一旦特征集被选定,完善可以通过错误分析,如下

# train_names=labled_names[1500:]

# devtest_names=labled_names[500:1500]

# test_names=labled_names[:500]

# train_set=[(gender_features(n),gender) for (n,gender) in train_names]

# devtest_set=[(gender_features(n),gender) for (n,gender) in devtest_names]

# test_set=[(gender_features(n),gender) for (n,gender) in test_names]

# classifier=nltk.NaiveBayesClassifier.train(train_set)

# print(nltk.classify.accuracy(classifier,devtest_set))

#使用dev/test 我们可以识别一系列在预测时产生的错误

# errors=[]

# for (name,tag) in devtest_names:

#    guess=classifier.classify(gender_features(name))

#    if guess != tag:

#        errors.append((tag,guess,name))

#

# for (tag,guess,name)in sorted(errors)[:10]:

#    print('correct={} guess={} name={}'.format(tag,guess,name))

#分析结果显示,后两位字母组合也是较为显著特征集,所以应该修改之前的特征集

#

# def gender_features(word):

#    return {'suffix1':word[-1:],

#            'suffix2':word[-2:]}

#该方法可以反复使用 直到 找出最佳特征集。

#为文献资料分类

#print(movie_reviews.fileids())

#print(movie_reviews.categories())

# documents=[(list(movie_reviews.words(fileid)),category)

#            for category in movie_reviews.categories()

#            for fileid in movie_reviews.fileids(category)]

# random.shuffle(documents)

#

# all_words=nltk.FreqDist(w.lower() for w in movie_reviews.words())

# word_features=list(all_words)[:2000]

#

# def document_features(document):

#    document_words=set(document)

#    features={}

#    for word in word_features:

#        features['contains({})'.format(word)]=(word in document_words)

#    return features

#

#

# featuresets=[(document_features(d), c)

#              for (d,c) in documents]

# train_set,test_set=featuresets[100:],featuresets[:100]

# classifier=nltk.NaiveBayesClassifier.train(train_set)

# print(nltk.classify.accuracy(classifier,test_set))

# print(classifier.show_most_informative_features(6))

# Part-of-Speech-Tagging

# suffix_fdist=nltk.FreqDist()

# for word in brown.words():

#    word=word.lower()

#    suffix_fdist[word[-1:]] += 1

#    suffix_fdist[word[-2:]] += 1

#    suffix_fdist[word[-3:]] += 1

# common_suffix=[suffix for (suffix,count)in suffix_fdist.most_common(100)]

#

# def pos_features(word):

#    features={}

#    for suffix in common_suffix:

#        features['endswith({})'.format(suffix)]=word.lower().endswith(suffix)

#    return features

#

# tagged_words=brown.tagged_words(categories='news')

# featuresets=[(pos_features(n),g) for (n,g) in tagged_words]

# size=int(len(featuresets)*0.1)

# train_set,test_set=featuresets[size:],featuresets[:size]

#

# classifier=nltk.DecisionTreeClassifier.train(train_set)

# print(classifier.classify(pos_features('name')))

# print(nltk.classify.accuracy(classifier,test_set))

# Exploiting Context 下面方法无法泛化,因为它没有获得前面词的词性标记,因此缺乏实际使用意义

# def pos_features(sentence,i):

#    features={'suffix(1)':sentence[i][-1:],

#              'suffix(2)':sentence[i][-2:],

#              'suffix(3)':sentence[i][-3:]}

#    if i==0:

#        features['prev-word']=''

#    else:

#        features['prev-word']=sentence[i-1]

#    return features

#

# tagged_sents=brown.tagged_sents(categories='news')

# featuresets=[]

# for tagged_sent in tagged_sents:

#    untagged_sent = nltk.tag.untag(tagged_sent)

#    for i, (word,tag) in enumerate(tagged_sent):

#        featuresets.append((pos_features(untagged_sent,i),tag))

#

# size=int(len(featuresets)*0.1)

# train_set,test_set=featuresets[size:],featuresets[:size]

# classifier=nltk.NaiveBayesClassifier.train(train_set)

# print(nltk.classify.accuracy(classifier,test_set))

# print(classifier.show_most_informative_features(6))

#序列分类:连续分类或贪婪序列分类:为第一个输入找到最优标签,然后使用这个问题的答案帮助找到下一个最优标签,

# 为此我们得扩展以上方法的特征集,增加前面单词的词性标注

# def pos_features(sentence,i,history):

#    features={'suffix(1)':sentence[i][-1:],

#              'suffix(2)':sentence[i][-2:],

#              'suffix(3)':sentence[i][-3:]}

#    if i==0:

#        features['prev-word']=''

#        features['prev-tag'] =''

#    else:

#        features['prev-word']=sentence[i - 1]

#        features['prev-tag'] = history[i - 1]

#    return features

#

# class ConsecutivePosTagger(nltk.TaggerI):

#

#    def __init__(self,train_sents):

#        train_set=[]

#        for tagged_sent in train_sents:

#            untagged_sent = nltk.tag.untag(tagged_sent)

#            history=[]

#            for i, (word,tag) in enumerate(tagged_sent):

#                featureset=pos_features(untagged_sent,i,history)

#                train_set.append((featureset,tag))

#                history.append(tag)

#        self.classifier= nltk.NaiveBayesClassifier.train(train_set)

#

#    def tag(self,sentence):

#        history=[]

#        for i, word in enumerate(sentence):

#            featureset=pos_features(sentence,i,history)

#            #print(featureset)

#            tag = self.classifier.classify(featureset)

#            history.append(tag)

#

#        return zip(sentence,history)

#

# tagged_sents=brown.tagged_sents(categories='news')

# size=int(len(tagged_sents)*0.1)

# train_sents,test_sents=tagged_sents[size:],tagged_sents[:size]

# tagger=ConsecutivePosTagger(train_sents)

#print(list(tagger.tag(brown.sents(categories='romance')[1])))

#print(list(tagger.tag(brown.tagged_sents(categories='news')[1])))

#print(tagger.evaluate(test_sents))

#监督性学习其余案例

#分割句子

# sents=nltk.corpus.treebank_raw.sents()

# tokens=[]

# boundaries=set()

# offset=0

# for sent in sents:

#    tokens.extend(sent)

#    offset+=len(sent)

#    boundaries.add(offset-1)

# #下一步是确定可能指示句号是否为边界的特征

# def punct_features(tokens, i):

#    return{'next-word-capitalized':tokens[i+1][0].isupper(),

#            'prev-word':tokens[i-1].lower(),

#            'punct':tokens[i],

#            'prev-word-is-one-char':len(tokens[i-1])==1}

#

# featuresets=[(punct_features(tokens,i),(i in boundaries))

#              for i in range(1,len(tokens)-1)

#              if tokens[i] in '.?!']

#

# size=int(len(featuresets)*0.1)

# train_set,test_set=featuresets[size:],featuresets[:size]

# classifier=nltk.NaiveBayesClassifier.train(train_set)

# print(nltk.classify.accuracy(classifier,test_set))

#

# #使用该句子分割器

#

# def segment_sentences(words):

#    start=0

#    sents=[]

#    for i, word in enumerate(words):

#        if word in '.?!' and classifier.classify(punct_features(words,i))==True:

#            sents.append(words[start:i+1])

#            start=i+1

#    if start < len(words):

#        sents.append(words[start:])

#

#    return sents

#识别对话行为:识别对话行为是理解对话中句子意思的第一步

# Load enough posts for the 10% hold-out split further down to be non-empty:
# with a slice of 2, int(2 * 0.1) == 0, so the classifier would be scored on
# an empty test set and the printed accuracy would be meaningless.
posts = nltk.corpus.nps_chat.xml_posts()[:10000]

def dialogue_act_features(post):
    """Build a bag-of-words feature dict for one chat post.

    Args:
        post: The text of the post, as passed to ``nltk.word_tokenize``.

    Returns:
        A dict holding one ``'contains(<token>)': True`` entry for each
        distinct lower-cased token found in ``post``.
    """
    features = {}
    for word in nltk.word_tokenize(post):
        # Lower-case so that "Hi" and "hi" map to the same feature key.
        features['contains({})'.format(word.lower())] = True
    return features

# Pair each post's features with the value of its XML "class" attribute,
# which serves as the label the classifier learns to predict.
featuresets = [(dialogue_act_features(post.text), post.get('class'))
               for post in posts]

# Hold out the first 10% of the examples for testing; train on the rest.
size = int(len(featuresets) * 0.1)
train_set, test_set = featuresets[size:], featuresets[:size]

classifier = nltk.NaiveBayesClassifier.train(train_set)
print(nltk.classify.accuracy(classifier, test_set))

#套路都是一样:建立特征提取器,建立特征数据集,训练分类器。并检查其效率

你可能感兴趣的:(6 学习分类文章)