pip install nltk
The installation is not finished at this point: we still need to download the NLTK corpora, which is a large download of roughly 1.8 GB. It can be started directly from code:
import nltk
nltk.download()
Running nltk.download() opens the NLTK downloader, from which the corpora can be fetched.
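If the full 1.8 GB download is not needed, the individual data packages used in this chapter can be fetched by name; a minimal sketch (the exact part-of-speech tagger package id depends on the NLTK version, so treat that entry as an assumption):
import nltk
# download only the data packages this chapter relies on
for pkg in ['stopwords','gutenberg','movie_reviews','names',
            'averaged_perceptron_tagger']:
    nltk.download(pkg)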
import nltk
sw=set(nltk.corpus.stopwords.words('french'))
print "Stop words",list(sw)[:7]
Stop words [u'e\xfbtes', u'\xeates', u'aient', u'auraient', u'aurions', u'auras', u'serait']
Note that all the words in this stopword corpus are lowercase.
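The stopwords corpus covers many languages besides French; a small check of which ones are available:
import nltk
print "Languages:",nltk.corpus.stopwords.fileids()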
gb=nltk.corpus.gutenberg
print "Gutenberg files",gb.fileids()[-5:]
Output:
Gutenberg files [u'milton-paradise.txt', u'shakespeare-caesar.txt', u'shakespeare-hamlet.txt', u'shakespeare-macbeth.txt', u'whitman-leaves.txt']
Take the first two sentences from milton-paradise.txt and remove the stopwords.
text_sent=gb.sents("milton-paradise.txt")[:2] # take the first two sentences
print "Unfiltered:",text_sent
for sent in text_sent: # remove the stopwords
    filtered=[w for w in sent if w.lower() not in sw]
    print "Filtered:",filtered
Output:
Filtered: [u'[', u'Paradise', u'Lost', u'John', u'Milton', u'1667', u']']
Filtered: [u'Book']
Compared with the unfiltered output, "by" and "I" have been removed because they appear in the stopword corpus. Sometimes we also want to strip numbers and proper names from the text. This can be done by filtering on part-of-speech tags: numbers carry the cardinal number tag (CD), and names carry the singular proper noun tag (NNP).
#coding:utf8
import nltk
sw=set(nltk.corpus.stopwords.words('english'))
print "Stop words",list(sw)[:7]
gb=nltk.corpus.gutenberg
print "Gutenberg files",gb.fileids()[-5:]
text_sent=gb.sents("milton-paradise.txt")[:2] # take the first two sentences
print "Unfiltered:",text_sent
for sent in text_sent: # remove the stopwords
    filtered=[w for w in sent if w.lower() not in sw]
    print "Filtered:",filtered
    tagged=nltk.pos_tag(filtered) # tag each word with its part of speech
    print "Tagged:",tagged
    words=[]
    for word in tagged: # drop proper nouns (NNP) and cardinal numbers (CD)
        if word[1]!='NNP' and word[1]!='CD':
            words.append(word[0])
    print words
Stop words [u'all', u'just', u'being', u'over', u'both', u'through', u'yourselves']
Gutenberg files [u'milton-paradise.txt', u'shakespeare-caesar.txt', u'shakespeare-hamlet.txt', u'shakespeare-macbeth.txt', u'whitman-leaves.txt']
Unfiltered: [[u'[', u'Paradise', u'Lost', u'by', u'John', u'Milton', u'1667', u']'], [u'Book', u'I']]
Filtered: [u'[', u'Paradise', u'Lost', u'John', u'Milton', u'1667', u']']
Tagged: [(u'[', 'JJ'), (u'Paradise', 'NNP'), (u'Lost', 'NNP'), (u'John', 'NNP'), (u'Milton', 'NNP'), (u'1667', 'CD'), (u']', 'NN')]
[u'[', u']']
Filtered: [u'Book']
Tagged: [(u'Book', 'NN')]
[u'Book']
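If a tag abbreviation such as NNP or CD is unfamiliar, NLTK can print its definition; a small sketch, assuming the 'tagsets' data package has been downloaded:
import nltk
nltk.help.upenn_tagset('NNP') # singular proper noun
nltk.help.upenn_tagset('CD')  # cardinal number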
pip install scikit-learn
#coding:utf8
import nltk
from sklearn.feature_extraction.text import CountVectorizer
#load the raw text of two documents
gb=nltk.corpus.gutenberg
hamlet=gb.raw("shakespeare-hamlet.txt")
macbeth=gb.raw("shakespeare-macbeth.txt")
#remove English stopwords and build count feature vectors
cv=CountVectorizer(stop_words="english")
print "Feature Vector:",cv.fit_transform([hamlet,macbeth]).toarray()
Feature Vector: [[ 1 0 1 ..., 14 0 1]
[ 0 1 0 ..., 1 1 0]]
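Each column of the feature vector corresponds to one vocabulary term. A sketch of how to inspect that mapping, continuing from the script above (older scikit-learn versions expose get_feature_names(), newer ones get_feature_names_out()):
voc=cv.get_feature_names() # one vocabulary term per column of the feature vector
print "Vocabulary size:",len(voc)
print "First terms:",voc[:5]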
#coding:utf8
import nltk
import string
gb=nltk.corpus.gutenberg
sw=set(nltk.corpus.stopwords.words('english'))
words=gb.words("shakespeare-caesar.txt") # load the document as a word list
punctuation=set(string.punctuation) # set of punctuation characters to filter out
filtered=[w.lower() for w in words if w.lower() not in sw and w.lower() not in punctuation]
fd=nltk.FreqDist(filtered) # word frequency counts
print "Words:",fd.keys()[:5]
print "Counts:",fd.values()[:5]
print "Max:",fd.max()
print "Count",fd['pardon']
#bigrams: frequency analysis of word pairs
#trigrams: frequency analysis of word triples
fd=nltk.FreqDist(nltk.bigrams(filtered)) # frequency analysis of bigrams
print "Bigrams:",fd.keys()[:5]
print "Counts:",fd.values()[:5]
print "Bigram Max:",fd.max()
print "Bigram Count",fd['decay', 'vseth']
Words: [u'fawn', u'writings', u'legacies', u'pardon', u'hats']
Counts: [1, 1, 1, 10, 1]
Max: caesar
Count 10
Bigrams: [(u'bru', u'must'), (u'bru', u'patient'), (u'angry', u'flood'), (u'decay', u'vseth'), (u'cato', u'braue')]
Counts: [1, 1, 1, 1, 1]
Bigram Max: (u'let', u'vs')
Bigram Count 1
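The trigrams mentioned in the comments work the same way; a minimal sketch, continuing from the script above:
fd=nltk.FreqDist(nltk.trigrams(filtered)) # frequency analysis of word triples
print "Trigrams:",fd.keys()[:5]
print "Trigram Max:",fd.max()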
#coding:utf8
import nltk
import string
import random
gb=nltk.corpus.gutenberg
sw=set(nltk.corpus.stopwords.words('english'))
punctuation=set(string.punctuation) # set of punctuation characters to filter out
def word_features(word): # the only feature: the word's length
    return {'len':len(word)}
def isStopWord(word): # is the word a stopword or a punctuation mark?
    return word in sw or word in punctuation
words=gb.words("shakespeare-caesar.txt") # load the document as a word list
labeled_words=[(word.lower(),isStopWord(word.lower())) for word in words]
random.seed(42)
random.shuffle(labeled_words) # shuffle the labeled (word, is_stopword) pairs
print labeled_words[:5]
featuresets=[(word_features(w),label) for (w,label) in labeled_words]
cutoff=int(.9*len(featuresets))
train_set,test_set=featuresets[:cutoff],featuresets[cutoff:] # split into training and test sets
classifier=nltk.NaiveBayesClassifier.train(train_set)
print "'behold' class:",classifier.classify(word_features('behold'))
print "'the' class:",classifier.classify(word_features('the'))
print "Accuracy:",nltk.classify.accuracy(classifier,test_set) # accuracy on the test set
classifier.show_most_informative_features(5) # which features contribute most
Output:
[(u'was', True), (u'greeke', False), (u'cause', False), (u'but', True), (u'house', False)]
'behold' class: False
'the' class: True
Accuracy: 0.857585139319
Most Informative Features
len = 7 False : True = 65.7 : 1.0
len = 1 True : False = 52.0 : 1.0
len = 6 False : True = 51.4 : 1.0
len = 5 False : True = 10.9 : 1.0
len = 2 True : False = 10.4 : 1.0
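Beyond the hard True/False label, the classifier can report how confident it is; a short sketch, continuing from the script above:
dist=classifier.prob_classify(word_features('the')) # probability distribution over the labels
print "P(stopword | 'the'):",dist.prob(True)
print "P(stopword | 'behold'):",classifier.prob_classify(word_features('behold')).prob(True)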
#coding:utf8
import random
from nltk.corpus import movie_reviews
from nltk.corpus import stopwords
from nltk import FreqDist
from nltk import NaiveBayesClassifier
from nltk.classify import accuracy
import string
#build (word list, category) pairs using the corpus categories (pos/neg)
labeled_docs=[(list(movie_reviews.words(fid)),cat)
for cat in movie_reviews.categories()
for fid in movie_reviews.fileids(cat)]
random.seed(42)
random.shuffle(labeled_docs)
#print labeled_docs[:1]
review_words=movie_reviews.words()
print "# Review Words:",len(review_words)
sw=set(stopwords.words('english'))
punctuation=set(string.punctuation) # set of punctuation characters to filter out
def isStopWord(word): # is the word a stopword or a punctuation mark?
    return word in sw or word in punctuation
filtered=[w.lower() for w in review_words if not isStopWord(w.lower())]
print "#After filter:",len(filtered) # number of words left after filtering
words=FreqDist(filtered) # word frequency counts
N=int(0.05*len(words.keys()))
word_features=words.keys()[:N] # use the N most frequent words as features
def doc_features(doc):
    doc_words=FreqDist(w for w in doc if not isStopWord(w))
    features={}
    for word in word_features:
        features['count (%s)'%word]=(doc_words.get(word,0))
    return features
featuresets=[(doc_features(d),c) for (d,c) in labeled_docs]
train_set,test_set=featuresets[200:],featuresets[:200]
classifier=NaiveBayesClassifier.train(train_set)
print "Accuracy",accuracy(classifier,test_set)
classifier.show_most_informative_features()
# Review Words: 1583820
#After filter: 710579
Accuracy 0.695
Most Informative Features
count (nature) = 2 pos : neg = 8.5 : 1.0
count (ugh) = 1 neg : pos = 8.2 : 1.0
count (sans) = 1 neg : pos = 8.2 : 1.0
count (effortlessly) = 1 pos : neg = 6.3 : 1.0
count (mediocrity) = 1 neg : pos = 6.2 : 1.0
count (dismissed) = 1 pos : neg = 5.8 : 1.0
count (wits) = 1 pos : neg = 5.8 : 1.0
count (also) = 6 pos : neg = 5.8 : 1.0
count (want) = 3 neg : pos = 5.5 : 1.0
count (caan) = 1 neg : pos = 5.5 : 1.0
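To label a new review with the trained model, tokenize it and pass it through doc_features; a minimal sketch, continuing from the script above (the review text is made up for illustration, and word_tokenize assumes the 'punkt' data package):
from nltk import word_tokenize
review="The plot was effortlessly engaging and the acting superb." # hypothetical review text
tokens=[w.lower() for w in word_tokenize(review)]
print "Predicted class:",classifier.classify(doc_features(tokens))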
#coding:utf8
import random
from nltk.corpus import movie_reviews
from nltk.corpus import stopwords
from nltk import FreqDist
from nltk import NaiveBayesClassifier
from nltk.classify import accuracy
import string
sw=set(stopwords.words('english'))
punctuation=set(string.punctuation) # set of punctuation characters to filter out
def isStopWord(word): # is the word a stopword or a punctuation mark?
    return word in sw or word in punctuation
review_words=movie_reviews.words()
filtered=[w.lower() for w in review_words if not isStopWord(w.lower())]
#print filtered
words=FreqDist(filtered) # word frequency counts
N=int(0.01*len(words.keys()))
tags=words.keys()[:N]
for tag in tags:
    print tag,":",words[tag]
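Note that in recent NLTK versions FreqDist.keys() is no longer sorted by frequency, so the first N keys are not necessarily the N most frequent words. A version-independent alternative, continuing from the script above, is most_common():
for tag,count in words.most_common(N): # the N most frequent words with their counts
    print tag,":",count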
#coding:utf8
import random
from nltk.corpus import movie_reviews
from nltk.corpus import stopwords
from nltk.corpus import names
from nltk import FreqDist
from nltk import NaiveBayesClassifier
from nltk.classify import accuracy
from sklearn.feature_extraction.text import TfidfVectorizer
import itertools
import pandas as pd
import numpy as np
import string
sw=set(stopwords.words('english'))
punctuation=set(string.punctuation) # set of punctuation characters to filter out
all_names=set([name.lower() for name in names.words()]) # all names from the names corpus, lowercased
def isStopWord(word): # drop stopwords, punctuation, names, and non-alphabetic tokens (isalpha)
    return (word in sw or word in punctuation or word in all_names or not word.isalpha())
review_words=movie_reviews.words()
filtered=[w.lower() for w in review_words if not isStopWord(w.lower())]
#print filtered
words=FreqDist(filtered) # word frequency counts
texts=[]
for fid in movie_reviews.fileids():
    #print fid  (fid is the id of one review file)
    texts.append(" ".join([w.lower() for w in movie_reviews.words(fid) if not isStopWord(w.lower()) and words[w.lower()]>1]))
vectorizer=TfidfVectorizer(stop_words='english')
matrix=vectorizer.fit_transform(texts) # compute the TF-IDF matrix
#print matrix
sums=np.array(matrix.sum(axis=0)).ravel() # sum each word's TF-IDF over all documents into a NumPy array
ranks=[]
#itertools.izip aggregates elements from several iterables, like zip(), but returns an iterator instead of a list
for word, val in itertools.izip(vectorizer.get_feature_names(),sums):
    ranks.append((word,val))
df=pd.DataFrame(ranks,columns=["term","tfidf"])
df=df.sort_values(['tfidf'])
#print df.head()
N=int(0.01*len(df)) # keep the top 1% of terms by summed TF-IDF
df=df.tail(N)
for term,tfidf in itertools.izip(df["term"].values,df["tfidf"].values):
    print term,":",tfidf
# tags=words.keys()[:N]
# for tag in tags:
# print tag,":",words[tag]
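Since the term/TF-IDF pairs are already collected in ranks, the top 1% can also be read off with pandas' nlargest instead of sorting and taking the tail; a small sketch, continuing from the script above:
top=pd.DataFrame(ranks,columns=["term","tfidf"]).nlargest(N,"tfidf") # same top-N terms, in descending order
print top.head()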
#coding:utf8
import networkx as nx
#networkx ships many example and generator graphs; list them
print [s for s in dir(nx) if s.endswith('graph')]
Output:
['LCF_graph', 'adjacency_graph', 'barabasi_albert_graph', 'barbell_graph', 'binomial_graph', 'bull_graph', 'caveman_graph', 'chordal_cycle_graph', 'chvatal_graph', 'circulant_graph', 'circular_ladder_graph', 'complete_bipartite_graph', 'complete_graph', 'complete_multipartite_graph', 'connected_caveman_graph', 'connected_watts_strogatz_graph', 'cubical_graph', 'cycle_graph', 'cytoscape_graph', 'davis_southern_women_graph', 'dense_gnm_random_graph', 'desargues_graph', 'diamond_graph', 'digraph', 'directed_havel_hakimi_graph', 'dodecahedral_graph', 'dorogovtsev_goltsev_mendes_graph', 'duplication_divergence_graph', 'edge_subgraph', 'ego_graph', 'empty_graph', 'erdos_renyi_graph', 'expected_degree_graph', 'extended_barabasi_albert_graph', 'fast_gnp_random_graph', 'florentine_families_graph', 'frucht_graph', 'gaussian_random_partition_graph', 'general_random_intersection_graph', 'geographical_threshold_graph', 'gn_graph', 'gnc_graph', 'gnm_random_graph', 'gnp_random_graph', 'gnr_graph', 'graph', 'grid_2d_graph', 'grid_graph', 'havel_hakimi_graph', 'heawood_graph', 'hexagonal_lattice_graph', 'hoffman_singleton_graph', 'house_graph', 'house_x_graph', 'hypercube_graph', 'icosahedral_graph', 'induced_subgraph', 'is_directed_acyclic_graph', 'jit_graph', 'joint_degree_graph', 'json_graph', 'k_random_intersection_graph', 'karate_club_graph', 'kl_connected_subgraph', 'krackhardt_kite_graph', 'ladder_graph', 'line_graph', 'lollipop_graph', 'make_max_clique_graph', 'make_small_graph', 'margulis_gabber_galil_graph', 'moebius_kantor_graph', 'multidigraph', 'multigraph', 'navigable_small_world_graph', 'newman_watts_strogatz_graph', 'node_link_graph', 'null_graph', 'nx_agraph', 'octahedral_graph', 'pappus_graph', 'partial_duplication_graph', 'path_graph', 'petersen_graph', 'planted_partition_graph', 'powerlaw_cluster_graph', 'projected_graph', 'quotient_graph', 'random_clustered_graph', 'random_degree_sequence_graph', 'random_geometric_graph', 'random_k_out_graph', 'random_kernel_graph', 'random_partition_graph', 'random_regular_graph', 'random_shell_graph', 'relabel_gexf_graph', 'relaxed_caveman_graph', 'scale_free_graph', 'sedgewick_maze_graph', 'star_graph', 'stochastic_graph', 'subgraph', 'tetrahedral_graph', 'to_networkx_graph', 'tree_graph', 'triad_graph', 'triangular_lattice_graph', 'trivial_graph', 'truncated_cube_graph', 'truncated_tetrahedron_graph', 'turan_graph', 'tutte_graph', 'uniform_random_intersection_graph', 'watts_strogatz_graph', 'waxman_graph', 'wheel_graph', 'windmill_graph']
import matplotlib.pyplot as plt
G=nx.davis_southern_women_graph()
plt.figure(1)
a=dict(nx.degree(G)) # node -> degree
plt.hist(a.values()) # histogram of node degrees
The output is a histogram of the node degrees (figure not shown).
plt.figure(2)
pos=nx.spring_layout(G)
nx.draw(G,pos,node_size=9)
nx.draw_networkx_labels(G,pos)
plt.show()
The output is a drawing of the network with node labels (figure not shown).
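A quick numeric summary of the graph complements the two plots; a minimal sketch, continuing from the script above:
print "Nodes:",G.number_of_nodes()
print "Edges:",G.number_of_edges()
# the five best-connected nodes by degree
print sorted(dict(nx.degree(G)).items(),key=lambda x:x[1],reverse=True)[:5]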