下面用到四个重要的库:
1,urllib2:用于访问网页,通过urlopen方法获取网页内容;
2,sgmllib:通过其中的SGMLParser对网页内容进行分析,此处需要重载SGMLParser基类,并将网页内容喂给它;
3,wordcloud:通过WordCloud配置词云,通过其generate创建词云;
4,matplotlib:用于显示词云;
# -*- coding:utf-8-*-
import urllib2
from sgmllib import SGMLParser
__author__ = 'niliang'
from wordcloud import WordCloud
import matplotlib.pyplot as plt
def draw_wordcloud(wordlist):
    """Build a word cloud from ``wordlist`` and show it with matplotlib.

    Args:
        wordlist: A single string of words separated by spaces. It is
            passed as-is to ``WordCloud.generate``.

    The cloud is drawn on a black background with font sizes between 5
    and 60, using the font file at the hard-coded ``font_path`` below.
    """
    # Allow as many words as there are space-separated tokens in the input.
    token_count = len(wordlist.split(' '))

    cloud = WordCloud(
        font_path="/usr/share/fonts/truetype/droid/DroidSansJapanese.ttf",
        background_color='black',
        max_words=token_count,
        max_font_size=60,
        min_font_size=5,
    )

    # Generate the cloud from the text.
    word_cloud = cloud.generate(wordlist)

    # Display the resulting image without axes.
    plt.imshow(word_cloud)
    plt.axis('off')
    plt.show()
def is_ustr(in_str):
    """Return ``in_str`` if every character passes ``is_uchar``, else ``False``.

    The input itself (not ``True``) is returned on success, so an empty
    string comes back unchanged and is therefore falsy to callers that
    only test the result's truthiness.
    """
    if all(is_uchar(ch) for ch in in_str):
        return in_str
    return False
def is_uchar(uchar):
    """Return ``True`` if ``uchar`` lies in the range U+4E00..U+9FA5 inclusive.

    Only this one range is accepted; checks for digits, ASCII letters
    and '/' are not enabled.
    """
    return u'\u4e00' <= uchar <= u'\u9fa5'
class ListName(SGMLParser):
    """SGML parser that records the text seen while inside an ``a`` element.

    After feeding a document, ``name`` holds every data chunk that was
    delivered to ``handle_data`` between ``start_a`` and ``end_a``.
    """

    def __init__(self):
        SGMLParser.__init__(self)
        # Collected text chunks, in the order they were encountered.
        self.name = []
        # Flag: 1 while inside an <a> element, "" otherwise.
        self.is_a = ""

    def start_a(self, attrs):
        """Mark that an ``a`` element has been opened; ``attrs`` is unused."""
        self.is_a = 1

    def end_a(self):
        """Mark that the current ``a`` element has been closed."""
        self.is_a = ""

    def handle_data(self, text):
        """Store ``text`` when it appears inside an ``a`` element."""
        if self.is_a != 1:
            return
        self.name.append(text)
if __name__ == '__main__':
    # Fetch the raw page body from the hard-coded Taobao URL.
    content = urllib2.urlopen('http://list.taobao.com/browse/cat-0.htm').read()

    # Run the page through the parser to collect the text of its links.
    listname = ListName()
    listname.feed(content)

    # De-duplicate the collected strings.
    itemset_original = set(listname.name)

    # Decode each entry as UTF-8 (a non-UTF-8 entry raises
    # UnicodeDecodeError), trim surrounding spaces, and keep only the
    # entries accepted by is_ustr.
    decoded_items = (raw.decode('utf8').strip(' ') for raw in itemset_original)
    itemset_unicode = [text for text in decoded_items if is_ustr(text)]

    # Join the kept entries into one string, each followed by a space.
    itemstr = ''.join(text + ' ' for text in itemset_unicode)

    print(itemstr)
    draw_wordcloud(itemstr)