网页中文词语抓取+词云显示

下面用到四个重要的库:
1,urllib2:用于访问网页,通过urlopen方法获取网页内容;
2,sgmllib:通过其中的SGMLParser对网页内容进行分析,此处需要继承SGMLParser基类并重写其处理方法,然后将网页内容喂给它;
3,wordcloud:通过WordCloud配置词云,通过其generate方法创建词云;
4,matplotlib:用于显示词云;

# -*- coding:utf-8-*-
import urllib2
from sgmllib import SGMLParser

__author__ = 'niliang'

from wordcloud import WordCloud
import matplotlib.pyplot as plt

# Draw the word cloud
def draw_wordcloud(
        wordlist,
        font_path="/usr/share/fonts/truetype/droid/DroidSansJapanese.ttf"):
    """Generate a word cloud from ``wordlist`` and display it with matplotlib.

    Args:
        wordlist: Text to visualise. The caller in this file passes words
            separated by single spaces.
        font_path: Path of the font file handed to ``WordCloud``. The default
            is the path this script originally hard-coded; pass a different
            file on machines where that font is not installed.
            NOTE(review): presumably the font must contain CJK glyphs for
            Chinese words to render -- confirm with the chosen font.
    """
    cloud = WordCloud(
        font_path=font_path,
        background_color='black',
        # Allow as many words as there are space-separated tokens in the input.
        max_words=len(wordlist.split(' ')),
        max_font_size=60,
        min_font_size=5
    )
    word_cloud = cloud.generate(wordlist)  # build the cloud from the text

    # Show the generated cloud without axes.
    plt.imshow(word_cloud)
    plt.axis('off')
    plt.show()


def is_ustr(in_str):
    """Return ``in_str`` when every character passes ``is_uchar``.

    Returns ``False`` if any character fails the check. An empty string has
    no failing character, so it is returned unchanged (and is therefore still
    falsy for callers that use the result as a condition).
    """
    if all(is_uchar(ch) for ch in in_str):
        return in_str
    return False
def is_uchar(uchar):
    """Return True when ``uchar`` falls in the range U+4E00..U+9FA5 inclusive.

    Anything outside that range -- ASCII digits, Latin letters, '/' and so
    on -- yields False.
    """
    return u'\u4e00' <= uchar <= u'\u9fa5'

class ListName(SGMLParser):
    """SGML parser that collects the text found inside ``<a>`` elements.

    Instance attributes (both set in ``__init__``):
      * ``is_a`` -- 1 while an ``<a>`` element is open, "" otherwise.
      * ``name`` -- list of the text chunks seen while ``is_a`` was 1, in
        the order ``handle_data`` received them.
    """

    def __init__(self):
        SGMLParser.__init__(self)
        self.name = []
        self.is_a = ""

    def start_a(self, attrs):
        # Handler for an opening <a> tag; the tag's attributes are not used.
        self.is_a = 1

    def end_a(self):
        # Handler for a closing </a> tag: stop collecting text.
        self.is_a = ""

    def handle_data(self, text):
        # Ignore any text that is not inside an <a> element.
        if self.is_a != 1:
            return
        self.name.append(text)

if __name__ == '__main__':
    # Download the category page. The response is closed explicitly so the
    # connection is released even when read() raises.
    response = urllib2.urlopen('http://list.taobao.com/browse/cat-0.htm')
    try:
        content = response.read()
    finally:
        response.close()

    # Collect the text of every <a> element on the page.
    listname = ListName()
    listname.feed(content)

    # De-duplicate the link texts, decode them as UTF-8, trim surrounding
    # spaces, and keep only the entries accepted by is_ustr.
    itemset_original = set(listname.name)
    itemset_unicode = []
    for item in itemset_original:
        tmp = item.decode('utf8').strip(' ')
        if is_ustr(tmp):
            itemset_unicode.append(tmp)

    # Build the text for the cloud: every word is followed by one space.
    itemstr = ''.join(word + ' ' for word in itemset_unicode)
    print(itemstr)
    draw_wordcloud(itemstr)

你可能感兴趣的:(python)