# HTMLParser

#coding:utf-8

import urllib2
from HTMLParser import HTMLParser
 
class MyHTMLParser(HTMLParser):
    """Collect one attribute's value from every start tag that passes a filter.

    A start tag passes the filter when its name equals ``get_tag`` and its
    ``key_word`` attribute equals ``key_value``.  For every such tag that also
    carries the ``key_properby`` attribute, that attribute's value is appended
    to ``self.links`` in the order the tags are encountered.

    Args:
        get_tag: Tag name to look for (e.g. ``'a'``).
        key_word: Name of the attribute used as the filter.
        key_value: Value the ``key_word`` attribute must have.
        key_properby: Name of the attribute whose value is collected.
    """

    def __init__(self, get_tag, key_word, key_value, key_properby):
        HTMLParser.__init__(self)
        self.get_tag = get_tag
        self.key_word = key_word
        self.key_value = key_value
        self.key_properby = key_properby
        # Collected attribute values, filled in by handle_starttag().
        self.links = []

    def handle_starttag(self, tag, attrs):
        """Record the ``key_properby`` value of a matching start tag.

        Args:
            tag: Name of the start tag being reported by the parser.
            attrs: Sequence of ``(name, value)`` pairs for the tag's attributes.
        """
        if tag != self.get_tag:
            return

        # dict() keeps the last value when an attribute name is repeated.
        attr_map = dict(attrs)

        # The filter does not depend on which attribute is being inspected, so
        # it is checked once here instead of once per attribute.  A missing
        # key_word attribute yields None, so it only matches key_value=None.
        if attr_map.get(self.key_word) != self.key_value:
            return

        # Membership test (not .get) so an attribute without a value, which is
        # stored as None, is still collected.
        if self.key_properby in attr_map:
            self.links.append(attr_map[self.key_properby])

def _fetch_links(url, get_tag, key_word, key_value, key_properby):
    """Download ``url`` and return the attribute values MyHTMLParser collects."""
    from contextlib import closing

    # Close the HTTP response explicitly rather than leaving it to garbage
    # collection.
    with closing(urllib2.urlopen(url)) as response:
        html_code = response.read()

    parser = MyHTMLParser(get_tag, key_word, key_value, key_properby)
    parser.feed(html_code)
    parser.close()
    return parser.links


def source_url_get():
    """Print the links found on flvcd.com for each video link of a Youku list page.

    Fetches the hard-coded Youku folder page, collects the ``href`` of every
    ``<a>`` tag whose ``charset`` attribute is ``'5-1'``, and submits each of
    those links to the flvcd.com parse page.  From every parse result the
    ``href`` of each ``<a>`` tag whose ``onclick`` attribute is
    ``'_alert();return false;'`` is collected, and the collected values are
    printed one per line (one print per submitted link, even when nothing was
    collected).

    Raises:
        Whatever ``urllib2.urlopen`` raises when a request fails; nothing is
        caught here.
    """
    page_url = 'http://v.youku.com/v_vpfoldervideolist/page_1_id_55814819_f_5316052_o_1_p_9.html?__rt=1&__ro=vpfoldervideolist'

    for link in _fetch_links(page_url, 'a', 'charset', '5-1', 'href'):
        # NOTE(review): link is interpolated into the query string without
        # URL-escaping, so a link containing '&' or '#' would be cut short by
        # the receiving server -- confirm whether escaping is wanted here.
        parse_url = "http://www.flvcd.com/parse.php?flag=&format=&kw=%s" % (link)
        download_links = _fetch_links(
            parse_url, 'a', 'onclick', '_alert();return false;', 'href')
        # A single parenthesised argument prints the same text whether print
        # is a statement or a function.
        print('\n'.join(download_links))
    
# Run the scraper only when this file is executed as a script, not on import.
if __name__ == '__main__':
    source_url_get()
    
    
    


# You may also be interested in: (HTMLParser)