"""Extract movies tagged ``data-category="nowplaying"`` from an HTML page.

Run as a script, the module fetches a page, collects the movies described by
``<li data-...>`` start tags, and prints them as pretty-printed JSON.
"""
from urllib import request
from html.parser import HTMLParser
import json


class MovieParser(HTMLParser):
    """Collect movie records from ``<li>`` start tags carrying ``data-*`` attributes.

    After feeding HTML, ``self.movies`` holds one dict per matching tag with
    the keys ``title``, ``score``, ``director`` and ``actors``. A key whose
    attribute is absent from the tag maps to ``None``.
    """

    def __init__(self):
        super().__init__()
        self.movies = []

    def handle_starttag(self, tag, attrs):
        """Record a movie when *tag* is an ``<li>`` marked as now playing.

        Args:
            tag: Lower-cased tag name supplied by ``HTMLParser``.
            attrs: List of ``(name, value)`` pairs found in the tag.
        """
        if tag != 'li':
            return

        # Build the lookup once per tag. ``setdefault`` keeps the FIRST
        # occurrence of a repeated attribute name (even when its value is
        # None), which matches a left-to-right scan of the attribute list.
        found = {}
        for name, value in attrs:
            found.setdefault(name, value)

        # A movie entry needs a non-empty title and the "nowplaying" category.
        if not found.get('data-title'):
            return
        if found.get('data-category') != 'nowplaying':
            return

        movie = {
            'title': found.get('data-title'),
            'score': found.get('data-score'),
            'director': found.get('data-director'),
            'actors': found.get('data-actors'),
        }
        self.movies.append(movie)
        print('%(title)s| %(score)s| %(director)s| %(actors)s' % movie)


def nowplaying(url):
    """Fetch *url* and return the now-playing movies found in its HTML.

    Args:
        url: Address of the page to download.

    Returns:
        The list of movie dicts collected by ``MovieParser``.

    The response body is decoded as UTF-8 before parsing.
    """
    req = request.Request(url)
    req.add_header(
        'User-Agent',
        'Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 '
        '(KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36',
    )
    # Close the response as soon as the body is read instead of leaking the
    # underlying connection until garbage collection.
    with request.urlopen(req) as response:
        body = response.read()

    parser = MovieParser()
    parser.feed(body.decode('utf-8'))
    return parser.movies


if __name__ == "__main__":
    # NOTE(review): the target URL is empty in this file, so Request("")
    # raises ValueError until a real address is filled in.
    url = ""
    movies = nowplaying(url)
    print(json.dumps(movies, sort_keys=True, indent=4, separators=(',', ': ')))
The `tag` argument is the name of the tag converted to lower case. The
`attrs` argument is a list of
`(name, value)`
pairs containing the attributes found inside the tag’s
`<>` brackets. The
`name` will be translated to lower case, and quotes in the
`value` have been removed, and character and entity references have been replaced.
For instance, for the tag
`<A HREF="https://www.cwi.nl/">`, this method would be called as
`handle_starttag('a', [('href', 'https://www.cwi.nl/')])`.
All entity references from `html.entities`
are replaced in the attribute values.
As a basic example, below is a simple HTML parser that uses the HTMLParser
class to print out start tags, end tags, and data as they are encountered:
from html.parser import HTMLParser


class MyHTMLParser(HTMLParser):
    """Print every start tag, end tag, and text run as the parser meets it."""

    def handle_starttag(self, tag, attrs):
        """Report the name of an opening tag; its attributes are ignored."""
        print("Encountered a start tag:", tag)

    def handle_endtag(self, tag):
        """Report the name of a closing tag."""
        print("Encountered an end tag :", tag)

    def handle_data(self, data):
        """Report a run of text found between tags."""
        print("Encountered some data :", data)


parser = MyHTMLParser()
# The markup inside the literals was lost in extraction; it is reconstructed
# here from the tag/data sequence shown in the expected output listing.
parser.feed('<html><head><title>Test</title></head>'
            '<body><h1>Parse me!</h1></body></html>')
The output will then be:
Encountered a start tag: html
Encountered a start tag: head
Encountered a start tag: title
Encountered some data : Test
Encountered an end tag : title
Encountered an end tag : head
Encountered a start tag: body
Encountered a start tag: h1
Encountered some data : Parse me!
Encountered an end tag : h1
Encountered an end tag : body
Encountered an end tag : html