"""Scrape the movies listed on a Douban "now playing" page and print them as JSON."""

from urllib import request
from html.parser import HTMLParser
import json


def _first_attr(attrlist, attrname):
    """Return the value of the first ``(name, value)`` pair named *attrname*.

    Returns ``None`` when no attribute with that name is present.
    """
    return next((value for name, value in attrlist if name == attrname), None)


class MovieParser(HTMLParser):
    """Collect movie records from ``<li>`` start tags.

    A tag is recorded when it is an ``<li>`` with a non-empty ``data-title``
    attribute and ``data-category="nowplaying"``.  Each record is a dict with
    the keys ``title``, ``score``, ``director`` and ``actors``, taken from the
    matching ``data-*`` attributes (``None`` when an attribute is absent).
    Records accumulate in ``self.movies`` in document order.
    """

    def __init__(self):
        super().__init__()
        # Movie dicts gathered so far, in the order the tags were encountered.
        self.movies = []

    def handle_starttag(self, tag, attrs):
        """Record (and print) a movie if this start tag describes one.

        Args:
            tag: Lower-cased tag name supplied by ``HTMLParser``.
            attrs: List of ``(name, value)`` attribute pairs for the tag.
        """
        if tag != 'li':
            return
        title = _first_attr(attrs, 'data-title')
        if not title or _first_attr(attrs, 'data-category') != 'nowplaying':
            return

        movie = {
            'title': title,
            'score': _first_attr(attrs, 'data-score'),
            'director': _first_attr(attrs, 'data-director'),
            'actors': _first_attr(attrs, 'data-actors'),
        }
        self.movies.append(movie)
        # Progress line emitted while parsing, one per movie found.
        print('%(title)s| %(score)s| %(director)s| %(actors)s' % movie)


def nowplaying(url):
    """Fetch *url* and return the list of movie dicts found in its HTML.

    The response body is decoded as UTF-8 and fed to a ``MovieParser``.

    Args:
        url: Address of the page to download.

    Returns:
        The parser's ``movies`` list (possibly empty).

    Raises:
        urllib.error.URLError: If the request fails.
        UnicodeDecodeError: If the body is not valid UTF-8.
    """
    req = request.Request(url)
    # A browser-like User-Agent is sent instead of urllib's default
    # (presumably the site rejects the default one -- not verified here).
    req.add_header('User-Agent', 'Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36')
    # The context manager guarantees the HTTP response is closed even if
    # reading raises.
    with request.urlopen(req) as response:
        body = response.read()

    parser = MovieParser()
    parser.feed(body.decode('utf-8'))
    return parser.movies


if __name__ == "__main__":
    url = "https://movie.douban.com/nowplaying/wuhan/"
    movies = nowplaying(url)
    print('%s' % json.dumps(movies, sort_keys=True, indent=4, separators=(',', ': ')))
对 html.parser 不了解的可以看一下以下官方文档的解释(只摘取了一小部分):
HTMLParser.handle_starttag(tag, attrs)
This method is called to handle the start tag of an element (e.g. <div id="main">).
- The tag argument is the name of the tag converted to lower case. The attrs argument is a list of (name, value) pairs containing the attributes found inside the tag’s <> brackets. The name will be translated to lower case, and quotes in the value have been removed, and character and entity references have been replaced.
- For instance, for the tag <A HREF="https://www.cwi.nl/">, this method would be called as handle_starttag('a', [('href', 'https://www.cwi.nl/')]).
- All entity references from html.entities are replaced in the attribute values.
As a basic example, below is a simple HTML parser that uses the HTMLParser
class to print out start tags, end tags, and data as they are encountered:
from html.parser import HTMLParser


class MyHTMLParser(HTMLParser):
    """Print every start tag, end tag and text run as it is encountered."""

    def handle_starttag(self, tag, attrs):
        """Report the (lower-cased) name of a start tag; attributes are ignored."""
        print("Encountered a start tag:", tag)

    def handle_endtag(self, tag):
        """Report the (lower-cased) name of an end tag."""
        print("Encountered an end tag :", tag)

    def handle_data(self, data):
        """Report a run of text found between tags."""
        print("Encountered some data :", data)


parser = MyHTMLParser()
# Two adjacent literals are concatenated into one document before parsing.
parser.feed('<html><head><title>Test</title></head>'
            '<body><h1>Parse me!</h1></body></html>')
The output will then be:
Encountered a start tag: html
Encountered a start tag: head
Encountered a start tag: title
Encountered some data : Test
Encountered an end tag : title
Encountered an end tag : head
Encountered a start tag: body
Encountered a start tag: h1
Encountered some data : Parse me!
Encountered an end tag : h1
Encountered an end tag : body
Encountered an end tag : html