爬取豆瓣网正在上映电影信息(HTMLParser实现)

from urllib import request
from html.parser import HTMLParser
import json
class MovieParser(HTMLParser):
    """Collect the now-playing movies described by ``<li>`` tags in a page.

    Every ``<li>`` start tag that carries a non-empty ``data-title`` attribute
    and ``data-category="nowplaying"`` produces one dict in ``self.movies``
    with the keys ``title``, ``score``, ``director`` and ``actors``.  A field
    whose attribute is absent is stored as ``None``.  Each record is also
    printed to stdout as soon as it is found.
    """

    # (record key, tag attribute it is read from), in the order the keys are
    # inserted into each movie dict.
    _FIELDS = (
        ('title', 'data-title'),
        ('score', 'data-score'),
        ('director', 'data-director'),
        ('actors', 'data-actors'),
    )

    def __init__(self):
        super().__init__()
        # One dict per matching <li>, in the order the parser encounters them.
        self.movies = []

    def handle_starttag(self, tag, attrs):
        """Record *tag* when it is a now-playing movie entry.

        Args:
            tag: Lower-cased tag name supplied by ``HTMLParser``.
            attrs: List of ``(name, value)`` pairs found in the tag.
        """
        if tag != 'li':
            return

        # When an attribute is repeated inside one tag, the first occurrence
        # is the one that counts; setdefault never overwrites an existing key.
        first_seen = {}
        for name, value in attrs:
            first_seen.setdefault(name, value)

        if not first_seen.get('data-title'):
            return
        if first_seen.get('data-category') != 'nowplaying':
            return

        record = {key: first_seen.get(source) for key, source in self._FIELDS}
        self.movies.append(record)
        print('%(title)s| %(score)s| %(director)s| %(actors)s' % record)



def nowplaying(url):
    """Download *url* and return the now-playing movies found in its HTML.

    The page is requested with a desktop-browser ``User-Agent`` header, its
    body is decoded as UTF-8 and fed through ``MovieParser``.

    Args:
        url: Address of the listing page to download.

    Returns:
        The ``movies`` list built by the parser: one dict per movie with the
        keys ``title``, ``score``, ``director`` and ``actors``.

    Raises:
        urllib.error.URLError: Propagated from ``urlopen`` when the request
            fails.
        UnicodeDecodeError: If the response body is not valid UTF-8.
    """
    req = request.Request(url)
    req.add_header('User-Agent',
                   'Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36')
    # The response wraps an open socket; the with-block closes it as soon as
    # the body has been read instead of leaving that to garbage collection.
    with request.urlopen(req) as response:
        body = response.read()
    parser = MovieParser()
    parser.feed(body.decode('utf-8'))
    return parser.movies


if __name__ == "__main__":
    # Scrape the Wuhan now-playing listing and dump the result as JSON.
    url = "https://movie.douban.com/nowplaying/wuhan/"
    movies = nowplaying(url)

    # ensure_ascii=False keeps non-ASCII text (Chinese titles, names) readable
    # in the dump instead of turning every character into a \uXXXX escape.
    print(json.dumps(movies, ensure_ascii=False, sort_keys=True, indent=4,
                     separators=(',', ': ')))
 
  
 
  
 
  
html.parser不了解的可以看一下以下官方文档的解释(只取了一点)
 
  
HTMLParser.handle_starttag(tag, attrs)
This method is called to handle the start of a tag (e.g. <div id="main">).
The tag argument is the name of the tag converted to lower case. The attrs argument is a list of (name, value) pairs containing the attributes found inside the tag’s <> brackets. The name will be translated to lower case, and quotes in the value have been removed, and character and entity references have been replaced.
For instance, for the tag <A HREF="https://www.cwi.nl/">, this method would be called as handle_starttag('a', [('href', 'https://www.cwi.nl/')]).
All entity references from html.entities are replaced in the attribute values.

As a basic example, below is a simple HTML parser that uses the HTMLParser class to print out start tags, end tags, and data as they are encountered:

from html.parser import HTMLParser

class MyHTMLParser(HTMLParser):
    """Echo every start tag, end tag and text chunk to stdout while parsing."""

    def _report(self, label, value):
        # Single place that emits an event line: the label, a space, the value.
        print(label, value)

    def handle_starttag(self, tag, attrs):
        """Announce an opening tag; its attributes are not shown."""
        self._report("Encountered a start tag:", tag)

    def handle_endtag(self, tag):
        """Announce a closing tag."""
        self._report("Encountered an end tag :", tag)

    def handle_data(self, data):
        """Announce a run of text found between tags."""
        self._report("Encountered some data  :", data)

parser = MyHTMLParser()
# The two adjacent string literals are concatenated into one HTML document.
parser.feed('<html><head><title>Test</title></head>'
            '<body><h1>Parse me!</h1></body></html>')

The output will then be:

Encountered a start tag: html
Encountered a start tag: head
Encountered a start tag: title
Encountered some data  : Test
Encountered an end tag : title
Encountered an end tag : head
Encountered a start tag: body
Encountered a start tag: h1
Encountered some data  : Parse me!
Encountered an end tag : h1
Encountered an end tag : body
Encountered an end tag : html

你可能感兴趣的:(python,爬虫)