Python爬取猫眼电影案例

import re
import time
from urllib import parse
from urllib import request

import pymysql
 6 
 7 class MaoyanSpider(object):
 8     def __init__(self):
 9         self.baseurl = 'https://maoyan.com/board/4?offset='
10         self.headers = {'User-Agent':'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.119 Safari/537.36'}
11         # 爬取页数计数
12         self.page = 1
13         # 创建2个对象
14         self.db = pymysql.connect(
15             'localhost','root','123456','spider',
16             charset='utf8'
17         )
18         self.cursor = self.db.cursor()
19 
20 
21     # 获取页面
22     def get_page(self,url):
23         req = request.Request(url,headers=self.headers)
24         res = request.urlopen(req)
25         html = res.read().decode('utf-8')
26         # 直接调用解析函数
27         self.parse_page(html)
28 
29     # 解析页面
30     def parse_page(self,html):
31         # 正则解析
32         p = re.compile('
.*?title="(.*?)".*?class="star">(.*?)

.*?releasetime">(.*?)

',re.S) 33 r_list = p.findall(html) 34 # r_list : [('霸王别姬','张国荣','1993'),(),()] 35 self.write_page(r_list) 36 37 # 保存数据(存到mysql数据库) 38 def write_page(self,r_list): 39 ins = 'insert into film(name,star,time) \ 40 values(%s,%s,%s)' 41 for rt in r_list: 42 film_list = [ 43 rt[0].strip(), 44 rt[1].strip(), 45 rt[2].strip()[5:15] 46 ] 47 48 self.cursor.execute(ins,film_list) 49 # 提交到数据库执行 50 self.db.commit() 51 52 # 主函数 53 def main(self): 54 # 用range函数可获取某些查询参数的值 55 for offset in range(0,41,10): 56 url = self.baseurl + str(offset) 57 self.get_page(url) 58 print('第%d页爬取成功' % self.page) 59 self.page += 1 60 time.sleep(1) 61 # 等所有页面爬完后再关闭 62 self.cursor.close() 63 self.db.close() 64 65 if __name__ == '__main__': 66 spider = MaoyanSpider() 67 spider.main()

 

转载于:https://www.cnblogs.com/OmySql/p/10796199.html

你可能感兴趣的:(Python爬取猫眼电影案例)