Matching HTML with regular expressions has one big advantage: speed. The trade-off is that the patterns are hard to write, and they break whenever the page structure changes. Regular expressions are typically used to filter a response and pull out exactly the data we want.
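For example, a minimal sketch of extracting values with a regular expression (the HTML fragment and class names here are made up for illustration), which also shows why such patterns are fragile:
'''
import re

# Made-up fragment for illustration only.
html = ('<li class="item"><span class="price">128</span></li>'
        '<li class="item"><span class="price">256</span></li>')

# Fast to run, but tied to the exact markup: if the site renames
# class="price" to class="cost", this pattern silently matches nothing.
prices = re.findall(r'class="price">(\d+)</span>', html)
print(prices)  # ['128', '256']
'''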
Find the block of markup you need in the returned page. On the board page every movie sits in its own <dd> element: the rank lives in a board-index tag, the poster URL in a data-src attribute, and the score is split into an "integer" part (9.) and a "fraction" part (6). A simplified entry, with the other details elided, looks like this:
'''
<dd>
    <i class="board-index board-index-1">1</i>
    <a href="/films/..." title="...">
        <img data-src="..." alt="..." class="board-img" />
    </a>
    <p class="name"><a href="/films/..." title="...">...</a></p>
    <p class="star">主演：...</p>
    <p class="releasetime">上映时间：...</p>
    <p class="score"><i class="integer">9.</i><i class="fraction">6</i></p>
</dd>
'''
Analysing this structure gives the regular expression:
pattern = re.compile(
    r'<dd>.*?index-\d+">(\d+)</i>.*?data-src="(.*?)"'
    r'.*?name"><a.*?>(.*?)</a>.*?star">(.*?)</p>'
    r'.*?releasetime">(.*?)</p>.*?"integer">(.*?)</i>'
    r'.*?fraction">(\d)</i>',
    re.S)
The seven capture groups are, in order: rank, poster URL, title, cast, release date, and the integer and fraction halves of the score. re.S lets . match newlines, so one pattern can span the whole <dd> block.
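As a quick sanity check, here is a minimal, self-contained sketch; the sample_html block below is a made-up stand-in for one <dd> entry shaped like the markup above, showing how the groups come back from findall:
'''
import re

# Hypothetical <dd> block shaped like the board-page markup above.
sample_html = """
<dd>
    <i class="board-index board-index-1">1</i>
    <img data-src="https://example.com/poster.jpg" alt="..." class="board-img" />
    <p class="name"><a href="/films/1" title="...">Some Movie</a></p>
    <p class="star">主演：A, B, C</p>
    <p class="releasetime">上映时间：1993-01-01</p>
    <p class="score"><i class="integer">9.</i><i class="fraction">6</i></p>
</dd>
"""

pattern = re.compile(
    r'<dd>.*?index-\d+">(\d+)</i>.*?data-src="(.*?)"'
    r'.*?name"><a.*?>(.*?)</a>.*?star">(.*?)</p>'
    r'.*?releasetime">(.*?)</p>.*?"integer">(.*?)</i>'
    r'.*?fraction">(\d)</i>',
    re.S)

for rank, img, name, star, release, integer, fraction in pattern.findall(sample_html):
    print(rank, name, star.strip(), release, integer + fraction)
# -> 1 Some Movie 主演：A, B, C 上映时间：1993-01-01 9.6
'''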
Pages are fetched with the requests library (import requests). The request headers can be copied straight from your own browser: open the developer tools, look at a request to the site, and copy its headers (the User-Agent in particular).
# -*- coding:utf-8 -*-
import requests, re, json


class SpiderMaoYan(object):
    '''Spider for the Maoyan Top 100 movie board'''
    def __init__(self):
        self.start_url = "http://maoyan.com/board/4?offset={}"
        self.headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.102 Safari/537.36"
        }

    def get_url(self):
        # 1. Build the list of page URLs (10 pages, 10 movies per page)
        self.url_list = [self.start_url.format(i * 10) for i in range(10)]

    def send_request(self):
        # 2. Send a request for each page and fetch its HTML
        for url in self.url_list:
            response = requests.get(url=url, headers=self.headers)
            html = response.content.decode()
            # Extract and store the data from this page
            self.save_data(html)

    def save_data(self, html):
        # 3. Extract the fields from the page with a regular expression
        pattern = re.compile(
            r'<dd>.*?index-\d+">(\d+)</i>.*?data-src="(.*?)"'
            r'.*?name"><a.*?>(.*?)</a>.*?star">(.*?)</p>'
            r'.*?releasetime">(.*?)</p>.*?"integer">(.*?)</i>'
            r'.*?fraction">(\d)</i>',
            re.S)
        data_list = pattern.findall(html)
        for item in data_list:
            data_dict = dict(
                number=item[0],
                image_url=item[1],
                name=item[2],
                actor=item[3].strip(),
                releasetime=item[4],
                score=item[5] + item[6],
            )
            # Append one JSON object per line to the output file
            json_data = json.dumps(data_dict, ensure_ascii=False)
            with open('maoyan.json', 'a', encoding='utf8') as f:
                f.write(json_data + '\n')

    def run(self):
        # Entry point that drives the whole spider
        self.get_url()
        self.send_request()


if __name__ == '__main__':
    maoyan = SpiderMaoYan()
    maoyan.run()
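Because save_data appends one JSON object per line, maoyan.json ends up in JSON-Lines form rather than as a single JSON array. A minimal sketch for reading the results back (assuming the spider has already run and produced the file):
'''
import json

movies = []
with open('maoyan.json', encoding='utf8') as f:
    for line in f:
        line = line.strip()
        if line:  # skip any blank lines
            movies.append(json.loads(line))

# For example, print the first ten entries by their stored rank.
for movie in movies[:10]:
    print(movie['number'], movie['name'], movie['score'])
'''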