大家可以打开堆糖网(无需登录),搜索“蜜桃猫”,就会看到许多可爱的蜜桃猫和他的小对象。我们这次的任务就是爬取蜜桃猫的日常生活图片。下面是爬取的代码:我们先提取每张图片的链接,然后逐一下载。由于搜索结果是通过Ajax动态加载的,接口返回的是JSON数据,所以需要我们直接请求该接口并解析JSON来获取图片链接。然后用正则表达式从图片链接中提取一段字符,作为每张图片自己的文件名。最后保存到本地。
import json
import os
import re

import requests
from requests.exceptions import MissingSchema
# NOTE(review): module-level empty list. get_page() and save_image() each bind
# their own local `url`, and nothing in the visible code ever fills this list,
# so it never holds a real image URL.
url = []
def get_page(kw="蜜桃猫", start=0, limit=100, timeout=10):
    """Fetch one page of Duitang search results as parsed JSON.

    Args:
        kw: Search keyword; it is percent-encoded by ``requests``.
        start: Offset of the first result to return.
        limit: Maximum number of results to request.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The decoded JSON document, or ``None`` when the request fails, the
        server does not answer with HTTP 200, or the body is not valid JSON.
    """
    api = 'https://www.duitang.com/napi/blog/list/by_search/'
    # Insertion order is kept, so the defaults produce the same query string
    # as the previously hard-coded URL (kw, type, start, limit).
    params = {'kw': kw, 'type': 'feed', 'start': start, 'limit': limit}
    try:
        resp = requests.get(api, params=params, timeout=timeout)
    except requests.RequestException:
        # Covers connection errors as well as timeouts and malformed URLs.
        return None
    if resp.status_code != 200:
        return None
    try:
        return resp.json()
    except ValueError:
        # The body was not JSON (e.g. an HTML error or anti-bot page).
        return None
def get_images(json):
    """Download every image referenced by a search-result document.

    Walks ``json["data"]["object_list"]`` and passes each entry's
    ``photo["path"]`` to ``save_image``. Entries without a photo or path are
    skipped instead of raising.

    Args:
        json: The decoded search response (a dict), or ``None`` when the
            page could not be fetched. The parameter name shadows the
            ``json`` module inside this function only; it is kept so that
            existing keyword callers continue to work.

    Returns:
        None.
    """
    if not json:
        return
    data = json.get('data')
    if not data:
        return
    # `object_list` may be absent or null; treat both as "no results".
    for item in data.get('object_list') or []:
        photo = (item or {}).get("photo") or {}
        image_url = photo.get("path")
        if image_url:
            save_image(image_url)
def save_image(url, save_dir="d:/py/duitangmitaomao/p/", timeout=10):
    """Download one image and write it into ``save_dir``.

    The file name is derived from the last path segment of ``url``. When that
    segment contains an underscore, only the text after the first underscore
    is kept, so
    "https://b-ssl.duitang.com/uploads/item/201903/13/20190313105834_vtvmt.jpeg"
    is stored as "vtvmt.jpeg". Segments without an underscore are used whole.

    Args:
        url: Absolute URL of the image. Empty or non-string values are
            ignored.
        save_dir: Directory that receives the file; created if missing.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The path of the written file, or ``None`` when nothing was saved
        (invalid URL, no usable file name, or a failed download).
    """
    if not url or not isinstance(url, str):
        return None

    # Drop the query string and fragment, then keep only the last path
    # segment so the name can never contain a directory separator.
    bare = url.split("?", 1)[0].split("#", 1)[0].rstrip("/")
    basename = bare.rsplit("/", 1)[-1]
    _, sep, tail = basename.partition("_")
    name = tail if sep and tail else basename
    if not name:
        return None

    try:
        response = requests.get(url, timeout=timeout)
        # Do not store an HTML error page under an image file name.
        response.raise_for_status()
    except requests.RequestException as exc:
        # One broken image should not abort the whole crawl.
        print("skip {}: {}".format(url, exc))
        return None

    os.makedirs(save_dir, exist_ok=True)
    path = os.path.join(save_dir, name)
    with open(path, "wb") as f:
        f.write(response.content)
    return path
def main():
    """Fetch the search results and download every image they reference."""
    page = get_page()
    if page is None:
        # get_page() signals any failure by returning None.
        print("failed to fetch the search results; nothing to download")
        return
    # get_images() already saves each image. The former extra call that
    # passed the module-level empty list to save_image() always raised
    # MissingSchema, so it is removed.
    get_images(page)
# Start the crawl only when run as a script, so importing this module does
# not trigger network requests or file writes.
if __name__ == "__main__":
    main()