# 爬取校花网校花图片

import requests
import re
import os
# Root directory under which the per-page picture folders are created
picture_path = '/Downloads/project1/图片/'

def get_page(url, timeout=10):
    """Request ``url`` and return the response body as text.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server. Without a timeout a
            stalled connection would block the crawl indefinitely.

    Returns:
        The decoded response text when the status code is 200,
        otherwise ``None``.

    Raises:
        requests.RequestException: If the request itself fails
            (connection error, timeout, invalid URL, ...).
    """
    response = requests.get(url, timeout=timeout)
    if response.status_code == 200:
        return response.text
    # Any other status is reported to the caller as "no page".
    return None

def get_path(index_page):
    """Create (if needed) and return the download folder for an index page.

    The folder name is the part of the page's ``<title>`` text that
    precedes the first ``|``; the folder is created under the
    module-level ``picture_path``.

    Args:
        index_page: HTML text of the index page.

    Returns:
        Path of the folder pictures from this page should be saved into.
        When the page has no ``<title>``, this is ``picture_path`` itself.
    """
    # NOTE(review): the pattern used to be r'(.*?)', which can only match
    # empty strings. The surrounding <title> tags look like they were
    # stripped when the code was published -- confirm against the live
    # page markup.
    match = re.search(r'<title>(.*?)</title>', index_page)
    title = match.group(1) if match else ''
    file_path = title.split('|')[0].strip()
    new_path = os.path.join(picture_path, file_path)
    # makedirs also creates missing parents and tolerates an existing
    # folder, avoiding the exists()/mkdir() race.
    os.makedirs(new_path, exist_ok=True)
    return new_path

def parse_index(index_page):
    """Yield the absolute URL of every site-relative ``/d...`` link.

    Every ``href="..."`` value in ``index_page`` is inspected; values
    that start with ``/d`` are prefixed with the site root and yielded,
    all others are skipped.
    """
    href_pattern = re.compile('href="(.*?)"', re.S)
    for match in href_pattern.finditer(index_page):
        link = match.group(1)
        if not link.startswith('/d'):
            continue
        yield 'http://www.xiaohuar.com' + link

def get_picture(new_path, detail_page, timeout=10):
    """Download one picture and store it inside ``new_path``.

    The file name is the last ``/``-separated segment of ``detail_page``.
    Failures are printed rather than raised so that one bad picture does
    not stop the crawl.

    Args:
        new_path: Existing folder to write the picture into.
        detail_page: URL of the picture to download.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        ``True`` when the picture was written to disk, ``False`` when the
        request failed, the status code was not 200, or the file could
        not be written.
    """
    try:
        response = requests.get(detail_page, timeout=timeout)
    except requests.RequestException as e:
        print(e)
        return False

    if response.status_code != 200:
        # Non-200 responses are skipped without a message.
        return False

    name = detail_page.split('/')[-1]
    filepath = os.path.join(new_path, name)
    try:
        with open(filepath, 'wb') as f:
            f.write(response.content)
    except OSError as e:
        print(e)
        return False

    print('%s 下载成功' % detail_page)
    return True


def main():
    """Crawl the index pages and download every picture linked from them.

    For each page number in ``range(1, 99)`` the index page is fetched,
    a folder is prepared for it, and every link produced by
    ``parse_index`` is passed to ``get_picture``. Pages that cannot be
    fetched are skipped.
    """
    # URL template of the index pages to crawl
    base_url = 'http://www.xiaohuar.com/s-1-19{page_num}.html#p1'
    # Incremented before every download, so this counts attempts
    count = 0
    for i in range(1, 99):
        url = base_url.format(page_num=i)
        try:
            index_page = get_page(url)
        except requests.RequestException as e:
            # One unreachable page should not abort the whole crawl.
            print(e)
            continue
        if index_page is None:
            # get_page returns None for non-200 responses; passing that on
            # would crash the regex calls in get_path/parse_index.
            continue
        new_path = get_path(index_page)
        for detail_url in parse_index(index_page):
            count += 1
            get_picture(new_path, detail_url)

    print('共爬取到%s张相关图片' % count)


# Start the crawl only when this file is executed as a script.
if __name__ == '__main__':
    main()

# 你可能感兴趣的:(爬取校花网校花图片)