Scraping ZOL Wallpapers (basic version, to be updated over time)


This is only a first pass at the crawler; there is still plenty of room for optimization, and I will keep updating it.

The parsing library used here is pyquery; you could also use XPath, BeautifulSoup, or another parser (see the short sketch below). Feedback and suggestions are very welcome.
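As a rough illustration of the BeautifulSoup alternative mentioned above, here is a minimal sketch that extracts the same group title and image links as the pyquery script below. The selectors (#titleName, #showImg a) are the ones used in that script; parse_with_bs4 is just an illustrative name, not part of the original code.

import requests
from bs4 import BeautifulSoup

def parse_with_bs4(url):
    # Illustrative helper, assuming the same page structure the pyquery version relies on
    response = requests.get(url)
    soup = BeautifulSoup(response.text, 'html.parser')
    title = soup.select_one('#titleName').get_text()             # wallpaper-group title
    hrefs = [a.get('href') for a in soup.select('#showImg a')]   # per-image links
    return title, hrefs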

Without further ado, here is the full script:

#!/usr/bin/env python
# -*- coding: utf-8 -*-

import requests
from pyquery import PyQuery as pq
import os

base_url = 'http://desk.zol.com.cn'

"""
函数功能:获取该组图片所有的第一层url,返回图片组标题
"""
def get_url(id, path):
    #url拼接
    url = base_url + '/bizhi/' + id + '_2.html'
    #html解析,分离url
    response = requests.get(url)
    doc = pq(response.text)
    titlename = doc('#titleName').text()
    response = doc('#showImg a').items()
    if not os.path.exists(path+'/'+titlename + '/'):
        os.makedirs(path+'/'+titlename + '/')
    #将url保存到txt文件中
    for item in response:
        with open(path+'/'+titlename + '/'+titlename+'.txt', 'a') as f:
            f.write(item.attr('href') + '\n')

    return titlename

"""
函数功能:打开图片第一层url,获取第二层url,并返回
"""
def get_one_page(url):
    url = base_url + url

    response = requests.get(url)
    doc = pq(response.text)('#tagfbl a').attr('href')

    # result = doc('#tagfbl a').attr('href')
    return doc

"""
函数功能:获取图片的下载url
"""
def get_down_url(url):
    down_url = base_url + url

    response = requests.get(down_url)
    #选取第一组分辨率的图片
    doc = pq(response.text)('img').attr('src')

    # result = doc('img').attr('src')

    return doc

"""
函数功能:保存图片
"""
def save_one_page(url, filename):
    response = requests.get(url)

    with open(filename, 'wb') as f:
        f.write(response.content)

if __name__ == '__main__':

    # ids holds the wallpaper-group ids. The id is my own convention: for
    # http://desk.zol.com.cn/bizhi/7299_90272_2.html the id is 7299_90272.
    ids = ['5245_64923', '466_4194', '4033_49930', '208_2515', '774_8742', '4732_58907', '5220_64609',
           '1857_23392', '5022_62256', '5202_64388']

    for id in ids:
        path = './ZOLwallpaper/'
        titlename = get_url(id, path)
        pic_path = path + '/' + titlename + '/'
        # Make sure the output directory exists
        if not os.path.exists(pic_path):
            os.makedirs(pic_path)

        f = open(path + '/' + titlename + '/' + titlename + '.txt')
        # i is a counter used to name the downloaded files
        i = 0
        for line in f.readlines():
            file_name = path + '/' + titlename + '/' + str(i) + '.jpg'
            # strip the trailing newline before building the request URL
            save_one_page(get_down_url(get_one_page(line.strip())), file_name)
            print('Downloading:', file_name)
            i += 1
        f.close()
        print(id, ': done')
    print('All images downloaded')
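Since the ids follow the convention described in the comment above (the 7299_90272 part of http://desk.zol.com.cn/bizhi/7299_90272_2.html), one possible convenience is deriving them from page URLs instead of typing them by hand. A minimal sketch, where extract_id is a hypothetical helper and the regular expression assumes the /bizhi/ URL pattern shown in that example:

import re

def extract_id(page_url):
    # Hypothetical helper: pull '7299_90272' out of
    # 'http://desk.zol.com.cn/bizhi/7299_90272_2.html'
    match = re.search(r'/bizhi/(\d+_\d+)_\d+\.html', page_url)
    return match.group(1) if match else None

print(extract_id('http://desk.zol.com.cn/bizhi/7299_90272_2.html'))  # 7299_90272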





