网易云陈奕迅所有专辑封面爬取

参考网页:
Python爬虫小白入门(六)爬取披头士乐队历年专辑封面-网易云音乐
selenium_python

项目内容:
这个项目用于抓取网易云音乐上陈奕迅所有专辑的封面图片,主要是在参考网页 1 的项目基础上进行整合,并加入了一点异步 IO(asyncio + aiohttp)来并发下载图片。

遇到问题:
1. 下面的示例用 `resp.text()` 异步获取 HTTP 响应的文本内容;如果要获取二进制文件(例如图片),则改用 `resp.read()`,它返回 bytes,详见 aiohttp 官方文档。

async with session.get('https://api.github.com/events') as resp:
    print(await resp.text())
    print(await resp.read())  # 获取二进制内容(bytes)时使用 read()

项目源码:

from selenium import webdriver
from bs4 import BeautifulSoup
import asyncio,aiohttp
import os


# Browser-like User-Agent string sent with every image request (see save_img).
USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36'
# Album-list page opened by main(); the query string carries the artist id and
# paging parameters (presumably limit=108 fits every album on one page -- verify
# if the artist releases more).
CHENYIXUN_ALL_ALBUM_URL = 'https://music.163.com/#/artist/album?id=2116&limit=108&offset=0'
# Directory (relative to the current working directory) where covers are saved.
PATH = 'CHENYIXUN'


def mkdir(path):
    """Create the directory *path* unless something already exists there.

    Args:
        path: Directory path; surrounding whitespace is stripped before use.

    Returns:
        True if this call created the directory, False if the path already
        existed.

    Raises:
        OSError: If the directory cannot be created for any reason other than
            already existing (for example, a missing parent directory).
    """
    path = path.strip()
    try:
        # EAFP: attempt the creation directly so there is no window between
        # an existence check and the actual mkdir call.
        os.mkdir(path)
    except FileExistsError:
        print('文件夹已经存在')
        return False
    # Report success only once the directory has really been created.
    print('创建文件夹成功')
    return True

async def save_img(url,img_name,sem):
    """Download one image and save it to the file *img_name*.

    Args:
        url: Address of the image to fetch.
        img_name: Path of the file the image bytes are written to.
        sem: Semaphore shared by all downloads; it caps how many requests
            run at the same time.

    Raises:
        RuntimeError: If the server answers with a status other than 200.
    """
    # Send a browser-like User-Agent so the request looks like a normal visit.
    headers = {'User-Agent': USER_AGENT}
    # ``async with`` is the supported way to hold a semaphore; the older
    # ``with (await sem)`` form was removed in Python 3.9.
    async with sem:
        async with aiohttp.ClientSession() as client:
            async with client.get(url, headers=headers) as resp:
                # Explicit check instead of ``assert`` so it still runs
                # when Python is started with -O.
                if resp.status != 200:
                    raise RuntimeError(
                        '下载{url}失败,HTTP状态码{status}'.format(
                            url=url, status=resp.status))
                img = await resp.read()
    # The body is fully in memory here, so the connection and the semaphore
    # slot are already released while the file is written.
    with open(img_name,'wb') as f:
        f.write(img)
    # Announce success only after the bytes are actually on disk.
    print('成功下载文件{img_name}并保存'.format(img_name=img_name))


def download_img(download_imgs_urls):
    """Download every image in *download_imgs_urls* concurrently.

    Args:
        download_imgs_urls: Mapping of target file name -> image URL.

    Returns:
        None. A failed download is reported on stdout and does not stop the
        remaining downloads.
    """
    # Nothing to do (e.g. every cover already exists); asyncio.wait() would
    # have raised ValueError on an empty task list.
    if not download_imgs_urls:
        return

    async def _download_all():
        # At most 5 coroutines download at once; tune this to the site's
        # tolerance -- without a limit the IP may get banned midway.
        # The semaphore is created inside the running loop so it is bound
        # to the loop that actually executes the downloads.
        sem = asyncio.Semaphore(5)
        names = list(download_imgs_urls)
        results = await asyncio.gather(
            *[save_img(download_imgs_urls[name], name, sem) for name in names],
            return_exceptions=True
        )
        # Surface failures instead of letting them vanish silently.
        for name, result in zip(names, results):
            if isinstance(result, BaseException):
                print('下载文件{img_name}失败: {error!r}'.format(
                    img_name=name, error=result))

    # Use a private loop so the process-wide default loop is left untouched
    # and this function can be called more than once.
    loop = asyncio.new_event_loop()
    try:
        loop.run_until_complete(_download_all())
    finally:
        loop.close()


def main():
    """Scrape the album page and download every cover not yet on disk.

    Steps: make sure the PATH directory exists and change into it, load the
    album page in Chrome, read the contents of the 'g_iframe' frame, build a
    file-name -> image-URL mapping from the album list, then hand the mapping
    to download_img().
    """
    # Create the target directory if needed and remember what is already in it.
    mkdir(PATH)
    existing_names = set(os.listdir(PATH))
    os.chdir(PATH)

    driver = webdriver.Chrome()
    try:
        driver.get(CHENYIXUN_ALL_ALBUM_URL)
        # The album list is rendered inside this iframe, not the outer page.
        driver.switch_to.frame('g_iframe')
        html = driver.page_source
    finally:
        # Always shut the browser down, even if loading the page failed;
        # otherwise a Chrome process is left running.
        driver.quit()

    # Characters removed from album titles before they are used in file names.
    strip_chars = str.maketrans('', '', '/:?\\"')

    # Build the download list from the <li> entries of the album module.
    download_imgs_urls = {}
    all_li = BeautifulSoup(html,'lxml').find(id='m-song-module').find_all('li')
    for li in all_li:
        # Drop the query string from the image URL (presumably it only
        # carries size parameters -- verify against the page).
        album_img_url = li.find('img')['src'].split('?')[0]

        album_name = li.find('div',class_='u-cover u-cover-alb3')['title']
        # Text of the first <span> in the entry (presumably the release date).
        album_data = li.find('span').get_text()

        img_name = '{album_data}-{album_name}.jpg'.format(
            album_data=album_data,
            album_name=album_name.translate(strip_chars))

        if img_name in existing_names:
            print('图片已经存在,不再下载')
            continue

        download_imgs_urls[img_name] = album_img_url
        # Record the name so a repeated entry on the page counts as existing.
        existing_names.add(img_name)

    # Finally, download everything asynchronously.
    download_img(download_imgs_urls)


# Run the scraper only when this file is executed as a script, not on import.
if __name__ == '__main__':
    main()

你可能感兴趣的:(网易云陈奕迅所有专辑封面爬取)