初学爬虫,简单爬取必应壁纸

刚学了一点爬虫,这里用 requests 和 lxml 简单爬取必应壁纸站点(https://bing.ioliu.cn)上的壁纸,共 106 页。

一、思路分析

用到的库:

import os
import time
from lxml import etree
import requests

1.获取壁纸内容

# Listing-page URL template; the placeholder receives the page number.
base_url = 'https://bing.ioliu.cn/?p={}'

# Request headers sent with every request: a desktop Chrome User-Agent.
headers = {
    'User-Agent': (
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/76.0.3809.100 Safari/537.36'
    ),
}


def download(url):
    """Fetch one listing page and yield every wallpaper found on it.

    Args:
        url: Address of a listing page (``base_url`` with the page number
            filled in).

    Yields:
        dict: ``{'title': str, 'img': bytes}``. The title is the ``<h3>``
        text up to the first ``(``, stripped of surrounding whitespace;
        ``img`` is the body of the image response.

    Raises:
        requests.RequestException: if the listing page itself cannot be
            fetched (network error, timeout or HTTP error status).
    """
    page = requests.get(url, headers=headers, timeout=30)
    # Fail loudly instead of parsing an error page and silently yielding nothing.
    page.raise_for_status()
    tree = etree.HTML(page.text)
    img_urls = tree.xpath('//div[@class="container"]//img/@src')
    titles = tree.xpath('//div[@class="container"]//div[@class="description"]/h3/text()')

    # zip() pairs each image with its title and stops at the shorter list, so
    # a page with a missing title no longer raises IndexError.
    for img_url, raw_title in zip(img_urls, titles):
        title = raw_title.split('(')[0].strip()
        try:
            response = requests.get(img_url, headers=headers, timeout=30)
            response.raise_for_status()
        except requests.RequestException as exc:
            # One broken image should not abort the rest of the page, and an
            # HTTP error body must not be handed on as image data.
            print('%s 下载失败,已跳过: %s' % (title, exc))
            continue
        yield {'title': title, 'img': response.content}

    # Pause once the page is exhausted so consecutive pages are not requested
    # back to back.
    time.sleep(2)

2.保存图片

def save_img(item):
    """Write one downloaded wallpaper to ``./必应壁纸/<title>.jpg``.

    Args:
        item: Mapping with the keys ``'title'`` (str, used to build the file
            name) and ``'img'`` (bytes, written to the file unchanged).

    Raises:
        OSError: if the directory cannot be created or the file cannot be
            written.
    """
    # Replace the comma (as before) plus path separators and the characters
    # Windows forbids in file names, so that a title can never escape the
    # target directory or make open() fail.
    unsafe = str.maketrans({char: '_' for char in ',\\/:*?"<>|'})
    img_name = item['title'].translate(unsafe) + '.jpg'

    # Make the function usable on its own, not only after main() has created
    # the directory.
    os.makedirs('./必应壁纸', exist_ok=True)

    # The with-statement closes the file; no explicit close() is needed.
    with open(os.path.join('./必应壁纸', img_name), 'wb') as fp:
        fp.write(item['img'])

    print(item['title'] + '储存完毕......')
    print('=' * 50)

3.主函数

def main(total_pages=106):
    """Download the wallpaper pages one by one, asking before each next page.

    Args:
        total_pages: Number of listing pages available; pages are requested
            from 1 up to and including this number. Defaults to 106.

    After each page except the last, the user is prompted; entering ``1``
    continues with the next page, any other input stops the download.
    """
    print("下载开始......")
    print('*' * 50)

    # Create the output directory once, up front, instead of re-checking it
    # on every page.
    os.makedirs('./必应壁纸', exist_ok=True)

    for page in range(1, total_pages + 1):
        for item in download(base_url.format(page)):
            save_img(item)

        # There is no "next page" after the last one, so do not ask.
        if page == total_pages:
            break
        yes_no = input('是否继续下载下一页?是则输入1,否则按任意键退出!!!')
        if yes_no != '1':
            break

    print('下载结束......')
    print('*' * 50)
    time.sleep(2)

二、完整代码

import os
import time
from lxml import etree
import requests


# Listing-page URL template; the placeholder receives the page number.
base_url = 'https://bing.ioliu.cn/?p={}'

# Request headers sent with every request: a desktop Chrome User-Agent.
headers = {
    'User-Agent': (
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/76.0.3809.100 Safari/537.36'
    ),
}


def download(url):
    """Fetch one listing page and yield every wallpaper found on it.

    Args:
        url: Address of a listing page (``base_url`` with the page number
            filled in).

    Yields:
        dict: ``{'title': str, 'img': bytes}``. The title is the ``<h3>``
        text up to the first ``(``, stripped of surrounding whitespace;
        ``img`` is the body of the image response.

    Raises:
        requests.RequestException: if the listing page itself cannot be
            fetched (network error, timeout or HTTP error status).
    """
    page = requests.get(url, headers=headers, timeout=30)
    # Fail loudly instead of parsing an error page and silently yielding nothing.
    page.raise_for_status()
    tree = etree.HTML(page.text)
    img_urls = tree.xpath('//div[@class="container"]//img/@src')
    titles = tree.xpath('//div[@class="container"]//div[@class="description"]/h3/text()')

    # zip() pairs each image with its title and stops at the shorter list, so
    # a page with a missing title no longer raises IndexError.
    for img_url, raw_title in zip(img_urls, titles):
        title = raw_title.split('(')[0].strip()
        try:
            response = requests.get(img_url, headers=headers, timeout=30)
            response.raise_for_status()
        except requests.RequestException as exc:
            # One broken image should not abort the rest of the page, and an
            # HTTP error body must not be handed on as image data.
            print('%s 下载失败,已跳过: %s' % (title, exc))
            continue
        yield {'title': title, 'img': response.content}

    # Pause once the page is exhausted so consecutive pages are not requested
    # back to back.
    time.sleep(2)


def save_img(item):
    """Write one downloaded wallpaper to ``./必应壁纸/<title>.jpg``.

    Args:
        item: Mapping with the keys ``'title'`` (str, used to build the file
            name) and ``'img'`` (bytes, written to the file unchanged).

    Raises:
        OSError: if the directory cannot be created or the file cannot be
            written.
    """
    # Replace the comma (as before) plus path separators and the characters
    # Windows forbids in file names, so that a title can never escape the
    # target directory or make open() fail.
    unsafe = str.maketrans({char: '_' for char in ',\\/:*?"<>|'})
    img_name = item['title'].translate(unsafe) + '.jpg'

    # Make the function usable on its own, not only after main() has created
    # the directory.
    os.makedirs('./必应壁纸', exist_ok=True)

    # The with-statement closes the file; no explicit close() is needed.
    with open(os.path.join('./必应壁纸', img_name), 'wb') as fp:
        fp.write(item['img'])

    print(item['title'] + '储存完毕......')
    print('=' * 50)


def main(total_pages=106):
    """Download the wallpaper pages one by one, asking before each next page.

    Args:
        total_pages: Number of listing pages available; pages are requested
            from 1 up to and including this number. Defaults to 106.

    After each page except the last, the user is prompted; entering ``1``
    continues with the next page, any other input stops the download.
    """
    print("下载开始......")
    print('*' * 50)

    # Create the output directory once, up front, instead of re-checking it
    # on every page.
    os.makedirs('./必应壁纸', exist_ok=True)

    for page in range(1, total_pages + 1):
        for item in download(base_url.format(page)):
            save_img(item)

        # There is no "next page" after the last one, so do not ask.
        if page == total_pages:
            break
        yes_no = input('是否继续下载下一页?是则输入1,否则按任意键退出!!!')
        if yes_no != '1':
            break

    print('下载结束......')
    print('*' * 50)
    time.sleep(2)


# Start the downloader only when this file is executed as a script, not when
# it is imported as a module.
if __name__ == '__main__':
    main()

初次学习,如有改进之处,还请各位大佬指正!

你可能感兴趣的:(python)