# 刚刚学会一点爬虫,简单爬取必应壁纸,共 106 页。
import os
import time
from lxml import etree
import requests
# Listing-page URL template; the ``{}`` placeholder receives the page number.
base_url = 'https://bing.ioliu.cn/?p={}'

# HTTP headers passed to every ``requests.get`` call in this script.
headers = {
    'User-Agent': (
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/76.0.3809.100 Safari/537.36'
    ),
}
def download(url, timeout=10):
    """Yield the wallpapers found on one listing page.

    Fetches ``url``, extracts every image ``src`` and every description
    heading inside ``div.container``, downloads each image and yields it
    together with its cleaned-up title.

    Args:
        url: Address of the listing page to scrape.
        timeout: Seconds to wait for each HTTP request before giving up.
            Without a timeout ``requests`` may block indefinitely.

    Yields:
        dict: ``{'title': <str>, 'img': <bytes>}`` for each image that was
        downloaded successfully.

    Raises:
        requests.exceptions.RequestException: If the listing page cannot
            be fetched (network error, timeout, or HTTP error status).
    """
    page = requests.get(url, headers=headers, timeout=timeout)
    # Fail loudly instead of parsing an error page as if it were a listing.
    page.raise_for_status()

    tree = etree.HTML(page.text)
    img_urls = tree.xpath('//div[@class="container"]//img/@src')
    titles = tree.xpath('//div[@class="container"]//div[@class="description"]/h3/text()')

    # zip() pairs images with titles positionally and stops at the shorter
    # list, so a count mismatch no longer raises IndexError mid-page.
    for img_url, raw_title in zip(img_urls, titles):
        # Keep only the text before the first '(' and trim whitespace.
        title = raw_title.split('(')[0].strip()

        response = requests.get(img_url, headers=headers, timeout=timeout)
        if response.ok:
            yield {'title': title, 'img': response.content}
        else:
            # Skip a broken image rather than saving an error body as .jpg.
            print(title + '下载失败,已跳过......')

        # NOTE(review): the original indentation was lost; the delay is
        # assumed to apply after every image - confirm.
        time.sleep(2)
def save_img(item):
    """Write one downloaded wallpaper to ``./必应壁纸/<title>.jpg``.

    Args:
        item: Mapping with a ``'title'`` string (used to build the file
            name) and an ``'img'`` bytes payload (the file contents).

    The target directory is created if it does not exist yet, so the
    function also works when called on its own.
    """
    # Besides the full-width comma the original replaced, also neutralise
    # path separators and the characters Windows forbids in file names;
    # otherwise a title such as "A/B" makes open() fail.
    unsafe = str.maketrans({ch: '_' for ch in ',\\/:*?"<>|'})
    img_name = item['title'].translate(unsafe) + '.jpg'

    os.makedirs('./必应壁纸', exist_ok=True)
    # The with-statement closes the file; no explicit close() is needed.
    with open('./必应壁纸/%s' % img_name, 'wb') as fp:
        fp.write(item['img'])

    print(item['title'] + '储存完毕......')
    print('=' * 50)
def main():
    """Interactively download wallpaper pages 1 through 106.

    After each page has been saved the user is prompted; entering ``1``
    moves on to the next page, anything else ends the run.
    """
    save_dir = './必应壁纸'
    prompt = '是否继续下载下一页?是则输入1,否则按任意键退出!!!'

    print("下载开始......")
    print('*' * 50)

    for page in range(1, 107):
        # Make sure the output directory is present before saving.
        if not os.path.exists(save_dir):
            os.mkdir(save_dir)

        for item in download(base_url.format(page)):
            save_img(item)

        # Only the exact answer '1' continues with the next page.
        if input(prompt) != '1':
            break

    print('下载结束......')
    print('*' * 50)
    # Brief pause after the closing banner before returning.
    time.sleep(2)
import os
import time
from lxml import etree
import requests
# Listing-page URL template; the ``{}`` placeholder receives the page number.
base_url = 'https://bing.ioliu.cn/?p={}'

# HTTP headers passed to every ``requests.get`` call in this script.
headers = {
    'User-Agent': (
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/76.0.3809.100 Safari/537.36'
    ),
}
def download(url, timeout=10):
    """Yield the wallpapers found on one listing page.

    Fetches ``url``, extracts every image ``src`` and every description
    heading inside ``div.container``, downloads each image and yields it
    together with its cleaned-up title.

    Args:
        url: Address of the listing page to scrape.
        timeout: Seconds to wait for each HTTP request before giving up.
            Without a timeout ``requests`` may block indefinitely.

    Yields:
        dict: ``{'title': <str>, 'img': <bytes>}`` for each image that was
        downloaded successfully.

    Raises:
        requests.exceptions.RequestException: If the listing page cannot
            be fetched (network error, timeout, or HTTP error status).
    """
    page = requests.get(url, headers=headers, timeout=timeout)
    # Fail loudly instead of parsing an error page as if it were a listing.
    page.raise_for_status()

    tree = etree.HTML(page.text)
    img_urls = tree.xpath('//div[@class="container"]//img/@src')
    titles = tree.xpath('//div[@class="container"]//div[@class="description"]/h3/text()')

    # zip() pairs images with titles positionally and stops at the shorter
    # list, so a count mismatch no longer raises IndexError mid-page.
    for img_url, raw_title in zip(img_urls, titles):
        # Keep only the text before the first '(' and trim whitespace.
        title = raw_title.split('(')[0].strip()

        response = requests.get(img_url, headers=headers, timeout=timeout)
        if response.ok:
            yield {'title': title, 'img': response.content}
        else:
            # Skip a broken image rather than saving an error body as .jpg.
            print(title + '下载失败,已跳过......')

        # NOTE(review): the original indentation was lost; the delay is
        # assumed to apply after every image - confirm.
        time.sleep(2)
def save_img(item):
    """Write one downloaded wallpaper to ``./必应壁纸/<title>.jpg``.

    Args:
        item: Mapping with a ``'title'`` string (used to build the file
            name) and an ``'img'`` bytes payload (the file contents).

    The target directory is created if it does not exist yet, so the
    function also works when called on its own.
    """
    # Besides the full-width comma the original replaced, also neutralise
    # path separators and the characters Windows forbids in file names;
    # otherwise a title such as "A/B" makes open() fail.
    unsafe = str.maketrans({ch: '_' for ch in ',\\/:*?"<>|'})
    img_name = item['title'].translate(unsafe) + '.jpg'

    os.makedirs('./必应壁纸', exist_ok=True)
    # The with-statement closes the file; no explicit close() is needed.
    with open('./必应壁纸/%s' % img_name, 'wb') as fp:
        fp.write(item['img'])

    print(item['title'] + '储存完毕......')
    print('=' * 50)
def main():
    """Interactively download wallpaper pages 1 through 106.

    After each page has been saved the user is prompted; entering ``1``
    moves on to the next page, anything else ends the run.
    """
    save_dir = './必应壁纸'
    prompt = '是否继续下载下一页?是则输入1,否则按任意键退出!!!'

    print("下载开始......")
    print('*' * 50)

    for page in range(1, 107):
        # Make sure the output directory is present before saving.
        if not os.path.exists(save_dir):
            os.mkdir(save_dir)

        for item in download(base_url.format(page)):
            save_img(item)

        # Only the exact answer '1' continues with the next page.
        if input(prompt) != '1':
            break

    print('下载结束......')
    print('*' * 50)
    # Brief pause after the closing banner before returning.
    time.sleep(2)
# Start the downloader only when this file is executed as a script.
if __name__ == "__main__":
    main()
# 初次学习,如有可改进之处,还请各位大佬指正!