爬取壁纸网站上的图片

最近在尝试用 Python 写爬虫,正巧在思否上看到有人分享了一个从壁纸网站上爬取合适尺寸壁纸的爬虫,感觉挺有意思,就模仿着自己也写了一个。顺着自己的思路写下来,效果还行,实现了按分类和页码范围爬取的功能。这也算是提升了一下自己的爬虫技巧,从爬文字升级到爬图片了^_^~~
先上原帖地址:sf.gg 上 jrainlau 的作品
附上壁纸网站:http://wallpaperswide.com(感觉这个网站上的壁纸质量挺好的,不过是个境外网站,连接不是特别稳定,下载速度有点慢)
下边是代码

#-*- coding:utf-8 -*-
import time
import os
import http.client
import requests
import csv
import socket
from bs4 import BeautifulSoup
import random

# Crawl settings.
# The original plan was to read the category and page range from the command
# line (three arguments, the second and third being numbers). That was awkward
# to test, so the values are assigned directly here instead.
category = 'girls'  # wallpaper category used to build the listing URL

# First page to crawl. Kept in a one-element list so start() can advance it
# in place without a ``global`` statement.
page_start = [1]

# Last page to crawl (inclusive).
page_end = 3

def get_content(url,data=None):
    """Fetch ``url`` with browser-like headers and return the body as text.

    The request is retried in an endless loop until it succeeds; each handled
    error is printed with a numeric tag and followed by a random back-off
    (except ``IncompleteRead``, which is retried immediately).

    Args:
        url: Absolute URL to request.
        data: Unused; kept so existing callers that pass it keep working.

    Returns:
        The response body decoded as UTF-8.
    """
    # NOTE(review): the Cookie below is a hard-coded browser session copied
    # from the author's machine; it is probably stale and should not be shared.
    header={
        'Accept':'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
        'Accept-Encoding':'gzip, deflate',
        'Accept-Language':'zh-CN,zh;q=0.8',
        'Cache-Control':'max-age=0',
        'Connection':'keep-alive',
        'Cookie':'__qca=P0-688669869-1517470897283; PHPSESSID=c7f4efc14caffc86d3d8388479d1f9bf; ae74935a9f5bd890e996f9ae0c7fe805=q5vS1ldKBFw%3Ddb8MgaQtfy8%3D8%2F3JbRJ6Fes%3Db%2BwMrVK8eHs%3Daa0wj%2BrGoS4%3DlopdREWA8%2B4%3DRZtLP5MGPM4%3Dj9VIfn7XpyI%3D',
        'Host':'wallpaperswide.com',
        'Referer':'http://wallpaperswide.com/aero-desktop-wallpapers.html/page/1',
        'Upgrade-Insecure-Requests':'1',
        'User-Agent':'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36'
    }
    # The site is slow to reach, so allow a generous per-request timeout.
    timeout=random.choice(range(80,100))

    while True:
        try:
            rep=requests.get(url,headers=header,timeout=timeout)
            rep.encoding='utf-8'
            break

        # The back-off delays use random.randint(low, high): random.choice()
        # takes a single sequence, so the former random.choice(8,15) style
        # calls raised TypeError the first time any retry path was hit.
        except socket.timeout as e:
            print("3:",e)
            time.sleep(random.randint(8,15))

        # socket.error is an alias of OSError on Python 3, so this branch is
        # also the one that sees requests' own exceptions (they derive from
        # IOError/OSError).
        except socket.error as e:
            print("4:",e)
            time.sleep(random.randint(20,40))

        except http.client.BadStatusLine as e:
            print("5:",e)
            time.sleep(random.randint(30,80))

        except http.client.IncompleteRead as e:
            # Retried immediately, without a delay.
            print("6:",e)

    return  rep.text

def get_paper_link(html):
    """Extract the 1600x900 (HD 16:9) download link from a wallpaper page.

    Looks inside the ``wallpaper-resolutions`` div for an ``h3`` heading whose
    text is ``HD 16:9`` and, if one exists, for the anchor titled
    ``HD 16:9 1600 x 900 wallpaper``.

    Args:
        html: HTML source of a single wallpaper detail page.

    Returns:
        A ``(link_url, link_name)`` tuple, where ``link_url`` is the anchor's
        ``href`` and ``link_name`` is that href with ``/download/`` removed.
        Both are ``None`` when the page has no matching link; previously that
        case raised ``UnboundLocalError`` (or ``AttributeError`` when the
        resolutions div was missing). Callers must check for ``None``.
    """
    link_url=None
    link_name=None

    body=BeautifulSoup(html,'html.parser').body
    div=body.find('div',{'id':'wallpaper-resolutions'}) if body is not None else None

    if div is not None:
        # Only look for the link when the page actually lists a 16:9 section.
        has_hd_section=any(h.string == 'HD 16:9' for h in div.find_all('h3'))
        if has_hd_section:
            anchor=div.find('a',title='HD 16:9 1600 x 900 wallpaper')
            href=anchor.get("href") if anchor is not None else None
            if href:
                link_url=href
                link_name=href.replace('/download/','')

    # Pause between page visits to go easy on the remote site.
    time.sleep(random.choice(range(5,10)))
    return link_url,link_name

def get_wall_paper_link(html):
    """Collect links from listing thumbnails to their wallpaper detail pages.

    Walks ``div#content > ul > li.wall`` and, for each item, takes the ``href``
    of the first anchor inside its ``hudtitle`` div.

    Args:
        html: HTML source of a category listing page.

    Returns:
        A list of href strings in page order. Items (or whole pages) that lack
        the expected structure are skipped instead of raising
        ``AttributeError``, so an unexpected page yields an empty list.
    """
    url_links=[]

    body=BeautifulSoup(html,'html.parser').body
    content=body.find('div',{'id':'content'}) if body is not None else None
    listing=content.find('ul') if content is not None else None
    if listing is None:
        return url_links

    for li in listing.find_all('li',{'class':'wall'}):
        title_div=li.find('div',{'id':'hudtitle'})
        anchor=title_div.find('a') if title_div is not None else None
        href=anchor.get('href') if anchor is not None else None
        if href:
            url_links.append(href)
    return url_links

def start():
    """Crawl every listing page from ``page_start[0]`` through ``page_end``.

    For each listing page, fetch it, collect the wallpaper detail links, then
    visit each detail page, resolve its download link and save the image.
    ``page_start[0]`` is advanced in place as pages are processed.
    """
    site_root='http://wallpaperswide.com'
    listing_prefix='http://wallpaperswide.com/'+category+'-desktop-wallpapers/page/'

    # Walk the listing pages from the first to the last, inclusive.
    while page_end >= page_start[0]:
        current_page=page_start[0]
        listing_html=get_content(listing_prefix+str(current_page))
        detail_paths=get_wall_paper_link(listing_html)
        page_start[0]=current_page+1

        for detail_path in detail_paths:
            detail_html=get_content(site_root+detail_path)
            image_url,image_name=get_paper_link(detail_html)
            down_image(image_url,image_name)

def down_image(url,name,save_dir='E:\\python\\vscode\\spider\\images\\'):
    """Download one wallpaper and write it into ``save_dir``.

    Args:
        url: Site-relative download path (appended to the site host).
        name: File name to save the image under.
        save_dir: Target directory; created if missing. Defaults to the
            location the script has always used.

    Returns:
        None. Progress and failures are reported with ``print``.
    """
    if not url or not name:
        # Nothing to download, e.g. the page offered no matching resolution.
        print("skip download: no image link")
        return

    web_host="http://wallpaperswide.com"
    try:
        # Without a timeout requests can block forever on a stalled connection.
        result_image=requests.get(web_host+url,timeout=90)
    except requests.exceptions.RequestException as e:
        print("download %s failed: %s"%(name,e))
        return

    if result_image.status_code!=200:
        print("download %s failed with status %s"%(name,result_image.status_code))
        return

    os.makedirs(save_dir,exist_ok=True)
    # The name comes from remote HTML; keep only its final path component so
    # it cannot point outside save_dir.
    target=os.path.join(save_dir,os.path.basename(name))
    # The with-block closes the file even if the write fails.
    with open(target,'wb') as image_file:
        image_file.write(result_image.content)

    # Pause between downloads to go easy on the remote site.
    time.sleep(random.choice(range(5,10)))
    print("download %s is over"%name)


if __name__ == '__main__':
    # Disabled single-wallpaper check, kept for manual debugging:
    #     url = "http://wallpaperswide.com/lonely_woman-wallpapers.html"
    #     html = get_content(url)
    #     image_url, image_name = get_paper_link(html)
    #     down_image(image_url, image_name)
    start()

全程使用 requests 库发送请求、BeautifulSoup 的 find/find_all 解析页面,没有用到正则,也没有用到原帖中的 select。
这个算是写完了。等回家研究研究某榴,试试从某榴论坛上爬一下。写好了也放上来给大家分享!!

你可能感兴趣的:(python,爬虫)