Learning Python web scraping: crawling an image site

0x00: Parsing pages with XPath
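
The idea in this version is to load each response into lxml and pull the interesting nodes out with XPath expressions. Here is a minimal sketch of that pattern; the URL is a placeholder and the selector simply reuses the "main-image" class name from the script below, so adjust both to the real site:

import requests
from lxml import etree

url = "https://example.com/page/1"            # placeholder URL, not the real site
header = {"User-Agent": "Mozilla/5.0"}
response = requests.get(url, headers=header)
html = etree.HTML(response.content.decode("utf-8"))
# xpath() returns a list of matching nodes or attribute values
img_srcs = html.xpath('//div[@class="main-image"]//img/@src')
print(img_srcs)

The full script: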

#coding: utf-8
import requests
import os
import re
from lxml import etree
import time

def get_title(title):  # get the gallery title and create its folder
    path=r"./Pic/"+title

    if os.path.exists(path):   # folder already exists, reuse it
        return path
    else:
        os.makedirs(path)   # create the empty folder
        return path

def pic_get(info):  # download every image in one gallery
    url = info['url']
    header = {
        "User-Agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:68.0) Gecko/20100101 Firefox/68.0",
        "Referer": url
    }
    store_path=info['path']
    all=info['all']
    print("获取"+store_path+"******************")
    for i in range (1,all+1):
        i_str=str(i)
        final_url=url+"/"+i_str
        response=requests.get(final_url,headers=header)
        data=response.content.decode('utf-8')
        try:
            html=etree.HTML(data)
            img_url=html.xpath("//div[@class=\"main-image\"]//img")[0].xpath("./@src")[0]
            response=requests.get(img_url,headers=header)
            if response.status_code==200:
                data=response.content
                with open(store_path+"/"+i_str+'.jpg',"wb+") as fp:
                    fp.write(data)
                print(img_url)
            time.sleep(0.5)
        except Exception:
            pass   # skip pages that fail to parse or download
    return

def url_create(url_path,type):
    #generate the list of listing page URLs
    if type=='main_url':
        print("Collecting all accessible pages....")
        parser=etree.HTMLParser(encoding="utf-8")
        html=etree.parse(url_path,parser)
        num=html.xpath("//div[@class=\"nav-links\"]/a[4]")[0].xpath('text()')[0]

        main_url=[]
        for i in range(1,int(num)-10):
            tmp_url="https://www.xxxx.com/tag/xxx/page/"+str(i)
            main_url.append(tmp_url)
        return main_url


    #collect the gallery links on one listing page
    if type=='pic_url':
        header = {
            "User-Agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:68.0) Gecko/20100101 Firefox/68.0"
        }
        response=requests.get(url_path+"/",headers=header)
        data=response.content.decode("utf-8")
        html=etree.HTML(data)
        lis=html.xpath("//ul[@id=\"pins\"]/li/span/a")

        pic_info=[]
        for li in lis:
            tmp_url=li.xpath("./@href")[0]
            title=li.xpath("text()")[0]
            pre_rul=r"[:,.<>'\":]"   #characters that are awkward in folder names
            title=re.sub(pre_rul,'-',title)
            path=get_title(title)  #create the folder
            info={
                "path":path,
                "url":tmp_url
            }
            pic_info.append(info)
        return pic_info


    #check how many images each gallery contains
    if type=='title_url':
        header = {
            "User-Agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:68.0) Gecko/20100101 Firefox/68.0"
        }
        response = requests.get(url_path+"/1", headers=header)
        data = response.content.decode("utf-8")
        html = etree.HTML(data)
        all = html.xpath("//div[@class=\"pagenavi\"]/a/span")[4].xpath("text()")[0]  #fifth span of the pager is used as the total page count
        return int(all)


def main():
    #first fetch the index page to get the basic parameters
    url="https://www.xxxxxx.com/tag/xxxxxx/"
    header={
        "User-Agent":"Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:68.0) Gecko/20100101 Firefox/68.0"
    }
    response=requests.get(url,headers=header)
    data=response.content.decode("utf-8")
    with open(r"./1.html","w+",encoding="utf-8") as fp:
        fp.write(data)

    #call the URL generator to build workable links
    url_path=r"./1.html"
    main_url=url_create(url_path,'main_url')   #get every accessible listing page
    time.sleep(1)

    #visit each listing page and collect every gallery link on it
    pic_url=[]
    for page_url in main_url:
        tmp_url=url_create(page_url,'pic_url')
        pic_url.append(tmp_url)
    #print(pic_url)
    time.sleep(1)   #don't go too fast, or the server may refuse to respond

    #process the collected info: look up each gallery's page count
    for first in pic_url:
        for seconde in first:
            all=url_create(seconde['url'],"title_url")
            seconde['all']=all
            time.sleep(0.5)
    print("全部信息获取完毕,开始下载图片!!!!\n")
    print(pic_url)

    for first in pic_url:
        for seconde in first:
            pic_get(seconde)
            time.sleep(0.5)

if __name__ == '__main__':
    main()

0x01: Extracting page data with regular expressions:
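
Instead of building a DOM, this version treats the response as plain text and captures the fields of interest with re.findall. A minimal sketch of that pattern; the URL and the tag layout in the patterns are made up for illustration and will not match the real site as-is:

import re
import requests

url = "https://example.com/gallery/1"   # placeholder URL, not the real site
header = {"User-Agent": "Mozilla/5.0"}
data = requests.get(url, headers=header).content.decode()
# re.DOTALL lets "." span newlines, so one pattern can cover several HTML lines
titles = re.findall(r'<h2 class="main-title">(.*?)</h2>', data)        # assumed tag layout
img_urls = re.findall(r'<img src="(.*?\.jpg)"', data, re.DOTALL)
print(titles, img_urls)

The full script: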

#-*-coding:utf-8 -*-
import re
import requests
from multiprocessing import Pool
import time
import os

def get_Pic(url):
    print(url)
    header = {
        "Referer": url,
        "User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36"
    }
    response = requests.get(url, headers=header)
    data = response.content.decode()
    # NOTE: the tag patterns below are assumptions; they reuse the "main-image" and
    # "pagenavi" class names from the XPath version and may need adjusting to the
    # site's actual markup.
    title = re.findall(r'<h2 class="main-title">(.*?)</h2>', data)
    Pic_url = re.findall(r'<div class="main-image">.*?<img src="(.*?)"', data, re.DOTALL)
    max = re.findall(r'<div class="pagenavi">.*<span>(\d+)</span>', data, re.DOTALL)

    #create the storage folder
    path="./Pic/"+title[0]
    if os.path.exists(path):
        print("Images will be stored in "+path)
        pass
    else:
        print("Created storage folder "+path)
        os.makedirs(path)
    #############

    #start downloading the images
    for i in range(1,int(max[0])+1):
        if i<10:
            i_str="0"+str(i)
        else:
            i_str=str(i)
        pic_url=Pic_url[0][:-6]+i_str+".jpg"  #image URLs end in a two-digit index, so swap it to build each image's URL
        print("开始下载"+pic_url)
        try:
            response=requests.get(pic_url,headers=header)
            store_path=path+"/"+i_str+".jpg"
            with open(store_path,"wb+") as fp:
                fp.write(response.content)
            time.sleep(0.5)
        except:
            print(pic_url+"下载失败,下载下一张")
            pass
    return


def get_Url(url):
    header={
        "Referer": url,
        "User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36"
    }
    response=requests.get(url,headers=header)
    data=response.content.decode()
    all_url=re.findall(r"https://www\.xxxxxx\.com/\d{4,6}",data)
    return list(set(all_url))#deduplicate before returning

def get_wight():
    print("页数区间实例:4-10,爬行第四页到第十页。")
    in_ = input("请输入想爬行的页数区间(页数过多可能导致服务停止,最大10页):")
    wight = re.findall(r".*(\d{1,2}).(\d{1,2}).*", in_, re.DOTALL)
    if wight == []:
        print("爬行区间输入有误!")
        exit(0)
    else:
        (start, end) = wight[0]
        start = int(start)
        end = int(end)
        if start <= 0 or start > end:
            print("请重新输入爬行区间。")
            exit(0)
        elif end > 230:
            print("末区间超过最大页数。")
            exit(0)
        elif end - start > 10:
            print("区间间隔过大,请重新输入。")
            exit(0)
    return (start,end)

def main():
    (start,end)=get_wight()
    urls=[]
    for i in range(start,end+1):
        i_str=str(i)
        url="https://www.xxxxx.com/page/%s/"% i_str
        #print(url)
        url_list=get_Url(url)
        time.sleep(1)  #sleep one second so we don't hit the server too fast
        urls.append(url_list)

    pool=Pool(15)  #create a pool of 15 worker processes
    for url_list in urls:
        for url in url_list:
            next_one=pool.apply_async(get_Pic,args=(url,))
            time.sleep(0.5)
        next_one.wait()  #wait for the last task submitted for this page before moving on

    print("等待全部子进程结束")
    pool.close()
    pool.join()
    print("图片下载完成")

if __name__ == '__main__':
    main()

end:

I'd seen plenty of people scrape this site before, and since I had just learned web scraping I wanted to try it myself. I hit a few pitfalls along the way, but it was a real learning experience. Onward!
