python爬取并分类保存到文件夹

分析

网站结构相对比较简单,用 re 正则表达式解析即可。图片带有水印,去水印的方法是将图片地址中第一个 .jpg 后面的内容去掉。

import re
import requests
import os

#Fetch a listing page and build a dict mapping each image name to its address.
#NOTE(review): the two regex patterns below look truncated (their string
#literals are unterminated), so this function does not parse as written;
#the original patterns must be restored before it can run.
def ImageURL(url):
    img_dic={
     }
    #Download the page and decode the raw bytes as UTF-8.
    content=requests.get(url).content.decode("utf-8")
    img_name = re.findall("\"(.*?)\", content)
    img_address = re.findall(".*?class=\"lj-lazy\",content)
    #Names and addresses are paired by position; this assumes img_address is
    #at least as long as img_name - TODO confirm once the patterns are restored.
    for i in range(len(img_name)):
        img_dic[img_name[i]]=img_address[i]
    return img_dic

#Fetch a listing page and return the floor-plan page links found in it.
#NOTE(review): the regex pattern below looks truncated (unterminated string
#literal), so this function does not parse as written; restore the original
#pattern before use.
def ImageURL2(url):
    #Download the page and decode the raw bytes as UTF-8.
    content = requests.get(url).content.decode("utf-8")
    img_url = re.findall(",content)
    return img_url

#Download one listing image
def ImagGet(item,dir_path,img_url):
    """Download the image at ``img_url`` and save it as ``item`` + ".jpg".

    The output path is ``dir_path``, a backslash, ``item`` and ".jpg"
    concatenated, i.e. a Windows-style path.

    Args:
        item: Base name of the output file, without extension.
        dir_path: Directory to write into; ``open`` does not create it, so
            it must already exist.
        img_url: URL handed to ``requests.get``.
    """
    print("-------------", "正在下载:",item, "-------------")
    # Fetch first: opening the file before the request would leave an empty
    # .jpg on disk whenever the request raises.
    data = requests.get(img_url).content
    with open(dir_path+"\\"+item+".jpg","wb") as pic:
        pic.write(data)
    print("-------------", "下载完毕:",item, "-------------")


#Download the floor-plan images (and the extra image list) of one listing.
#img_url is appended to the site root to form the floor-plan page URL; path is
#the directory the files are written into.
#NOTE(review): the three regex patterns below look truncated (unterminated
#string literals), so this function does not parse as written; restore the
#original patterns before use.
def ImagGet2(img_url,path):
    url="https://hui.fang.ke.com//"+img_url
    content=requests.get(url).text
    img_name = re.findall(", content)
    img_address=re.findall(",content)
    img_address_xiaoguotu=re.findall(",content)
    #First pass: save each address under the name at the same position.
    #Assumes img_name is at least as long as img_address - TODO confirm.
    for i in range(0,len(img_address)):
        with open(path+"\\"+img_name[i]+".jpg","wb") as f:
            print("-------------","正在下载:",img_name[i],"-------------")
            f.write(requests.get(img_address[i]).content)
            print("-------------", "下载完毕:",img_name[i], "-------------")
    #Second pass: save the second address list, numbering the files by index
    #and prefixing them with the first name when one is available.
    for j in range(len(img_address_xiaoguotu)):
        if len(img_name)<1:
            path_img=path+"\\"+str(j)+".jpg"
        else:
            path_img=path+"\\"+img_name[0]+str(j)+".jpg"
        with open(path_img,"wb") as pic:
            #NOTE(review): img_name[0] is read here even when img_name is empty
            #(the branch above handles that case for the path only), which
            #would raise IndexError.
            print("-------------", "正在下载:", img_name[0], "-------------")
            pic.write(requests.get(img_address_xiaoguotu[j]).content)
            print("-------------", "下载完毕:", img_name[0], "-------------")


#Create a folder
def CreatDir(dir_path):
    """Create ``dir_path`` (with any missing parents) unless it already exists.

    Surrounding whitespace and trailing backslashes are removed from
    ``dir_path`` before the existence check and the creation.
    """
    target = dir_path.strip().rstrip("\\")
    if os.path.exists(target):
        return
    os.makedirs(target)

path="E:\\pic\\"  #Directory where downloaded images are saved
#Every image goes into the same folder, so strip the trailing backslash once;
#the download helpers insert their own separator.
dir_path=path.rstrip("\\")
#Make sure the target folder exists before any file is opened inside it.
CreatDir(dir_path)

for i in range(1,101):
    url = "https://hui.fang.ke.com/loupan/pg"+str(i)
    #Resources found on this page
    img_dic = ImageURL(url)
    img_url=ImageURL2(url)
    for index, item in enumerate(img_dic):
        ImagGet(item, dir_path,img_dic[item])
        #Images and floor-plan links are matched by position; skip the
        #floor-plan download when the page yielded fewer links than images
        #instead of failing with IndexError.
        if index < len(img_url):
            ImagGet2(img_url[index], dir_path)
print("-------------", "下载结束", "-------------")

你可能感兴趣的:(python)