# Python 抓取淘宝相册并分别下载 (scrape Taobao model albums and download each one)

import too
import urllib.request
import re
import os
import json
class Spider:
    """Scrape a Taobao model listing and download every model's photo albums.

    Pipeline: getUrlContent walks listing pages -> getContent extracts each
    model's id and name -> startSaveUser enumerates the model's albums ->
    getAllImages fetches one album's photo JSON -> saveImageByUrl stores each
    picture under ``<model name>/<album name>/<index>.jpg``.

    NOTE(review): the HTML-matching regex literals in this class were
    corrupted in the original source (the string literals were cut in half
    by whatever extracted the file).  They have been reconstructed from how
    the match groups are used downstream — verify each pattern against the
    live page markup before trusting the scraper.
    """

    def __init__(self, siteUrl):
        self.siteUrl = siteUrl      # listing URL, paged via ?page=N
        self.tool = too.Tool()      # project-local helper (module `too`)

    def getUrlContent(self, start, end):
        """Fetch listing pages `start`..`end` (inclusive) and process each."""
        for page in range(start, end + 1):
            page_url = self.siteUrl + "?page=" + str(page)
            # Context manager closes the HTTP response socket.
            with urllib.request.urlopen(page_url) as resp:
                self.getContent(resp.read())

    def getContent(self, context):
        """Extract model tuples from one listing page and save each model.

        `context` is the raw response body (bytes); the page is GBK-encoded.
        """
        context = context.decode("gbk")

        # Reconstructed pattern — TODO confirm against the page source.
        # Group usage below implies: [0]=profile href, [1]=avatar src,
        # [2]=name, [3]=age, [4]=city.
        pattern = re.compile(
            r'<a class="lady-name" href="(.*?)".*?<img src="(.*?)".*?'
            r'<a.*?>(.*?)</a>.*?<em>.*?<strong>(.*?)</strong>.*?'
            r'<span>(.*?)</span>',
            re.S,
        )
        items = re.findall(pattern, context)

        for item in items:
            print("模特---" + item[2] + ",年龄:" + item[3] + ",地址:" + item[4])

            # The model id is the first run of digits in the profile href.
            mm_id = re.findall(r"\d+", item[0])[0]
            print(mm_id)

            self.startSaveUser(mm_id, item[2])

    # 进一步获取信息,并保存 (fetch the model's album list and save everything)
    def startSaveUser(self, mm_id, name):
        """Enumerate the albums of model `mm_id` and download each album."""
        albumsUrl = ("https://mm.taobao.com/self/album/open_album_list.htm"
                     "?_charset=utf-8&user_id%20=" + mm_id + "&page=1")
        # NOTE(review): the URL requests _charset=utf-8 but the original
        # decoded the body as GBK; behavior kept as-is — confirm the actual
        # response encoding.
        with urllib.request.urlopen(albumsUrl) as resp:
            albumContent = resp.read().decode("gbk")

        # Reconstructed: each album entry is wrapped in an <h4> element.
        # TODO confirm against the album-list page markup.
        res = r"<h4>(.*?)</h4>"
        albums = re.findall(res, albumContent, re.S | re.M)

        for oneAlbum in albums:
            # Album page URL: value of the first href attribute (either
            # quote style) inside the entry.
            urlRes = r"(?<=href=\").+?(?=\")|(?<=href=\').+?(?=\')"
            oneAlbumUrl = re.findall(urlRes, oneAlbum, re.S | re.M)[0].strip()
            print(oneAlbumUrl)

            # Album display name: text of the first anchor — reconstructed,
            # TODO confirm against the page markup.
            nameRes = r"<a.*?>(.*?)</a>"
            albumName = re.findall(nameRes, oneAlbum, re.S | re.M)[0].strip()
            # Dots are stripped so the name is safe as a directory component.
            albumName = albumName.replace(".", "")

            # Directory layout: 模特名字/相册名字 (<model name>/<album name>)
            path = name + "/" + albumName
            self.createDir(path)

            # Fetch every image belonging to this album.
            self.getAllImages(mm_id, oneAlbumUrl, path)

    # 根据相册地址请求图片的方法 (download all pictures of one album)
    def getAllImages(self, userId, oneAlbumUrl, path):
        """Fetch one album's photo list (JSONP) and save every picture."""
        oneAlbumUrl = "http://" + oneAlbumUrl
        # The album id is carried in the album URL's query string.
        result = urllib.parse.urlparse(oneAlbumUrl)
        param = urllib.parse.parse_qs(result.query, True)
        albumId = param["album_id"][0]
        imagesUrl = ("https://mm.taobao.com/album/json/get_album_photo_list.htm"
                     "?user_id=" + userId + "&album_id=" + albumId
                     + "&top_pic_id=0&page=1&callback=jsonp254")

        # 请求相册地址 — fetch the JSONP payload.
        with urllib.request.urlopen(imagesUrl) as resp:
            imagesResult = resp.read().decode("gbk")
        # Strip whitespace, then unwrap the jsonp254(...) envelope to get
        # the bare JSON document.
        imagesResult = imagesResult.replace("\r", "").replace("\n", "").replace("\t", "")
        imagesResult = imagesResult.split("(")[1].split(")")[0]

        imagesData = json.loads(imagesResult)

        for index, onImage in enumerate(imagesData["picList"]):
            # picUrl is protocol-relative; prepend the scheme.
            oneImageUrl = "http:" + onImage["picUrl"]
            self.saveImageByUrl(oneImageUrl, index, path)

    def saveImageByUrl(self, imgUrl, imageName, path):
        """Download `imgUrl` to ``<path>/<imageName>.jpg`` unless it exists."""
        path = path + "/" + str(imageName) + ".jpg"

        if os.path.exists(path):
            print(path + "已经存在了")
            return

        print("开始保存" + path)
        with urllib.request.urlopen(imgUrl) as resp:
            imagesData = resp.read()
        # `with` guarantees the file handle is closed even on write errors.
        with open(path, "wb") as f:
            f.write(imagesData)

    def createDir(self, path):
        """Create directory `path` (including parents) if it is missing."""
        path = path.strip()
        if os.path.exists(path):
            print("该路径已经存在,不用创建了")
        else:
            os.makedirs(path)





if __name__ == "__main__":
    # Guard the driver so importing this module no longer kicks off
    # network scraping as a side effect; behavior when run as a script
    # is unchanged (scrape listing pages 1-2).
    s = Spider("https://mm.taobao.com/json/request_top_list.htm")
    s.getUrlContent(1, 2)


# 你可能感兴趣的:(Python)  -- blog-footer residue from the page this file was extracted from