';
items = re.findall(pattern , context)
;
for item
in items:
print(
"模特---" + item[
2] +
",年龄:" + item[
3] +
",地址:" + item[
4])
getNumberPattern = re.compile(
r
'\d+
')
;
mm_id = getNumberPattern.findall(item[
0])[
0]
;
print(mm_id)
;
self.startSaveUser(mm_id , item[
2])
;
#进一步获取信息,并保存
def startSaveUser(self, mm_id: str, name: str) -> None:
    """Download the albums listed on the first album page of one user.

    Fetches the album-list page for ``mm_id`` (decoded as GBK), extracts
    every album block from it, creates the directory
    ``<name>/<album name>`` through ``self.createDir`` and passes the album
    link to ``self.getAllImages``.

    Args:
        mm_id: User id as a string; it is concatenated into the request URL
            and forwarded unchanged to ``getAllImages``.
        name: Name used as the top-level directory for this user's albums.
    """
    albumsUrl = (
        "https://mm.taobao.com/self/album/open_album_list.htm?_charset=utf-8&user_id%20="
        + mm_id
        + "&page=1"
    )
    # Use the response as a context manager so the connection is always
    # released, even when reading or decoding fails.
    with urllib.request.urlopen(albumsUrl) as response:
        albumContent = response.read().decode("gbk")

    # Each album entry is taken from an <h4> element.
    # NOTE(review): the pattern in the reviewed source was a bare `(.*?)`,
    # which can only produce empty matches; the <h4> delimiters are restored
    # from the accompanying comment ("get the h4 tags") -- confirm against
    # the page markup.
    albumRes = r'<h4>(.*?)</h4>'
    # Value of an href attribute, either double- or single-quoted.
    # Compiled once here because it does not change between albums.
    urlPattern = re.compile(
        r"(?<=href=\").+?(?=\")|(?<=href=\').+?(?=\')",
        re.S | re.M,
    )

    for oneAlbum in re.findall(albumRes, albumContent, re.S | re.M):
        # A block without any href cannot be opened as an album; skip it
        # instead of aborting the whole crawl with an IndexError.
        urlMatch = urlPattern.search(oneAlbum)
        if urlMatch is None:
            continue
        oneAlbumUrl = urlMatch.group(0).strip()
        print(oneAlbumUrl)

        # NOTE(review): `nameRes` is not defined in the part of the file
        # that was reviewed; it is presumably the album-title pattern
        # defined elsewhere -- confirm it exists before relying on this.
        nameMatches = re.findall(nameRes, oneAlbum, re.S | re.M)
        if not nameMatches:
            continue
        # Dots are removed from the album name before it is used as a
        # directory name.
        albumName = nameMatches[0].strip().replace(".", "")

        # Directory layout: <user name>/<album name>
        path = name + "/" + albumName
        self.createDir(path)
        # Fetch every picture of this album into that directory.
        self.getAllImages(mm_id, oneAlbumUrl, path)
# 根据相册地址请求图片的方法
def getAllImages(self, userId: str, oneAlbumUrl: str, path: str) -> None:
    """Download the pictures on the first photo-list page of one album.

    Reads ``album_id`` from the query string of ``oneAlbumUrl``, requests the
    JSONP photo list for that user and album, and passes every picture URL
    to ``self.saveImageByUrl`` together with its list index and ``path``.

    Args:
        userId: User id as a string; concatenated into the request URL.
        oneAlbumUrl: Album link; ``"http://"`` is prepended before parsing
            and only its query string is used.
        path: Target directory, forwarded unchanged to ``saveImageByUrl``.

    Raises:
        KeyError: If ``oneAlbumUrl`` has no ``album_id`` query parameter, or
            the decoded response has no ``picList`` entry.
        ValueError: If the response is not wrapped in ``callback(...)`` or
            the wrapped payload is not valid JSON.
    """
    parsedUrl = urllib.parse.urlparse("http://" + oneAlbumUrl)
    # keep_blank_values=True, exactly as the positional True did before.
    queryParams = urllib.parse.parse_qs(parsedUrl.query, True)
    albumId = queryParams["album_id"][0]

    imagesUrl = (
        "https://mm.taobao.com/album/json/get_album_photo_list.htm?user_id="
        + userId
        + "&album_id="
        + albumId
        + "&top_pic_id=0&page=1&callback=jsonp254"
    )

    # Request the photo list; the context manager releases the connection
    # even if reading or decoding raises.
    with urllib.request.urlopen(imagesUrl) as response:
        rawText = response.read().decode("gbk")

    # Drop CR, LF and TAB characters in a single pass before JSON parsing.
    rawText = rawText.translate(str.maketrans("", "", "\r\n\t"))

    # Unwrap `callback( ... )`: take everything between the FIRST "(" and the
    # LAST ")". Splitting on every parenthesis would cut the payload short
    # whenever a JSON string value itself contains "(" or ")".
    start = rawText.find("(")
    end = rawText.rfind(")")
    if start == -1 or end < start:
        raise ValueError("photo list response is not a JSONP callback: " + imagesUrl)
    imagesData = json.loads(rawText[start + 1:end])

    for index, oneImage in enumerate(imagesData["picList"]):
        # picUrl has no scheme in the response; "http:" is prepended.
        self.saveImageByUrl("http:" + oneImage["picUrl"], index, path)
def saveImageByUrl(self, imgUrl: str, imageName: object, path: str) -> None:
    """Download one image into ``path`` unless the target file already exists.

    The target file is ``<path>/<imageName>.jpg``. When it already exists a
    message is printed and nothing is downloaded.

    Args:
        imgUrl: URL of the image to fetch.
        imageName: File name without extension; converted with ``str()``.
        path: Existing directory that receives the file.
    """
    target = path + "/" + str(imageName) + ".jpg"

    if os.path.exists(target):
        print(target + "已经存在了")
        return

    print("开始保存" + target)
    # Download completely before opening the file, so a failed request
    # does not leave an empty file behind. Both resources are context
    # managed so neither the connection nor the file handle can leak.
    with urllib.request.urlopen(imgUrl) as response:
        imageBytes = response.read()
    with open(target, "wb") as imageFile:
        imageFile.write(imageBytes)
def createDir(self, path: str) -> None:
    """Create the directory ``path`` (and missing parents) if it is absent.

    Leading and trailing whitespace is stripped from ``path`` first. When
    something already exists at that location a message is printed and
    nothing is created.

    Args:
        path: Directory path to create.
    """
    path = path.strip()
    # Try the creation directly instead of checking os.path.exists() first:
    # that avoids the window between the check and the makedirs call.
    try:
        os.makedirs(path)
    except FileExistsError:
        print("该路径已经存在,不用创建了")
# Script driver: build a Spider for the top-list endpoint, then call
# getUrlContent with the arguments 1 and 2.
s = Spider("https://mm.taobao.com/json/request_top_list.htm")
s.getUrlContent(1, 2)