说明:本文代码参考:http://haofly.net/python3-get-qqalbum/
20150903更新:本文仍存在部分相册不能下载的问题,本文代码不再维护,请移步至:使用Python2.7和火狐浏览器下载QQ空间好友相册(二)
不过由于QQ空间后台更新,本文根据QQ空间最新的JSONAPI做了改进
首先需要下载windows版本的curl,具体可以参考百度经验
下载地址:http://curl.haxx.se/download/?C=M;O=D
如果你是64位系统,建议下载curl-7.33.0-win64-nossl.zip
然后解压到D盘根目录下,如图
此时,你可以将D:\curl-7.33.0-win64-nossl这个路径添加到环境变量中,也可以在python代码中添加
接下来从火狐浏览器中复制两个curl
打开火狐浏览器-->F12-->网络-->打开下面两个网页,右键复制cURL
http://user.qzone.qq.com/[QQ号码]/4,这个是相册的cURL,注意找fcg_list_album_v3开头的地址;然后点击某一个相册进入相片列表,找到相片的cURL,注意找cgi_list_photo开头的地址
先去掉地址中的压缩指令 --compressed(因为windows版本的CURL不支持压缩指令)
然后把地址用单引号括起来,再复制到对应的代码中,如图。另外,还需要创建一个文本文件qqlist.txt,在其中添加QQ号码;该文本文件位于python脚本的同一目录下,每个号码一行,号码也可以用软件抓取
python2.7代码如下:(运行之前请务必安装好windows版本的cURL)
# -*- coding: UTF-8 -*-
"""Download the QQ Zone photo albums of every QQ number listed in qqlist.txt.

Python 2.7 script.  It replays two "Copy as cURL" commands captured from
Firefox (album list and photo list) through the external ``curl`` program,
parses the JSONP replies, and saves every photo below ``photos/<qq>/<album>/``.
A ``log.txt`` with counters and the elapsed time is written per QQ number.
"""
import os
import re
import subprocess
import shlex
import urllib2
import json
import datetime
import shutil
import cookielib
import Cookie

# Make the unpacked Windows build of curl reachable through PATH.
# Assigning through os.environ also updates the real process environment.
os.environ['PATH'] = 'D:\\curl-7.33.0-win64-nossl\\;' + os.environ.get('PATH', '')

# Paste the two captured curl commands here (album list: fcg_list_album_v3,
# photo list: cgi_list_photo).
origin_album = 'fcg_list_album_v3'
origin_photo = 'cgi_list_photo'
# The Windows build of curl does not understand --compressed.
origin_album = origin_album.replace('--compressed ', '')
origin_photo = origin_photo.replace('--compressed ', '')

# Characters that must not appear in album / photo file names.
ALBUM_NAME_FILTER = re.compile(r'\\|/|:|\*|\?|<|>|\||\.')
PHOTO_NAME_FILTER = re.compile(r'\\|/|:|\*|\?|<|>|\||\.|\n|\t|\"')
# photo['phototype'] (an integer) -> file extension; unknown types become .jpg.
PHOTO_EXTENSIONS = {'1': '.jpg', '2': '.gif', '3': '.png', '5': '.jpg', '10': '.jpg'}


def _extract_cookie_header(curl_command):
    """Return the value of the quoted "Cookie: ..." header in curl_command.

    Returns an empty string when the command carries no Cookie header.
    """
    marker = '"Cookie:'
    header_start = curl_command.find(marker)
    if header_start == -1:
        return ''
    value_start = header_start + len(marker)
    value_end = curl_command.find('"', value_start)
    if value_end == -1:
        # Unterminated quote: take everything up to the end of the command.
        value_end = len(curl_command)
    return curl_command[value_start:value_end].strip()


def _jsonp_to_json(text):
    """Return the JSON payload of a JSONP reply such as ``callback({...});``."""
    if text.lstrip().startswith('{'):
        return text  # already plain JSON
    start = text.find('(')
    end = text.rfind(')')
    if start == -1 or end < start:
        return text
    return text[start + 1:end]


def _curl_json(command):
    """Run a curl command line; return (json_text, parsed_document)."""
    reply = subprocess.check_output(shlex.split(command)).decode('utf-8')
    payload = _jsonp_to_json(reply)
    return payload, json.loads(payload)


def _write_log(folder, log):
    """Write the log dictionary as text to <folder>/log.txt."""
    with open(os.path.join(folder, 'log.txt'), 'w') as fp:
        fp.write(str(log))


# ------------------------------ build the cookie jar ------------------------------
cookie_str = _extract_cookie_header(origin_photo)
print(cookie_str)
cookie_domain = '.photo.store.qq.com'
cookie_path = '/'
simple_cookie = Cookie.SimpleCookie(cookie_str)  # parse the header string
cookiejar = cookielib.CookieJar()  # no cookies stored yet
for c in simple_cookie:
    cookie_item = cookielib.Cookie(
        version=0, name=c, value=str(simple_cookie[c].value),
        port=None, port_specified=None,
        domain=cookie_domain, domain_specified=None, domain_initial_dot=None,
        path=cookie_path, path_specified=None,
        secure=None, expires=None, discard=None,
        comment=None, comment_url=None, rest=None, rfc2109=False,
    )
    cookiejar.set_cookie(cookie_item)
# Every photo download goes through this opener so the cookies are sent along.
opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cookiejar))
# ------------------------------ cookie jar ready ----------------------------------

# Target QQ numbers, one per line.  strip() removes the line ending without
# cutting a digit off a last line that has no trailing newline.
with open('qqlist.txt', 'r') as fp:
    qqlist = [line.strip() for line in fp]

for target in qqlist:
    if not target:
        # A blank line would otherwise make the code below wipe photos/ itself.
        continue
    log = {
        'qq': target,
        'access': 1,  # whether the account may be visited
        'time': datetime.datetime.now(),  # replaced by the elapsed time at the end
        'album_count': 0,
        'photo_count': 0,
    }
    print('当前QQ:' + target)
    target_dir = os.path.join('photos', target)
    try:
        os.makedirs(target_dir)
    except OSError:
        # Left over from an earlier run: remove it (empty or not) and start over.
        shutil.rmtree(target_dir)
        os.makedirs(target_dir)

    # Rewrite the captured album-list request for this target and run it.
    hostUin = origin_album.split('&hostUin=')[1].split('&')[0]
    curl = origin_album.replace(hostUin, target)
    curl = curl.replace('&pageNumModeSort=40', '&pageNumModeSort=100')  # albums per page
    jsonstr, output = _curl_json(curl)
    if output['code'] == -3000:
        # Not logged in / login expired: no further request can succeed.
        print(output['message'])
        break
    if output['code'] == -4009:
        log['access'] = 0
        _write_log(target_dir, log)
        continue

    print(jsonstr)
    data = output['data']
    if 'albumListModeSort' in data:
        # Albums without categories (the common case).
        print('type1')
        albumList = data['albumListModeSort']
        if not albumList or albumList == 'null':
            # JSON null is parsed to None; treat it as "no albums" and go on
            # with the next QQ number instead of aborting the whole run.
            print('noalbum')
            albumList = []
    else:
        # Albums grouped into categories: flatten them into a single list.
        print('type2')
        albumList = []
        for album_class in data.get('albumListModeClass') or []:
            albums_in_class = album_class.get('albumList') or []
            limit = album_class.get('totalInClass', len(albums_in_class))
            albumList.extend(albums_in_class[:limit])
    if albumList:
        print(u'第一个相册名称:' + albumList[0]['name'])

    # The target folder was freshly created, so tracking the names used so far
    # in memory is enough to detect duplicates (compared case-insensitively,
    # as Windows file names are).
    used_album_names = set()
    theSameAlbumName = 0
    for album in albumList:
        if not album:
            continue
        log['album_count'] += 1
        print(u'当前相册:' + str(album['classid']) + album['name'])
        if album['allowAccess'] == 0:
            # Password-protected or forbidden album.
            continue

        # album['id'] is the topicId of the photo-list request.
        hostUin = origin_photo.split('&hostUin=')[1].split('&')[0]
        topicId = origin_photo.split('&topicId=')[1].split('&')[0]
        curl = origin_photo.replace(hostUin, target)
        curl = curl.replace(topicId, album['id'])
        curl = curl.replace('&pageNum=30', '&pageNum=600')  # albums seem to hold at most 512 photos
        jsonstr, output = _curl_json(curl)
        if output['code'] == -4404:
            continue

        # QQ Zone allows odd characters in album names and even duplicate names.
        album['name'] = ALBUM_NAME_FILTER.sub('', album['name'])
        base_album_name = str(album['classid']) + album['name'].replace(' ', '')
        albumname = base_album_name
        while not albumname or albumname.lower() in used_album_names:
            albumname = base_album_name + '_' + str(theSameAlbumName)
            theSameAlbumName += 1
        used_album_names.add(albumname.lower())
        album_dir = os.path.join(target_dir, albumname)
        os.makedirs(album_dir)

        # An empty album reports totalInAlbum == 0 and photoList == None.
        if output['data'].get('totalInAlbum') == 0:
            continue
        photoList = output['data'].get('photoList') or []

        used_photo_names = set()
        same = 0
        for photo in photoList:
            log['photo_count'] += 1
            print(u'当前图片:' + photo['name'])
            extension = PHOTO_EXTENSIONS.get(str(photo.get('phototype')), '.jpg')

            # Photo names are full of characters that are illegal in file names.
            base_photo_name = PHOTO_NAME_FILTER.sub('', photo['name'].replace(' ', ''))
            photoname = base_photo_name
            while not photoname or photoname.lower() in used_photo_names:
                photoname = base_photo_name + '_' + str(same)
                same += 1
            used_photo_names.add(photoname.lower())

            path = os.path.join(album_dir, photoname + extension)
            try:
                response = opener.open(photo['url'])
                try:
                    with open(path, "wb") as code:
                        code.write(response.read())
                finally:
                    response.close()
            except IOError:
                # urllib2.URLError / HTTPError are IOError subclasses in
                # Python 2; one failed photo must not stop the whole run.
                print('保存图片出错')

    # Record the elapsed time and the counters for this QQ number.
    log['time'] = str((datetime.datetime.now() - log['time']).seconds) + 's'
    _write_log(target_dir, log)
    print('当前QQ:' + target + '下载完毕')
下面是改进过程中用到的测试代码,有兴趣的可以看下
#-*- coding: UTF-8 -*- import os import shlex import subprocess import json # 添加curl的环境变量 os.putenv('PATH', 'D:\\curl-7.33.0-win64-nossl\\;'+os.getenv('PATH')) # 获取原始curl请求 origin_album = fcg_list_album_v3 origin_photo = cgi_list_photo #相册中的ID对应链接中的topicId; target=QQ号码 print target print origin_album hostUin=origin_album.split('&hostUin=')[1].split('&')[0] print hostUin # 先得到正确的curl,然后执行获取json数据 curl = origin_album.replace(hostUin, target) # 替换被访问者 curl = curl.replace('&pageNumModeSort=40', '&pageNumModeSort=100') # 显示相册数量 args = shlex.split(curl) result = subprocess.check_output(args).decode('utf-8') jsonstr=result[result.find('(') + 1 : result.find(')', -1) -1] #json字符串 print jsonstr output = json.loads(jsonstr) #json字符串转字典 print output['data']['albumListModeClass'][0]['albumList'][0]['name'] print output['data']['albumListModeClass'][0]['albumList'][1]['name'] print output['data']['albumListModeClass'][0]['albumList'][2]['name'] print output['data']['albumListModeClass'][0]['albumList'][3]['name'] print output['data']['albumListModeClass'][0]['totalInClass'] print output['data']['albumListModeClass'][1]['albumList'][0]['name'] print output['data']['albumListModeClass'][1]['albumList'][1]['name'] print output['data']['albumListModeClass'][1]['albumList'][2]['name'] print output['data']['albumListModeClass'][1]['albumList'][3]['name'] print output['data']['albumListModeClass'][1]['totalInClass'] print output['data']['albumListModeClass'][2]['albumList'][0]['name'] print output['data']['albumListModeClass'][2]['totalInClass'] print output['data']['albumsInUser'] print 'type' print output['data']['mode'] albumListModeClass = output['data']['albumListModeClass'] print '------------' for albumClass in albumListModeClass: albumList=albumClass['albumList'] for album in albumList: print u'当前相册:' + str(album['classid']) + album['name'] print '------------' #下面是photo #判断是否允许访问1为允许0为不允许 album = output['data']['albumListModeClass'][1]['albumList'][1] # print 
album['allowAccess'] #id与topicId对应 # print album['id'] #相册名 # print album['name'] hostUin=origin_photo.split('&hostUin=')[1].split('&')[0] topicId=origin_photo.split('&topicId=')[1].split('&')[0] curl = origin_photo.replace(hostUin, target) #替换链接里的被访问者账号 curl = curl.replace(topicId, album['id']) #替换链接里的相册ID curl = curl.replace('&pageNum=30', '&pageNum=600') # QQ空间每个相册最大貌似不会超过512 args = shlex.split(curl) result = subprocess.check_output(args).decode('utf-8') print result jsonstr=result[result.find('(') + 1 : result.find(')', -1) -1] #json字符串 print jsonstr output = json.loads(jsonstr) #json字符串转字典 photo = output['data']['photoList'][4] # print photo #print output['data']['totalInAlbum'] #相册里相片总张数 print output['data']['photoList'][4]['url']
解决了部分相册需要cookies的问题:json里面没有找到对应的字段,所以把cookies全部加上了。
cookies测试代码
#-*- coding: UTF-8 -*-
# Cookie test (Python 2): build a cookie jar from the Cookie header of the
# captured photo-list curl command and download one picture with it.
import Cookie
import urllib2
import os
import shlex
import cookielib
import json

# Add the curl folder to the PATH environment variable
os.putenv('PATH', 'D:\\curl-7.33.0-win64-nossl\\;'+os.getenv('PATH'))

# Original curl requests (placeholders: paste the captured commands as quoted strings)
origin_album = fcg_list_album_v3
origin_photo = cgi_list_photo

# Cut the value out of the quoted "Cookie: ..." header.  Searching for the
# closing quote from the start of the value keeps the last character intact
# even when the Cookie header is the final argument of the command.
cookie_marker = '"Cookie:'
header_start = origin_photo.find(cookie_marker)
if header_start == -1:
    cookie_str = ''  # the command carries no Cookie header
else:
    value_start = header_start + len(cookie_marker)
    value_end = origin_photo.find('"', value_start)
    if value_end == -1:
        value_end = len(origin_photo)
    cookie_str = origin_photo[value_start:value_end].strip()
print(cookie_str)

cookie_domain='.photo.store.qq.com'
cookie_path='/'
simple_cookie = Cookie.SimpleCookie(cookie_str)  # parse the header string
cookiejar = cookielib.CookieJar()  # no cookies stored yet
for c in simple_cookie:
    cookie_item = cookielib.Cookie(
        version=0, name=c, value=str(simple_cookie[c].value),
        port=None, port_specified=None,
        domain=cookie_domain, domain_specified=None, domain_initial_dot=None,
        path=cookie_path, path_specified=None,
        secure=None, expires=None, discard=None,
        comment=None, comment_url=None, rest=None, rfc2109=False,
    )
    cookiejar.set_cookie(cookie_item)  # add each cookie to the jar
opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cookiejar))

url=realurl  # placeholder: the real address of the picture
savepath='D:\\test.jpg'
f = opener.open(url)
try:
    with open(savepath, "wb") as code:
        code.write(f.read())
finally:
    f.close()  # release the HTTP connection even if writing fails
后记:
20150718:修正第二类型相册分类数始终为3的错误
20150829:不用再手工删除 --compressed指令