使用Python2.7和火狐浏览器下载QQ空间好友相册

说明:本文代码参考:http://haofly.net/python3-get-qqalbum/

20150903更新:本文仍存在部分相册不能下载的问题,本文代码不再维护,请移步至:使用Python2.7和火狐浏览器下载QQ空间好友相册(二)

不过由于QQ空间后台更新,本文根据QQ空间最新的JSON API做了修改

首先需要下载windows版本的curl,具体可以参考百度经验

下载地址:http://curl.haxx.se/download/?C=M;O=D

如果你是64位系统,建议下载curl-7.33.0-win64-nossl.zip

然后解压到D盘根目录下,如图

使用Python2.7和火狐浏览器下载QQ空间好友相册_第1张图片

此时,你可以将D:\curl-7.33.0-win64-nossl这个路径添加到环境变量中,也可以在python代码中添加

接下来从火狐浏览器中复制两个curl

打开火狐浏览器-->F12-->网络-->打开下面两个网页,右键复制cURL

http://user.qzone.qq.com/[QQ号码]/4,这个是相册的cURL,注意找fcg_list_album_v3开头的地址

点击某一个相册进入相片列表,找到相片的cURL,注意找cgi_list_photo开头的地址

使用Python2.7和火狐浏览器下载QQ空间好友相册_第2张图片

先去掉地址中的压缩指令 --compressed(因为windows版本的CURL不支持压缩指令)

然后把地址用单引号括起来,然后复制到对应的代码中,如图

另外,还需要创建一个文本文件qqlist.txt,并在其中添加QQ号码。该文本文件位于python脚本的同一目录下,每个号码一行,号码也可以用软件抓取

python2.7代码如下:(运行之前请务必安装好windows版本的cURL)

# -*- coding: UTF-8 -*-
import os
import re
import subprocess
import shlex
import urllib2
import json
import datetime
import shutil
import cookielib
import Cookie

def strip_compressed_flag(curl_command):
    """Return ``curl_command`` with every ``--compressed`` option removed.

    The Windows curl build used by this script does not support
    ``--compressed``.  The flag is dropped wherever it appears, including at
    the very end of the command where no trailing space follows it.
    """
    return ' '.join(part for part in curl_command.split(' ') if part != '--compressed')


# Put the curl install directory at the front of PATH.  Assigning through
# os.environ keeps Python's own view of the environment in sync (it calls
# putenv for us) and tolerates PATH being unset.
os.environ['PATH'] = 'D:\\curl-7.33.0-win64-nossl\\;' + os.environ.get('PATH', '')

# Paste the two "Copy as cURL" commands here: the album list request
# (fcg_list_album_v3) and the photo list request (cgi_list_photo).
origin_album = 'fcg_list_album_v3'
origin_photo = 'cgi_list_photo'
origin_album = strip_compressed_flag(origin_album)
origin_photo = strip_compressed_flag(origin_photo)
# ------------------------------ build cookies: start ------------------------------

# Extract the value of the `"Cookie: ..."` header from the copied photo cURL
# command.  The value runs from just after the header name to the next double
# quote; searching for that quote from the header position (rather than from
# the end of the command) keeps the value intact even when the Cookie header
# is the last option on the command line.
_cookie_marker = '"Cookie:'
_marker_pos = origin_photo.find(_cookie_marker)
if _marker_pos == -1:
    # No Cookie header in the command: fall back to an empty cookie jar.
    cookie_str = ''
else:
    _value_start = _marker_pos + len(_cookie_marker)
    _value_end = origin_photo.find('"', _value_start)
    if _value_end == -1:
        _value_end = len(origin_photo)
    cookie_str = origin_photo[_value_start:_value_end].strip()
print(cookie_str)
cookie_domain = '.photo.store.qq.com'
cookie_path = '/'

simple_cookie = Cookie.SimpleCookie(cookie_str)  # parse "name=value; ..." pairs
cookiejar = cookielib.CookieJar()  # starts empty
for cookie_name, morsel in simple_cookie.items():
    # Re-create every parsed cookie as a cookielib.Cookie bound to the photo
    # store domain so the urllib2 opener sends it with image requests.
    cookie_item = cookielib.Cookie(
        version=0, name=cookie_name, value=str(morsel.value),
        port=None, port_specified=None,
        domain=cookie_domain, domain_specified=None, domain_initial_dot=None,
        path=cookie_path, path_specified=None,
        secure=None,
        expires=None,
        discard=None,
        comment=None,
        comment_url=None,
        rest=None,
        rfc2109=False,
    )
    cookiejar.set_cookie(cookie_item)
# Opener used later to download the image files with the cookies attached.
opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cookiejar))

# ------------------------------ build cookies: end ------------------------------
# Load the target QQ numbers from qqlist.txt, one number per line.
# strip() removes the line ending (and stray spaces) without chopping the last
# digit off a final line that has no trailing newline; blank lines stay in the
# list as empty strings.
with open('qqlist.txt', 'r') as fp:
    qqlist = [line.strip() for line in fp]

# Characters removed from album/photo names before they are used as Windows
# file names: the reserved set \ / : * ? " < > |, dots, and all whitespace.
_UNSAFE_NAME_CHARS = re.compile(r'[\\/:*?"<>|.\s]')

# File extension for each value of the API's integer ``phototype`` field
# (1 = jpg and 3 = png per the original notes); anything else is saved as .jpg.
_PHOTO_EXTENSIONS = {'1': '.jpg', '2': '.gif', '3': '.png', '5': '.jpg', '10': '.jpg'}


def _query_value(curl_command, key):
    """Return the value of URL parameter ``key`` inside a copied cURL command.

    Raises ValueError when the parameter is missing or empty, so a bad paste
    is reported before any existing download folder is touched.
    """
    match = re.search('[?&]' + re.escape(key) + '=([^&\'"\\s]+)', curl_command)
    if match is None:
        raise ValueError('parameter "%s" not found in the copied cURL command' % key)
    return match.group(1)


def _strip_jsonp(text):
    """Return the JSON text inside a ``callback(...)`` JSONP wrapper.

    The payload is everything between the first "(" and the last ")", so a
    trailing ";" or newline after the wrapper does not matter.  Text that is
    already bare JSON is returned unchanged.
    """
    opening = text.find('(')
    closing = text.rfind(')')
    if text.lstrip().startswith('{') or opening == -1 or closing < opening:
        return text
    return text[opening + 1:closing]


def _run_curl(curl_command):
    """Execute ``curl_command`` and return the JSON text of its response."""
    raw = subprocess.check_output(shlex.split(curl_command)).decode('utf-8')
    return _strip_jsonp(raw)


def _claim_name(name, taken, suffix):
    """Return ``(unique_name, next_suffix)`` and record the name in ``taken``.

    An empty or already used name gets ``_<suffix>`` appended.  Names are
    compared case-insensitively because Windows file names are.
    """
    candidate = name
    while not candidate or candidate.lower() in taken:
        candidate = name + '_' + str(suffix)
        suffix += 1
    taken.add(candidate.lower())
    return candidate, suffix


def _write_log(directory, log):
    """Write the per-account summary dict to ``<directory>/log.txt``."""
    with open(os.path.join(directory, 'log.txt'), 'w') as log_file:
        log_file.write(str(log))


# The account / album ids embedded in the copied commands never change, so
# extract them once, up front.
album_host_uin = _query_value(origin_album, 'hostUin')
photo_host_uin = _query_value(origin_photo, 'hostUin')
photo_topic_id = _query_value(origin_photo, 'topicId')

for target in qqlist:
    target = target.strip()
    if not target:  # a blank entry would otherwise wipe the whole photos/ folder
        continue
    log = {
        'qq': target,
        'access': 1,  # whether the album list could be accessed
        'time': datetime.datetime.now(),  # start time; replaced by the elapsed time at the end
        'album_count': 0,  # albums seen
        'photo_count': 0,  # photos seen
    }
    print('当前QQ:' + target)

    # Start from an empty folder for this account.
    target_dir = os.path.join('photos', target)
    if os.path.isdir(target_dir):
        shutil.rmtree(target_dir)
    os.makedirs(target_dir)

    # Re-target the copied album request at this account and ask for more
    # albums per page, then fetch the album list.
    curl = origin_album.replace(album_host_uin, target)
    curl = curl.replace('&pageNumModeSort=40', '&pageNumModeSort=100')
    jsonstr = _run_curl(curl)
    output = json.loads(jsonstr)
    if output['code'] == -3000:  # not logged in / login expired: nothing else can work
        print(output['message'])
        break
    if output['code'] == -4009:  # treated as "access denied" by the original script
        log['access'] = 0
        _write_log(target_dir, log)
        continue
    print(jsonstr)

    data = output.get('data') or {}
    if 'albumListModeSort' in data:  # albums are not grouped into classes (the common case)
        print('type1')
        albumList = data['albumListModeSort']
        if not albumList or albumList == 'null':  # JSON null decodes to None
            print('noalbum')
            albumList = []
    else:  # albums are grouped into classes: flatten every class into one list
        print('type2')
        albumList = []
        for album_class in data.get('albumListModeClass') or []:
            albumList.extend(album_class.get('albumList') or [])
    if albumList:
        print(u'第一个相册名称:' + albumList[0]['name'])

    used_album_names = set()
    album_suffix = 0  # numbering for duplicate album names
    for album in albumList:
        if not album:
            continue
        log['album_count'] += 1
        print(u'当前相册:' + str(album['classid']) + album['name'])
        if album['allowAccess'] == 0:  # password-protected or private album
            continue

        # album['id'] is the topicId of the photo-list request.
        curl = origin_photo.replace(photo_host_uin, target)
        curl = curl.replace(photo_topic_id, album['id'])
        curl = curl.replace('&pageNum=30', '&pageNum=600')  # the original author observed at most ~512 photos per album
        output = json.loads(_run_curl(curl))
        if output['code'] == -4404:
            continue

        # Album names may repeat, so the class id is prefixed and duplicates
        # get a numeric suffix.
        albumname = str(album['classid']) + _UNSAFE_NAME_CHARS.sub('', album['name'])
        albumname, album_suffix = _claim_name(albumname, used_album_names, album_suffix)
        album_dir = os.path.join(target_dir, albumname)
        os.makedirs(album_dir)

        # photoList is null for an empty album.
        photoList = (output.get('data') or {}).get('photoList') or []
        used_photo_names = set()
        photo_suffix = 0  # numbering for duplicate photo names
        for photo in photoList:
            log['photo_count'] += 1
            print(u'当前图片:' + photo['name'])
            extension = _PHOTO_EXTENSIONS.get(str(photo.get('phototype')), '.jpg')
            photoname = _UNSAFE_NAME_CHARS.sub('', photo['name'])
            photoname, photo_suffix = _claim_name(photoname, used_photo_names, photo_suffix)
            path = os.path.join(album_dir, photoname + extension)
            try:
                response = opener.open(photo['url'])
                try:
                    content = response.read()
                finally:
                    response.close()
                with open(path, 'wb') as image_file:
                    image_file.write(content)
            except (urllib2.URLError, IOError) as e:
                # One failed image must not abort the whole run.
                print('保存图片出错')
                print(repr(e))

    # Record how long this account took and how much was found.
    elapsed = datetime.datetime.now() - log['time']
    log['time'] = str(int(elapsed.total_seconds())) + 's'
    _write_log(target_dir, log)
    print('当前QQ:' + target + '下载完毕')


 

开始用urllib.urlretrieve()下载的时候速度极慢,经过验证之后,证实是HTTP协议版本的问题:urllib使用的是HTTP 0.9和1.0版本,换成urllib2之后果然快多了

下面是改进过程中用到的测试代码,有兴趣的可以看下

#-*- coding: UTF-8 -*-

import os
import shlex
import subprocess
import json

# Exploratory script: fetch one account's album list and one album's photo
# list through curl and print selected fields of the JSON responses.
# Prepend the curl install directory to PATH for the curl subprocess.
os.putenv('PATH', 'D:\\curl-7.33.0-win64-nossl\\;'+os.getenv('PATH'))
# Original curl requests.
# NOTE(review): fcg_list_album_v3 / cgi_list_photo look like placeholders for
# the copied cURL commands (the main script assigns quoted strings) — fill in
# before running.
origin_album = fcg_list_album_v3
origin_photo = cgi_list_photo


# An album's "id" corresponds to the topicId parameter of the photo-list URL.
# NOTE(review): QQ号码 ("QQ number") looks like a placeholder for the target
# account — replace it with the number as a string before running.
target=QQ号码

print target
print origin_album

hostUin=origin_album.split('&hostUin=')[1].split('&')[0]
print hostUin
# Build the real curl command first, then run it to get the JSON data.
curl = origin_album.replace(hostUin, target)    # swap in the account being visited
curl = curl.replace('&pageNumModeSort=40', '&pageNumModeSort=100')          # number of albums requested
args = shlex.split(curl)
result = subprocess.check_output(args).decode('utf-8')
# NOTE(review): find(')', -1) only inspects the last character of result, so
# the end index depends on exactly how the response ends — confirm.
jsonstr=result[result.find('(') + 1 : result.find(')', -1) -1]     # JSON text inside the callback(...) wrapper

print jsonstr
output = json.loads(jsonstr)                 # JSON text -> dict
# Hard-coded indices below assume at least three album classes with several
# albums each in the response.
print output['data']['albumListModeClass'][0]['albumList'][0]['name']
print output['data']['albumListModeClass'][0]['albumList'][1]['name']
print output['data']['albumListModeClass'][0]['albumList'][2]['name']
print output['data']['albumListModeClass'][0]['albumList'][3]['name']
print output['data']['albumListModeClass'][0]['totalInClass']
print output['data']['albumListModeClass'][1]['albumList'][0]['name']
print output['data']['albumListModeClass'][1]['albumList'][1]['name']
print output['data']['albumListModeClass'][1]['albumList'][2]['name']
print output['data']['albumListModeClass'][1]['albumList'][3]['name']
print output['data']['albumListModeClass'][1]['totalInClass']
print output['data']['albumListModeClass'][2]['albumList'][0]['name']
print output['data']['albumListModeClass'][2]['totalInClass']
print output['data']['albumsInUser']
print 'type'
print output['data']['mode']

albumListModeClass = output['data']['albumListModeClass']
print '------------'
# Walk every class and print each album's class id and name.
for albumClass in albumListModeClass:   
    albumList=albumClass['albumList']
    for album in albumList:
        print u'当前相册:' + str(album['classid']) + album['name']
        print '------------'

# Photo-list request below.

# allowAccess: 1 = access allowed, 0 = not allowed (per the original author's note).


album = output['data']['albumListModeClass'][1]['albumList'][1]
#
print album['allowAccess']
# "id" corresponds to topicId.
#
print album['id']
# Album name.
#
print album['name']
hostUin=origin_photo.split('&hostUin=')[1].split('&')[0] 
topicId=origin_photo.split('&topicId=')[1].split('&')[0]

curl = origin_photo.replace(hostUin, target)   # swap the visited account into the URL

curl = curl.replace(topicId, album['id'])      # swap the album id into the URL

curl = curl.replace('&pageNum=30', '&pageNum=600')  # the author observed that an album seems to hold at most 512 photos

args = shlex.split(curl)

result = subprocess.check_output(args).decode('utf-8')
print result
jsonstr=result[result.find('(') + 1 : result.find(')', -1) -1]     # JSON text inside the callback(...) wrapper
print jsonstr

output = json.loads(jsonstr)                 # JSON text -> dict
photo = output['data']['photoList'][4]
#

print photo
#print output['data']['totalInAlbum'] # total number of photos in the album

print output['data']['photoList'][4]['url']

解决了部分相册需要cookies的问题:json里面没有找到对应的字段,所以把cookies全部加上了。

cookies测试代码

#-*- coding: UTF-8 -*-
import Cookie
import urllib2
import os
import shlex
import cookielib
import json
# Cookie test: rebuild the browser's cookies from the copied photo cURL
# command and use them to download one image.

# Prepend the curl install directory to PATH (same setup as the main script).
os.putenv('PATH', 'D:\\curl-7.33.0-win64-nossl\\;'+os.getenv('PATH'))
# Original curl requests.  fcg_list_album_v3 / cgi_list_photo are left exactly
# as the author wrote them: fill in the copied cURL commands before running.
origin_album = fcg_list_album_v3
origin_photo = cgi_list_photo

# Extract the value of the `"Cookie: ..."` header.  The value ends at the
# first double quote after the header name; looking for it from the header
# position keeps the value intact even when Cookie is the last header of the
# command.
cookie_marker = '"Cookie:'
marker_pos = origin_photo.find(cookie_marker)
if marker_pos == -1:
    # No Cookie header present: continue with an empty cookie jar.
    cookie_str = ''
else:
    value_start = marker_pos + len(cookie_marker)
    value_end = origin_photo.find('"', value_start)
    if value_end == -1:
        value_end = len(origin_photo)
    cookie_str = origin_photo[value_start:value_end].strip()
print(cookie_str)
cookie_domain = '.photo.store.qq.com'
cookie_path = '/'

simple_cookie = Cookie.SimpleCookie(cookie_str)    # parse "name=value; ..." pairs
cookiejar = cookielib.CookieJar()    # starts empty
for cookie_name, morsel in simple_cookie.items():
    # Bind every parsed cookie to the photo store domain.
    cookie_item = cookielib.Cookie(
        version=0, name=cookie_name, value=str(morsel.value),
        port=None, port_specified=None,
        domain=cookie_domain, domain_specified=None, domain_initial_dot=None,
        path=cookie_path, path_specified=None,
        secure=None,
        expires=None,
        discard=None,
        comment=None,
        comment_url=None,
        rest=None,
        rfc2109=False,
    )
    cookiejar.set_cookie(cookie_item)
opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cookiejar))    # opener that sends the cookies

url = realurl  # actual image URL (left as written by the author; fill in before running)

savepath = 'D:\\test.jpg'
f = opener.open(url)
try:
    # Read the whole image, then release the connection even if read() fails.
    image_data = f.read()
finally:
    f.close()
with open(savepath, "wb") as code:
    code.write(image_data)

后记:

20150718:修正第二类型相册分类数始终为3的错误

20150829:不用再手工删除 --compressed指令


 


你可能感兴趣的:(python,相册,QQ空间)