本文前传:https://blog.csdn.net/gengli2017/article/details/81940707
要根据QQ坦白说提取信息,可以通过爬虫获取好友信息,下面对获取信息过程进行讲解
0. 本文用的是Python3.7,selenium3.14.0
1. selenium是什么?selenium是一个python库,可以很方便抓取网页,此处用来自动登录QQ和抓取网页
2. gtk和g_qzonetoken是什么?可以简单理解为定位好友信息的网址的一部分,就用来定位好友信息
3. gtk和g_qzonetoken怎样获取?gtk通过浏览器cookie(其中的p_skey)计算得到,g_qzonetoken可以通过解析网页源码获取。浏览器cookie是网站保存在浏览器中的登录状态等数据。更详细的见:https://blog.csdn.net/vision_tung/article/details/78888639 ,https://blog.csdn.net/Vision_Tung/article/details/78888695 。
4. gtk和g_qzonetoken每次登录都会不同
5. 我的好友列表网址(myQQ是你的QQ号,其他参数对应替换):'https://user.qzone.qq.com/proxy/domain/r.qzone.qq.com/cgi-bin/tfriend/friend_hat_get.cgi?hat_seed=1&uin=' + str(myQQ) + '&fupdate=1&g_tk=' + str(gtk) + '&qzonetoken=' + str(g_qzonetoken) + '&g_tk=' + str(gtk)
6. 好友信息网址:'https://mobile.qzone.qq.com/profile_get?qzonetoken=' + str(g_qzonetoken) + '&g_tk=' + str(gtk) + '&format=json&hostuin=' + str(friendQQ)
7. 自己和好友认识天数信息网址:'https://user.qzone.qq.com/proxy/domain/r.qzone.qq.com/cgi-bin/friendship/cgi_friendship?activeuin=' + str(myQQ) + '&passiveuin=' + str(friendQQ) + '&situation=1&isCalendar=1&g_tk=' + str(gtk) + '&qzonetoken=' + str(g_qzonetoken) + '&g_tk=' + str(gtk)
返回内容里只提供开始认识的unix时间戳"addFriendTime"和当前系统时间戳"systemTime"(例如1534816754),两者之差就是认识的时间差,怎么转换成天数、年数等见代码
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
import time
import re
import datetime
def frankSpeak(account, password):
    """Log in to Qzone, gather profile data for every friend and print it.

    For each friend returned by ``getFriends`` this collects the profile
    fields from ``getFriendInfo`` and the acquaintance span from
    ``getKnownTime`` and prints one tuple per friend:
    ``(QQ number, remark, age, gender, constellation, known time,
    province, city)``.

    Args:
        account: QQ number used to log in; also used as the owner's QQ
            number in the friend-list and friendship queries.
        password: Password for ``account``.
    """
    myQQ = account
    driver = loginQQ(account, password)
    friendInfoVector = []
    try:
        gtk, g_qzonetoken = getGtk_Token(driver)
        friendsDict = getFriends(driver, myQQ, gtk, g_qzonetoken)
        total = len(friendsDict)
        for fetched, (friendQQ, remark) in enumerate(friendsDict.items(), start=1):
            # getFriendInfo yields (age, city, constellation, gender, province);
            # gender: male=1, female=0, unspecified=2.
            age, city, constellation, gender, province = getFriendInfo(
                driver, friendQQ, gtk, g_qzonetoken)
            knownTime = getKnownTime(driver, myQQ, friendQQ, gtk, g_qzonetoken)
            friendInfoVector.append(
                (friendQQ, remark, age, gender, constellation, knownTime,
                 province, city))
            # Frequent requests get the account locked for a couple of hours,
            # so after every five friends wait two minutes and log in again.
            # Skip the pause when there is nothing left to fetch.
            if fetched % 5 == 0 and fetched < total:
                time.sleep(120)
                # Close the old browser before opening a new one; otherwise
                # every re-login leaves a Chrome process behind.
                driver.quit()
                driver = loginQQ(account, password)
                gtk, g_qzonetoken = getGtk_Token(driver)
    finally:
        driver.quit()

    print('所有好友信息')
    print('(QQ号, 备注, 年龄, 性别, 星座, 认识时间, 省份, 城市)')
    print('女:性别=0 男:性别=1 未注明:性别=2')
    for friendInfo in friendInfoVector:
        print(friendInfo)
def loginQQ(account, password):
    """Open Chrome, log in to Qzone with account/password and return the driver.

    Args:
        account: QQ number typed into the login form.
        password: Password typed into the login form.

    Returns:
        The ``webdriver.Chrome`` instance, two seconds after the login
        button was clicked. The caller is responsible for ``quit()``.

    Raises:
        Whatever Selenium raises if a login element cannot be found; the
        browser is closed before the exception propagates.
    """
    chrome_options = Options()
    chrome_options.add_argument("--disable-infobars")
    # Hand the options to Chrome (they were previously built but unused).
    # With no executable path given, chromedriver is looked up on PATH.
    driver = webdriver.Chrome(options=chrome_options)
    try:
        driver.get('https://qzone.qq.com/')
        # The login form lives inside an iframe.
        driver.switch_to.frame('login_frame')
        # Switch from QR-code login to account/password login.
        driver.find_element_by_id('switcher_plogin').click()
        userField = driver.find_element_by_id('u')
        userField.clear()
        userField.send_keys(account)
        passwordField = driver.find_element_by_id('p')
        passwordField.clear()
        passwordField.send_keys(password)
        driver.find_element_by_id('login_button').click()
        # Give the post-login page a moment to load.
        time.sleep(2)
    except Exception:
        # Do not leave an orphaned browser behind when login fails.
        driver.quit()
        raise
    return driver
def getGTKFromCookie(cookie):
    """Compute the ``g_tk`` request token from the login cookies.

    Starting from 5381, each character ``c`` of the ``p_skey`` cookie value
    updates the hash as ``hash += (hash << 5) + ord(c)``; the result is
    masked to its low 31 bits.

    Args:
        cookie: Mapping of cookie name to value; must contain ``'p_skey'``.

    Returns:
        The hash as a non-negative int below 2**31.

    Raises:
        KeyError: If ``cookie`` has no ``'p_skey'`` entry.
    """
    hashes = 5381
    for letter in cookie['p_skey']:
        hashes += (hashes << 5) + ord(letter)
    return hashes & 0x7fffffff
def getGtk_Token(driver):
    """Return ``(gtk, g_qzonetoken)`` for the driver's current session.

    ``gtk`` is computed from the browser cookies via ``getGTKFromCookie``;
    ``g_qzonetoken`` is the quoted value extracted from the
    ``window.g_qzonetoken = (function(){ try{return "...";} catch(e)``
    snippet in the current page source.

    Args:
        driver: A Selenium driver whose current page is the logged-in
            Qzone page.

    Returns:
        Tuple ``(gtk, g_qzonetoken)``.

    Raises:
        RuntimeError: If the token snippet is not present in the page
            source or does not contain a quoted value.
    """
    # Flatten Selenium's list of cookie dicts into {name: value}.
    cookie = {elem['name']: elem['value'] for elem in driver.get_cookies()}
    gtk = getGTKFromCookie(cookie)

    html = driver.page_source
    tokenMatch = re.search(
        r'window\.g_qzonetoken = \(function\(\)\{ try\{return (.*?);\} catch\(e\)',
        html)
    if tokenMatch is None:
        raise RuntimeError('g_qzonetoken not found in page source')
    # The captured expression is the token wrapped in double quotes.
    quotedParts = tokenMatch.group(1).split('"')
    if len(quotedParts) < 2:
        raise RuntimeError('g_qzonetoken in page source is not a quoted string')
    g_qzonetoken = quotedParts[1]
    return gtk, g_qzonetoken
def getFriends(driver, myQQ, gtk, g_qzonetoken):
    """Fetch the friend list and return it as ``{QQ number: remark name}``.

    Loads the ``friend_hat_get.cgi`` URL in ``driver`` and scans the page
    source for entries of the form ``"<key>":{`` + newline +
    ``"realname":"<name>"}``.

    Args:
        driver: Logged-in Selenium driver.
        myQQ: QQ number whose friend list is requested.
        gtk: ``g_tk`` token for the session.
        g_qzonetoken: ``qzonetoken`` for the session.

    Returns:
        Dict mapping each matched key (as ``str``) to its ``realname``
        value (as ``str``); empty if nothing matched.
    """
    friendUrl = (
        'https://user.qzone.qq.com/proxy/domain/r.qzone.qq.com/cgi-bin/tfriend/'
        'friend_hat_get.cgi?hat_seed=1&uin=' + str(myQQ)
        + '&fupdate=1&g_tk=' + str(gtk)
        + '&qzonetoken=' + str(g_qzonetoken)
        + '&g_tk=' + str(gtk)
    )
    driver.get(friendUrl)
    friend_list = str(driver.page_source)
    # Raw string keeps the regex escapes intact. re.S lets "." (the first
    # character of the key and the name body) also match newlines.
    pattern = re.compile(r'"(.\d*)":\{\n"realname":"(.*?)"}', re.S)
    return {str(qq): str(name) for qq, name in pattern.findall(friend_list)}
def getFriendInfo(driver, friendQQ, gtk, g_qzonetoken):
    """Fetch one friend's profile and return its basic fields.

    Loads the ``profile_get`` URL in ``driver`` and extracts, in this order
    of appearance on a single line of the page source, the ``age``,
    ``city``, ``constellation``, ``gender`` and ``province`` fields.

    Args:
        driver: Logged-in Selenium driver.
        friendQQ: QQ number of the friend to look up.
        gtk: ``g_tk`` token for the session.
        g_qzonetoken: ``qzonetoken`` for the session.

    Returns:
        Tuple of strings ``(age, city, constellation, gender, province)``.
        Gender is reported as male=1, female=0, otherwise 2. When the
        fields cannot be found the placeholder
        ``('-1', 'NULL', 'NULL', '2', 'NULL')`` is returned.
    """
    friendInfoUrl = (
        'https://mobile.qzone.qq.com/profile_get?qzonetoken=' + str(g_qzonetoken)
        + '&g_tk=' + str(gtk)
        + '&format=json&hostuin=' + str(friendQQ)
    )
    driver.get(friendInfoUrl)
    friendInfo = str(driver.page_source)
    pattern = re.compile(
        r'"age":(\d*).*"city":"(\w*)".*"constellation":"(\w*).*"gender":(-?\d*).*"province":"(\w*)"')
    # findall yields a list of 5-tuples, one per match; only the first is used.
    usefulInfo = pattern.findall(friendInfo)
    if usefulInfo:
        return usefulInfo[0]
    return ('-1', 'NULL', 'NULL', '2', 'NULL')
def getKnownTime(driver, myQQ, friendQQ, gtk, g_qzonetoken):
    """Return a coarse label for how long two accounts have been friends.

    Loads the ``cgi_friendship`` URL in ``driver`` and reads the Unix
    timestamps ``addFriendTime`` and ``systemTime`` from the page source.
    Both are converted to local calendar dates and compared field by field:

    * different years  -> ``'<lastYear - beginYear + 1>年'``
    * else later month -> ``'<lastMonth - beginMonth + 1>月'``
    * otherwise        -> ``'<lastDay - beginDay + 1>日'``

    Args:
        driver: Logged-in Selenium driver.
        myQQ: The logged-in user's QQ number.
        friendQQ: The friend's QQ number.
        gtk: ``g_tk`` token for the session.
        g_qzonetoken: ``qzonetoken`` for the session.

    Returns:
        The label string described above, or ``'NULL'`` when either
        timestamp is missing from the response.
    """
    knownDaysUrl = (
        'https://user.qzone.qq.com/proxy/domain/r.qzone.qq.com/cgi-bin/friendship/'
        'cgi_friendship?activeuin=' + str(myQQ)
        + '&passiveuin=' + str(friendQQ)
        + '&situation=1&isCalendar=1&g_tk=' + str(gtk)
        + '&qzonetoken=' + str(g_qzonetoken)
        + '&g_tk=' + str(gtk)
    )
    driver.get(knownDaysUrl)
    knownDaysInfo = str(driver.page_source)

    beginMatch = re.search(r'"addFriendTime":(\d+)', knownDaysInfo)
    lastMatch = re.search(r'"systemTime":(\d+)', knownDaysInfo)
    if beginMatch is None or lastMatch is None:
        # Same placeholder getFriendInfo uses for unavailable data.
        return 'NULL'

    # date.fromtimestamp interprets the stamp in the local time zone.
    beginTime = datetime.date.fromtimestamp(int(beginMatch.group(1)))
    lastTime = datetime.date.fromtimestamp(int(lastMatch.group(1)))

    if lastTime.year > beginTime.year:
        return str(lastTime.year - beginTime.year + 1) + '年'
    # Compare against the begin month; comparing lastM with itself made
    # this branch unreachable.
    if lastTime.month > beginTime.month:
        return str(lastTime.month - beginTime.month + 1) + '月'
    return str(lastTime.day - beginTime.day + 1) + '日'
# Run the program: replace 'QQNumber' with the QQ account to log in with and
# 'password' with its password. Guarded so that importing this module does
# not start a browser session.
if __name__ == "__main__":
    frankSpeak('QQNumber', 'password')