公司有一个微信公众号,每天都会推送同事原创的文章,技术类、文学类、生活类什么都有。虽然微信本身提供了报表功能,但不能满足我们自己的运营需求,每月的数据还得手工统计,后来我就帮忙写了个脚本来抓取数据。
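通过抓包工具可以分析出登录和抓取的步骤如下:打开登录页面 → 提交用户名和密码,得到二维码页面的地址 → 打开二维码页面并下载二维码 → 轮询二维码状态,直到被扫描 → 完成登录并打开首页 → 从跳转地址中解析出 token → 带着 token 请求用户分析和图文分析数据。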
有几个点需要注意:
# coding:utf-8
import calendar
import datetime
import json
import random
import time
import requests
from PIL import Image
from urllib3.exceptions import InsecureRequestWarning
from wechat.db_util.db_util import insert_appmsg, insert_user
from wechat.login.config import password, username
# Silence the InsecureRequestWarning raised for requests sent with verify=False.
requests.packages.urllib3.disable_warnings(InsecureRequestWarning)

# Base URL that the endpoint paths below are appended to.
root_url = 'https://mp.weixin.qq.com'

# Browser-like headers passed to the requests below.
# NOTE: get_useranalysis() overwrites the 'referer' entry in place.
headers = {
    'accept-encoding': 'gzip, deflate, sdch, br',
    'accept-language': 'en-US,en;q=0.8,zh-CN;q=0.6,zh;q=0.4',
    'user-agent': (
        'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 '
        '(KHTML, like Gecko) Chrome/58.0.3029.96 Safari/537.36'
    ),
    'accept': '*/*',
    'x-requested-with': 'XMLHttpRequest',
    'connection': 'keep-alive',
    'cache-control': 'no-cache',
    'referer': 'https://mp.weixin.qq.com/',
}

# Single session object shared by every request in this script.
client = requests.Session()
def init_login_page():
    """Request the site root with the shared session before logging in.

    The response is discarded.
    NOTE(review): the original comment says this step obtains a "u_id" -
    presumably a cookie kept on the session; confirm against a capture.
    """
    client.get(root_url, headers=headers, verify=False)
def submit_pwd():
    """Submit the configured user name and password to start the login.

    Returns:
        The ``redirect_url`` value of the JSON response; the caller passes
        it to ``get_qr_code_page``.

    Raises:
        KeyError: if the JSON response has no ``redirect_url`` entry.
    """
    url = root_url + '/cgi-bin/bizlogin'
    querystring = {"action": "startlogin"}
    # Passing a dict lets requests form-encode every value, so credentials
    # containing '&', '=', '+' or '%' are transmitted intact instead of
    # corrupting the hand-concatenated body. requests also adds the matching
    # application/x-www-form-urlencoded content type for dict payloads.
    payload = {
        "username": username,
        "pwd": password,
        "imgcode": "",
        "f": "json",
        "token": "",
        "lang": "zh_CN",
        "ajax": "1",
    }
    response = client.request("POST", url, data=payload, headers=headers,
                              params=querystring, verify=False)
    return json.loads(response.text)['redirect_url']
def get_qr_code_page(redirect_url):
    """Request the page that ``redirect_url`` (a path relative to the site
    root) points to; the response is discarded.

    The caller passes the value returned by ``submit_pwd``.
    """
    client.get(root_url + redirect_url, headers=headers)
def get_qr_code():
    """Download the login QR code to ``login.jpg`` and show it with PIL."""
    qr_code = client.get(root_url + '/cgi-bin/loginqrcode?action=getqrcode')
    # The context manager guarantees the image is flushed and closed before
    # PIL re-opens the same path (the original never closed the handle).
    with open('login.jpg', 'wb') as qr_file:
        qr_file.write(qr_code.content)
    img = Image.open('login.jpg')
    img.show()
def check_qr_code_status():
    """Poll the QR-code status endpoint until ``user_category`` equals 3.

    Every response body is printed. Between unsuccessful polls the function
    sleeps for one second; there is no upper bound on the number of polls.
    NOTE(review): 3 presumably means the code was scanned - confirm.

    Raises:
        KeyError: if a response has no ``user_category`` entry.
    """
    status_url = root_url + "/cgi-bin/loginqrcode"
    query = {"action": "ask", "token": "", "lang": "zh_CN", "f": "json", "ajax": "1"}
    scanned = False
    while not scanned:
        content = client.get(status_url, headers=headers, params=query).text
        print(content)
        scanned = json.loads(content)['user_category'] == 3
        if not scanned:
            time.sleep(1)
def final_login():
    """Send the final login POST and return its ``redirect_url``.

    Returns:
        The ``redirect_url`` value of the JSON response. The caller opens
        it with ``get_home_page`` and parses the token out of it.

    Raises:
        KeyError: if the JSON response has no ``redirect_url`` entry.
    """
    response = client.post(
        root_url + "/cgi-bin/bizlogin",
        params={"action": "login"},
        data="token=&lang=zh_CN&f=json&ajax=1",
        headers=headers,
        verify=False,
    )
    return json.loads(response.text)['redirect_url']
def get_home_page(redirect_url):
    """Request the page that ``redirect_url`` (a path relative to the site
    root) points to; the response is discarded.

    The caller passes the value returned by ``final_login``.
    """
    client.get(root_url + redirect_url, headers=headers, verify=False)
def get_useranalysis(token, begin, end, month):
    """Fetch the user-analysis data and pass it to ``insert_user``.

    Args:
        token: Token string parsed from the post-login redirect URL.
        begin: Sent as ``begin_date``; the caller passes 'YYYY-MM-DD'.
        end: Sent as ``end_date``; the caller passes 'YYYY-MM-DD'.
        month: Forwarded unchanged to ``insert_user`` with the response text.
    """
    query = {
        "": "",
        "begin_date": begin,
        "end_date": end,
        "source": "99999999,99999999",
        "token": token,
        "lang": "zh_CN",
        "f": "json",
        "ajax": "1",
        "random": random.uniform(0, 1),
    }
    # NOTE: this rewrites the referer in the module-level headers dict, so
    # every request made afterwards is sent with this referer as well.
    headers['referer'] = "https://mp.weixin.qq.com/misc/useranalysis?&token=" + token + "&lang=zh_CN"
    reply = client.get("https://mp.weixin.qq.com/misc/useranalysis",
                       headers=headers, params=query)
    insert_user(reply.text, month)
def get_appmsganalysis(token, begin, end, month):
    """Fetch the article (appmsg) analysis data and pass it to ``insert_appmsg``.

    Args:
        token: Token string parsed from the post-login redirect URL.
        begin: Sent as ``begin_date``; the caller passes 'YYYY-MM-DD'.
        end: Sent as ``end_date``; the caller passes 'YYYY-MM-DD'.
        month: Forwarded unchanged to ``insert_appmsg`` with the response text.
    """
    url = "https://mp.weixin.qq.com/misc/appmsganalysis"
    querystring = {
        "action": "all",
        "begin_date": begin,
        "end_date": end,
        "order_by": "1",
        "order_direction": "2",
        "token": token,
        "lang": "zh_CN",
        "f": "json",
        "ajax": "1",
        # Fresh value per call, matching get_useranalysis(); the original
        # replayed one constant copied from a captured request.
        "random": random.uniform(0, 1),
    }
    response = client.request("GET", url, headers=headers, params=querystring)
    insert_appmsg(response.text, month)
def _extract_token(redirect_url):
    """Return the value of the ``token`` query parameter of *redirect_url*."""
    # Function-scope import keeps this block self-contained.
    from urllib.parse import parse_qs, urlparse

    tokens = parse_qs(urlparse(redirect_url).query).get('token')
    if tokens:
        return tokens[-1]
    # No non-empty token parameter: fall back to the previous behaviour,
    # i.e. everything after the last '='.
    return redirect_url.rsplit('=', 1)[-1]


def domain(month):
    """Log in, then fetch and store the statistics for *month*.

    Args:
        month: Month to fetch, as 'YYYY-MM' text (e.g. "2017-08").
    """
    begin, end = get_month_first_day_last_day(month)
    # Initialise the login page.
    init_login_page()
    # Submit user name and password; the reply carries the QR-code page URL.
    redirect_url = submit_pwd()
    # Open the QR-code page.
    get_qr_code_page(redirect_url)
    # Download the QR code locally and display it.
    get_qr_code()
    # Poll the QR-code status until it has been scanned.
    check_qr_code_status()
    # Finish the login and open the page it redirects to.
    redirect_url = final_login()
    get_home_page(redirect_url)
    # Every later request needs the token carried by the redirect URL. It is
    # looked up by parameter name so the query-string order no longer matters.
    token = _extract_token(redirect_url)
    # Fetch the user-analysis data.
    get_useranalysis(token, begin, end, month)
    # Fetch the article-analysis data.
    get_appmsganalysis(token, begin, end, month)
    print("done")
def get_month_first_day_last_day(str=None):
    """Return the first and last day of a month as 'YYYY-MM-DD' strings.

    Args:
        str: Month as '<year>-<month>' text, e.g. "2017-08". The name shadows
            the builtin but is kept so keyword callers keep working. The
            ``None`` default is not handled: calling without an argument
            raises AttributeError.

    Returns:
        A ``(first_day, last_day)`` tuple of formatted date strings.
    """
    pieces = str.split('-')
    year, month = int(pieces[0]), int(pieces[1])
    # monthrange() yields (weekday of the 1st, days in month); only the
    # day count is needed here.
    days_in_month = calendar.monthrange(year, month)[1]
    date_format = '%Y-%m-%d'
    boundaries = [datetime.date(year, month, day) for day in (1, days_in_month)]
    return tuple(time.strftime(date_format, day.timetuple()) for day in boundaries)
if __name__ == '__main__':
    # The month to scrape ('YYYY-MM') can be given as the first command-line
    # argument; without one, the previously hard-coded month is used.
    import sys

    domain(sys.argv[1] if len(sys.argv) > 1 else "2017-08")
这里面的 wechat/login/config.py 配置的是账号和密码,不方便贴出来;insert 操作是把数据插入 MongoDB,这里就不赘述了。
其中很多请求的代码都是用 Postman 生成的,这里简单介绍一下:
比如要用 Python 3 抓取百度首页,可以先拿到该请求对应的 curl 命令(如下),再交给 Postman 生成对应的 Python 代码:
curl 'https://www.baidu.com/' -H 'Accept-Encoding: gzip, deflate, sdch, br' -H 'Accept-Language: en-US,en;q=0.8,zh-CN;q=0.6,zh;q=0.4' -H 'Upgrade-Insecure-Requests: 1' -H 'User-Agent: Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.96 Safari/537.36' -H 'Accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8' -H 'Cookie: BAIDUID=36E510C5B8088D855DC1BC020DA0FBCE:FG=1; PSTM=1500784814; BIDUPSID=F29907779469DA92CEA9E9E0A8B1C162; BDUSS=UppSnptcGtpZlJrYjNxelhSa01zcldIMnlQbjNYZWRvVzc0bmN0TkVTeHlSYlpaTVFBQUFBJCQAAAAAAAAAAAEAAADo6~If8LLT8Men0rkzMjcAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAHK4jllyuI5ZW; H_PS_PSSID=1431_24544_21096_18559_17001_20928; BD_UPN=12314353; sug=3; sugstore=1; ORIGIN=0; bdime=21110; BDORZ=B490B5EBF6F3CD402E515D22BCDA1598; BDSVRTM=0' -H 'Connection: keep-alive' --compressed
这样就能抓取公众号后台的数据,然后进一步进行分析和生成报表了。生成报表的过程中还用到了历史文章、作者以及发布时间等信息,之前公众号后台有提供这些数据,后来不知怎么就没有了,所以我只好换其他方式抓取,下次再介绍吧。