【关键字】企业微信会话内容存档 聊天记录备份 聊天记录监控
出于监管需求,金融版企业微信聊天记录可以进行备份,腾讯提供了SDK:libWeWorkFinanceSdk_C.so。据说我们是第一个用上金融版企业微信的公司。
chatbackuper.py
#-*- coding: utf8 -*-
import os
import ctypes
import base64
import json
import time
import logging
import psycopg2
from ctypes import *
from Crypto.PublicKey import RSA
from Crypto.Cipher import PKCS1_v1_5
from configparser import ConfigParser
class FinanceSdk:
    """ctypes wrapper around Tencent's ``libWeWorkFinanceSdk_C.so``.

    Per the SDK's C header, the SDK instance, slices and media-data buffers
    are all opaque C pointers.  They are therefore declared as
    ``ctypes.c_void_p`` everywhere: ctypes' default return type is a 32-bit
    ``c_int``, which silently truncates pointers on 64-bit builds and leads
    to segmentation faults inside the library.
    """

    def __init__(self, path):
        """Load the shared library located in directory *path*.

        *path* is concatenated with the library file name as-is, so it must
        end with a path separator.
        """
        self.so = ctypes.CDLL(path + 'libWeWorkFinanceSdk_C.so')
        # Handle returned by NewSdk(); None until init_sdk() has been called.
        self.sdk = None

    @staticmethod
    def _encode(value):
        """Return *value* as UTF-8 bytes; ``None`` becomes ``b''``."""
        return ('' if value is None else value).encode('utf8')

    def _bind(self, name, restype, argtypes):
        """Look up C function *name* and declare its prototype."""
        func = getattr(self.so, name)
        func.restype = restype
        func.argtypes = argtypes
        return func

    def init_sdk(self, cropid, secret):
        """Create an SDK instance and initialise it with the credentials.

        Returns the SDK status code (0 on success).  Returns -1 when the
        library could not allocate an SDK instance.
        """
        new_sdk = self._bind('NewSdk', ctypes.c_void_p, [])
        init = self._bind(
            'Init', ctypes.c_int,
            [ctypes.c_void_p, ctypes.c_char_p, ctypes.c_char_p])
        self.sdk = new_sdk()
        if self.sdk is None:
            # NewSdk() returned NULL; never hand a NULL handle to Init().
            return -1
        return init(self.sdk, cropid.encode('utf8'), secret.encode('utf8'))

    def destory_sdk(self):
        """Release the SDK instance created by :meth:`init_sdk`, if any."""
        if self.sdk is None:
            return
        destroy = self._bind('DestroySdk', None, [ctypes.c_void_p])
        destroy(self.sdk)
        self.sdk = None

    def get_chat_data(self, seq, limit, proxy, passwd, timeout):
        """Fetch up to *limit* encrypted chat records following *seq*.

        *proxy* and *passwd* may be ``None`` or empty when no proxy is used.
        Returns the raw response as ``bytes`` (a JSON document), or ``None``
        when the SDK reports an error.
        """
        func = self._bind(
            'GetChatData', ctypes.c_int,
            [ctypes.c_void_p, ctypes.c_ulonglong, ctypes.c_uint,
             ctypes.c_char_p, ctypes.c_char_p, ctypes.c_int,
             ctypes.c_void_p])
        chunk = self.new_slice()
        if chunk is None:
            return None
        try:
            status = func(self.sdk, seq, limit, self._encode(proxy),
                          self._encode(passwd), timeout, chunk)
            if status != 0:
                return None
            # c_char_p results are copied into a Python bytes object, so the
            # slice can safely be freed right after this call.
            return self.get_content_from_slice(chunk)
        finally:
            self.free_slice(chunk)

    def decrypt_data(self, encrypt_key, private_key, encrypt_msg):
        """Decrypt one chat record.

        *encrypt_key* is the base64 RSA-encrypted random key of the record,
        *private_key* the PEM text of the matching RSA private key and
        *encrypt_msg* the encrypted message body.  Returns the decrypted
        message as ``bytes``, or ``None`` when either the RSA step or the
        SDK call fails.
        """
        func = self._bind(
            'DecryptData', ctypes.c_int,
            [ctypes.c_char_p, ctypes.c_char_p, ctypes.c_void_p])
        key = self.rsa_decrypt(private_key, encrypt_key)
        if key is None:
            # RSA decryption failed; passing NULL to the SDK is not safe.
            return None
        chunk = self.new_slice()
        if chunk is None:
            return None
        try:
            if func(key, encrypt_msg.encode('utf8'), chunk) != 0:
                return None
            return self.get_content_from_slice(chunk)
        finally:
            self.free_slice(chunk)

    def get_media_data(self, index_buf, sdk_field, proxy, passwd, timeout,
                       media_data):
        """Download one chunk of a media file into *media_data*.

        *index_buf* is the continuation marker returned by the previous
        chunk (empty or ``None`` for the first one) and *sdk_field* the
        ``sdkfileid`` of the message.  Returns the SDK status code
        (0 on success).
        """
        func = self._bind(
            'GetMediaData', ctypes.c_int,
            [ctypes.c_void_p, ctypes.c_char_p, ctypes.c_char_p,
             ctypes.c_char_p, ctypes.c_char_p, ctypes.c_int,
             ctypes.c_void_p])
        return func(self.sdk, self._encode(index_buf),
                    self._encode(sdk_field), self._encode(proxy),
                    self._encode(passwd), timeout, media_data)

    def new_slice(self):
        """Allocate an SDK slice; returns its handle (``None`` if NULL)."""
        return self._bind('NewSlice', ctypes.c_void_p, [])()

    def get_content_from_slice(self, slice):
        """Return the NUL-terminated content of *slice* as ``bytes``."""
        func = self._bind(
            'GetContentFromSlice', ctypes.c_char_p, [ctypes.c_void_p])
        return func(slice)

    def free_slice(self, slice):
        """Release a slice obtained from :meth:`new_slice`."""
        self._bind('FreeSlice', None, [ctypes.c_void_p])(slice)

    def new_media_data(self):
        """Allocate a media-data buffer; returns its handle."""
        return self._bind('NewMediaData', ctypes.c_void_p, [])()

    def get_data(self, media_data):
        """Return the binary payload currently held by *media_data*."""
        func = self._bind('GetData', ctypes.c_void_p, [ctypes.c_void_p])
        ptr = func(media_data)
        size = self.get_data_len(media_data)
        if not ptr or size <= 0:
            return b''
        # The payload is binary, so copy exactly `size` bytes instead of
        # relying on NUL termination.
        return ctypes.string_at(ptr, size)

    def get_data_len(self, media_data):
        """Return the payload length of *media_data* in bytes."""
        func = self._bind('GetDataLen', ctypes.c_int, [ctypes.c_void_p])
        return func(media_data)

    def get_out_index_buf(self, media_data):
        """Return the continuation marker for the next chunk as ``bytes``.

        ``None`` is returned when the library hands back a NULL pointer.
        """
        func = self._bind(
            'GetOutIndexBuf', ctypes.c_char_p, [ctypes.c_void_p])
        return func(media_data)

    def is_media_data_finish(self, media_data):
        """Return the SDK's finished flag for *media_data* (1 = last chunk)."""
        func = self._bind(
            'IsMediaDataFinish', ctypes.c_int, [ctypes.c_void_p])
        return func(media_data)

    def free_media_data(self, media_data):
        """Release a buffer obtained from :meth:`new_media_data`."""
        self._bind('FreeMediaData', None, [ctypes.c_void_p])(media_data)

    def rsa_decrypt(self, private_key, text):
        """RSA-decrypt (PKCS#1 v1.5) the base64 string *text*.

        *private_key* is the PEM text of the private key.  Returns the
        plaintext ``bytes``, or ``None`` (the sentinel passed to the cipher)
        when decryption fails.
        """
        pri_key = RSA.import_key(private_key)
        cipher = PKCS1_v1_5.new(pri_key)
        return cipher.decrypt(base64.b64decode(text), None)
class Backuper:
    """Pull archived chat records through FinanceSdk and persist them.

    Each decrypted message is appended to a per-day text file, its media
    (if any) is downloaded next to it, and a row is written to the
    ``messages`` table.  An instance is single-use: :meth:`backup` closes
    the database connection when it finishes.
    """

    def __init__(self):
        """Read ``config.cfg`` and open the PostgreSQL connection."""
        self.config = self._read_config()
        db = self.config['database']
        self.conn = psycopg2.connect(
            host=db['host'],
            port=db['port'],
            database=db['database'],
            user=db['user'],
            password=db['password'])
        self.logger = logging.getLogger(__name__)
        # Message types whose payload has to be downloaded separately.
        self.media_types = ['image', 'video', 'voice', 'emotion', 'file']
        # Default file extensions for types that carry no 'fileext' field.
        self.exts = {'image': 'jpg', 'video': 'mp4', 'voice': 'amr'}

    def backup(self):
        """Run one backup pass, then close the database connection.

        The connection is closed on every path (including SDK errors and
        exceptions) so that repeated passes do not leak connections.
        """
        try:
            self._backup_batch()
        finally:
            self.conn.close()

    def _backup_batch(self):
        """Fetch one batch after the last stored sequence and commit it."""
        seq = self._get_sequance()
        sdk = FinanceSdk(self.config['backup']['root_path'])
        weixin = self.config['weixin']
        sdk_ready = False
        try:
            status = sdk.init_sdk(weixin['cropid'], weixin['secret'])
            sdk_ready = True
            if status != 0:
                self.logger.error('SDK init error')
                return
            res = sdk.get_chat_data(seq, weixin['limit'],
                                    weixin['proxy_url'],
                                    weixin['proxy_password'],
                                    weixin['timeout'])
            if res is None:
                self.logger.error('SDK getCharData error')
                return
            for item in json.loads(res)['chatdata']:
                self._backup_item(sdk, item)
        finally:
            # Only destroy a handle that init_sdk() actually created.
            if sdk_ready:
                sdk.destory_sdk()
        try:
            self.conn.commit()
        except Exception as e:
            self.conn.rollback()
            self.logger.error('db error: ' + str(e))

    def _backup_item(self, sdk, item):
        """Decrypt one chat record and store its text, media and DB row."""
        weixin = self.config['weixin']
        res = sdk.decrypt_data(item['encrypt_random_key'],
                               weixin['private_key'],
                               item['encrypt_chat_msg'])
        if res is None:
            self.logger.error('SDK decryptData error: ' + str(item['seq']))
            return
        data = json.loads(res)
        self.logger.info('SDK decryptData: ' +
                         str(item['seq']) + '-' + data['msgtype'])
        self.save_message(data, item['seq'])
        file_path = ''
        if data['msgtype'] in self.media_types:
            file_path = self.save_media_data(sdk, data)
        self.save_record(data, file_path)

    def save_message(self, data, seq):
        """Append *data* as one JSON line to ``<path>/<YYYYMM>/<YYYYMMDD>.txt``.

        Side effect: stores *seq* in ``data['seq']``; :meth:`save_record`
        relies on it.
        """
        directory = (self.config['backup']['path'] +
                     self._format_timestamp(data['msgtime'], '%Y%m') + '/')
        os.makedirs(directory, exist_ok=True)
        filename = self._format_timestamp(data['msgtime'], '%Y%m%d') + '.txt'
        data['seq'] = seq
        with open(directory + filename, 'a') as out:
            out.write(json.dumps(data) + '\r\n')

    def _media_extension(self, msgtype, raw_data):
        """Return the file extension (with leading dot) for a media message.

        Preference order: the message's own ``fileext``, the per-type
        default, then gif/png for emotions.  Returns ``''`` when unknown.
        """
        if 'fileext' in raw_data:
            ext = raw_data['fileext']
        elif msgtype in self.exts:
            ext = self.exts[msgtype]
        elif msgtype == 'emotion':
            ext = 'gif' if raw_data['type'] == 1 else 'png'
        else:
            ext = ''
        return '.' + ext if ext else ''

    def save_media_data(self, sdk, data):
        """Download the media of *data* to ``<path>/<YYYYMM>/media/``.

        The file is fetched chunk by chunk; every SDK media buffer is freed
        even when an exception occurs.  Returns the full path of the saved
        file, or ``''`` when the download failed (the partial file is
        removed, so no path to a missing file is ever recorded).
        """
        weixin = self.config['weixin']
        msgtype = data['msgtype']
        raw_data = data[msgtype]
        directory = (self.config['backup']['path'] +
                     self._format_timestamp(data['msgtime'], '%Y%m') +
                     '/media/')
        os.makedirs(directory, exist_ok=True)
        target = (directory + msgtype + '_' + data['msgid'] +
                  self._media_extension(msgtype, raw_data))
        complete = False
        # 'wb' truncates any file left over from an earlier attempt.
        with open(target, 'wb') as out:
            index_buf = ''
            while True:
                media_data = sdk.new_media_data()
                try:
                    res = sdk.get_media_data(index_buf,
                                             raw_data['sdkfileid'],
                                             weixin['proxy_url'],
                                             weixin['proxy_password'],
                                             weixin['timeout'],
                                             media_data)
                    if res != 0:
                        self.logger.error('SDK getMediaData error: ' +
                                          json.dumps(data))
                        break
                    out.write(sdk.get_data(media_data))
                    if sdk.is_media_data_finish(media_data) == 1:
                        complete = True
                        break
                    next_index = sdk.get_out_index_buf(media_data)
                    if next_index is None:
                        # Without a continuation marker the next request
                        # would restart from the first chunk forever.
                        self.logger.error('SDK getMediaData error: ' +
                                          json.dumps(data))
                        break
                    index_buf = str(next_index, encoding='utf8')
                finally:
                    sdk.free_media_data(media_data)
        if not complete:
            if os.path.exists(target):
                os.remove(target)
            return ''
        self.logger.info('media file saved: ' + target)
        return target

    def save_record(self, data, file_path):
        """Insert (or replace) the ``messages`` row for *data*.

        *file_path* is stored relative to the backup directory.  The write
        is best-effort: failures are logged and undone with a savepoint so
        that one bad record does not abort the rest of the batch.
        """
        file_path = file_path or ''
        backup_path_len = len(self.config['backup']['path'])
        if len(file_path) > backup_path_len:
            file_path = file_path[backup_path_len:]
        cursor = self.conn.cursor()
        try:
            cursor.execute('SAVEPOINT save_record')
            try:
                self._write_record(cursor, data, file_path)
            except Exception:
                cursor.execute('ROLLBACK TO SAVEPOINT save_record')
                raise
            cursor.execute('RELEASE SAVEPOINT save_record')
        except Exception:
            self.logger.exception('db error: failed to save message %s',
                                  data.get('msgid'))
        finally:
            cursor.close()

    def _write_record(self, cursor, data, file_path):
        """Delete any existing row for the message and insert a fresh one.

        Values are passed as query parameters so that quotes in message
        content can neither break the statement nor inject SQL.
        """
        msgtype = data['msgtype']
        content = ''
        raw_json = ''
        if msgtype in data:
            raw_data = data[msgtype]
            raw_json = json.dumps(raw_data)
            if msgtype == 'text':
                content = raw_data['content']
        params = (
            data['msgid'],
            str(data['seq']),
            data['action'],
            data['from'],
            ','.join(data['tolist']),
            data['roomid'] if 'roomid' in data else '',
            self._format_time(data['msgtime']),
            msgtype,
            content,
            raw_json,
            file_path,
            self._format_time(int(time.time() * 1000)),
        )
        cursor.execute('delete from messages where id = %s',
                       (data['msgid'],))
        cursor.execute(
            'insert into messages (id, sequance, action, from_user, '
            'to_user_list, room_id, message_time, message_type, '
            'content, raw_data, file_path, created_at) values ('
            '%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)',
            params)

    def _read_config(self):
        """Load ``config.cfg`` from the current working directory.

        Returns a dict of sections.  The private key file is read into
        ``config['weixin']['private_key']``, the backup path is made
        absolute with a trailing slash, and numeric settings are converted
        to ``int``.
        """
        path = os.getcwd()
        if not path.endswith('/'):
            path = path + '/'
        cp = ConfigParser()
        cp.read(path + 'config.cfg')
        config = {}
        for section in cp.sections():
            config[section] = dict(cp.items(section))
        with open(path + config['weixin']['private_key']) as key_file:
            config['weixin']['private_key'] = key_file.read()
        backup_path = config['backup']['path']
        if not backup_path.startswith('/'):
            backup_path = path + backup_path
        if not backup_path.endswith('/'):
            backup_path = backup_path + '/'
        config['backup']['root_path'] = path
        config['backup']['path'] = backup_path
        config['database']['port'] = int(config['database']['port'])
        config['weixin']['limit'] = int(config['weixin']['limit'])
        config['weixin']['timeout'] = int(config['weixin']['timeout'])
        config['backup']['interval'] = int(config['backup']['interval'])
        return config

    def _get_sequance(self):
        """Return the highest stored sequence number, or 0 if none.

        On a database error the failure is logged, the aborted transaction
        is rolled back and 0 is returned (best-effort, as before).
        """
        cursor = self.conn.cursor()
        try:
            cursor.execute('select max(sequance) from messages')
            row = cursor.fetchone()
            if row is None or row[0] is None:
                return 0
            return row[0]
        except Exception:
            self.logger.exception('db error: cannot read last sequance')
            try:
                # Leave the connection usable for the rest of the pass.
                self.conn.rollback()
            except Exception:
                self.logger.exception('db error: rollback failed')
            return 0
        finally:
            cursor.close()

    def _format_timestamp(self, timestamp, format):
        """Format a millisecond Unix *timestamp* in local time."""
        time_array = time.localtime(timestamp / 1000)
        return time.strftime(format, time_array)

    def _format_time(self, timestamp):
        """Return *timestamp* (ms) as ``YYYY-MM-DD HH:MM:SS.mmm`` local time.

        The millisecond part is zero-padded to three digits; without the
        padding 5 ms would be written as ``.5`` and read back as 500 ms.
        """
        time_array = time.localtime(timestamp / 1000)
        milliseconds = '%03d' % (timestamp % 1000)
        return time.strftime('%Y-%m-%d %H:%M:%S.', time_array) + milliseconds
if __name__ == '__main__':
    # Log to out.log in the working directory, appending across restarts.
    logging.basicConfig(
        filename='out.log', filemode='a', level=logging.INFO,
        format='%(asctime)s [%(levelname)s] %(message)s',
        datefmt='%Y-%m-%d %H:%M:%S')

    # First pass runs immediately; its config also supplies the interval.
    backuper = Backuper()
    backuper.backup()
    interval = backuper.config['backup']['interval']
    start_time = time.time()

    # Poll once per second; when `interval` seconds have elapsed since the
    # previous pass finished, run a new pass with a fresh Backuper.
    while True:
        elapsed = time.time() - start_time
        if elapsed >= interval:
            backuper = Backuper()
            backuper.backup()
            start_time = time.time()
            continue
        time.sleep(1)
我用的是postgresql,就建了一个表。
tables.sql
-- Schema for the chat backup: one row per archived chat message.
-- WARNING: the DROP below discards any existing table and its data.
DROP TABLE IF EXISTS "messages";
CREATE TABLE "messages" (
-- msgid of the message; the backup script deletes and re-inserts by this id
"id" varchar(50) PRIMARY KEY NOT NULL,
-- seq of the record; the script resumes fetching from max(sequance)
"sequance" int8 NOT NULL,
"action" varchar(20) NOT NULL,
"from_user" varchar(50) NOT NULL,
-- recipients ('tolist') joined with commas
"to_user_list" text NOT NULL,
-- empty string when the message carries no roomid
"room_id" varchar(50) NOT NULL,
-- message time (msgtime, milliseconds) formatted in the server's local time
"message_time" timestamp(6) NOT NULL,
"message_type" varchar(50) NOT NULL,
-- text body for 'text' messages, empty string otherwise
"content" text NOT NULL,
-- JSON dump of the type-specific payload, empty string when absent
"raw_data" text NOT NULL,
-- media file path relative to the backup directory, empty when no media
"file_path" varchar(200) NOT NULL,
-- time the row was written by the backup script
"created_at" timestamp(6) NOT NULL
);
-- Secondary indexes on individual columns of "messages".
CREATE INDEX "sequance_idx" ON "messages"("sequance");
CREATE INDEX "action_idx" ON "messages"("action");
CREATE INDEX "from_user_idx" ON "messages"("from_user");
CREATE INDEX "room_id_idx" ON "messages"("room_id");
CREATE INDEX "message_time_idx" ON "messages"("message_time");
CREATE INDEX "message_type_idx" ON "messages"("message_type");
下面是配置文件。
config.cfg
# Credentials and fetch settings passed to the WeWork Finance SDK.
[weixin]
# Passed to the SDK's Init() together with `secret`.
cropid = wwd54b5e1234567890
secret = PO5xxxxxxx-xxxxxxxx_xxxxxxxxx-xxxxxxxxxxxxx
# Maximum number of records requested per GetChatData call.
limit = 500
# Timeout handed to GetChatData / GetMediaData (presumably seconds - confirm).
timeout = 60
# Leave both empty when no proxy is needed.
proxy_url =
proxy_password =
# RSA private key file, resolved relative to the working directory.
private_key = private_key.pem
# PostgreSQL connection used by the backup script.
[database]
host = localhost
port = 5432
database = chatdata
user = chatdata
password = password
[backup]
# Seconds to wait between two backup passes.
interval = 60
# Backup directory; relative paths are resolved against the working directory.
path = backup
# NOTE(review): this section is not read by chatbackuper.py as shown;
# presumably it belongs to a separate web front end - confirm.
[server]
host = http://localhost
port = 9001
session_expires = 3600
session_path = sessions
download_path = downloads
sso_client_id = xxxxxxxxxx1234567890
sso_client_secret = xxxxxxxxxx1234567890
sso_authorization_uri = http://test1.xxx.com/oauth/authorize
sso_logout_uri = http://test1.xxx.com/logout
sso_token_uri = http://test1.xxx.com/oauth/token
user_info_uri = http://test1.xxx.com/api/me
libWeWorkFinanceSdk_C.so 就不提供了,20多M呢。
【2020-05-29】补充
关于get_media_data返回值,我手头的文档写的很简略:0成功,非0失败。根据我记录的日志,大概有以下几种情形:
10001/10002:似乎是网络原因,稍后重试一般可以下载成功
10010:消息记录超过3天,无法拉取媒体数据内容
10003:只见过一次,chatrecord类型消息中的媒体文件,看消息时间是近一年以前,也和10010类似?
【2021-01-15】补充
企业微信会话内容存档官方文档
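最近又部署了一下这个程序,调用获取会话内容接口(GetChatData)时Python总提示“段错误”,Python版本是3.7.0,后来重新装了3.6.7就好了。不过这类段错误很可能与Python版本本身关系不大:ctypes默认把C函数的返回值当作32位int处理,而NewSdk、NewSlice、NewMediaData等函数返回的是指针,在64位系统上指针可能被截断。为这些函数声明 restype = c_void_p(相应参数的argtypes也用c_void_p)通常可以避免这个问题。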