金融版企业微信聊天记录备份

【关键字】企业微信会话内容存档 聊天记录备份 聊天记录监控

出于监管需求,金融版企业微信聊天记录可以进行备份,腾讯提供了SDK:libWeWorkFinanceSdk_C.so。据说我们是第一个用上金融版企业微信的公司。

chatbackuper.py

#-*- coding: utf8 -*-

import os
import ctypes
import base64
import json
import time
import logging
import psycopg2

from ctypes import *
from Crypto.PublicKey import RSA
from Crypto.Cipher import PKCS1_v1_5
from configparser import ConfigParser


class FinanceSdk:
    """Thin ctypes wrapper around Tencent's ``libWeWorkFinanceSdk_C.so``.

    The SDK hands out opaque handles (the SDK object, "slices" holding text
    results and "media data" buffers). They are declared here as
    ``c_void_p`` because they are passed around like addresses (the result
    of ``GetData`` is fed to ``string_at``); declaring them as ``c_int``
    would cut a 64-bit address down to 32 bits.

    NOTE(review): the C prototypes below are derived from how this file
    calls the library, not from the SDK header -- confirm against the
    header shipped with the SDK.
    """

    def __init__(self, path):
        """Load the shared library found in directory *path*.

        All function prototypes are declared once here, so a missing symbol
        is reported at load time rather than in the middle of a backup run.
        """
        self.so = ctypes.CDLL(os.path.join(path, 'libWeWorkFinanceSdk_C.so'))
        self.sdk = None
        self._declare_prototypes()

    def _declare_prototypes(self):
        """Attach restype/argtypes to every SDK function used by this class."""
        handle = ctypes.c_void_p
        text = ctypes.c_char_p
        c_int = ctypes.c_int
        prototypes = {
            'NewSdk': (handle, []),
            'Init': (c_int, [handle, text, text]),
            'DestroySdk': (None, [handle]),
            'GetChatData': (c_int, [handle, ctypes.c_long, c_int, text, text,
                                    c_int, handle]),
            'DecryptData': (c_int, [text, text, handle]),
            'GetMediaData': (c_int, [handle, text, text, text, text, c_int,
                                     handle]),
            'NewSlice': (handle, []),
            'GetContentFromSlice': (text, [handle]),
            'FreeSlice': (None, [handle]),
            'NewMediaData': (handle, []),
            'GetData': (handle, [handle]),
            'GetDataLen': (c_int, [handle]),
            'GetOutIndexBuf': (text, [handle]),
            'IsMediaDataFinish': (c_int, [handle]),
            'FreeMediaData': (None, [handle]),
        }
        for name, (restype, argtypes) in prototypes.items():
            func = getattr(self.so, name)
            func.restype = restype
            func.argtypes = argtypes

    @staticmethod
    def _to_bytes(value):
        """Encode an optional string as UTF-8 bytes; ``None`` becomes ``b''``."""
        return ('' if value is None else value).encode('utf8')

    def init_sdk(self, cropid, secret):
        """Create the SDK object and initialise it with the credentials.

        Returns the result code of the SDK ``Init`` call (the caller treats
        0 as success), or -1 when the SDK object could not be created.
        """
        sdk = self.so.NewSdk()
        # Remember the handle before Init so destory_sdk() can release it
        # even when Init fails.
        self.sdk = sdk
        if sdk is None:
            return -1
        return self.so.Init(sdk, cropid.encode('utf8'), secret.encode('utf8'))

    def destory_sdk(self):
        """Release the SDK object created by init_sdk(); safe to call twice."""
        if self.sdk is not None:
            self.so.DestroySdk(self.sdk)
            self.sdk = None

    def get_chat_data(self, seq, limit, proxy, passwd, timeout):
        """Fetch a batch of encrypted chat records.

        Returns the raw response as bytes (the caller parses it as JSON), or
        None when the SDK call reports a non-zero result.
        """
        res = None
        chat_slice = self.new_slice()
        try:
            code = self.so.GetChatData(self.sdk, seq, limit,
                                       self._to_bytes(proxy),
                                       self._to_bytes(passwd),
                                       timeout, chat_slice)
            if code == 0:
                res = self.get_content_from_slice(chat_slice)
        finally:
            # The content was copied into a Python bytes object above, so
            # the slice can always be released.
            self.free_slice(chat_slice)
        return res

    def decrypt_data(self, encrypt_key, private_key, encrypt_msg):
        """Decrypt one chat message.

        *encrypt_key* is the base64 RSA-encrypted random key of the record,
        *private_key* the PEM text of the RSA private key and *encrypt_msg*
        the encrypted message text. Returns the decrypted message as bytes,
        or None when either the RSA step or the SDK call fails.
        """
        key = self.rsa_decrypt(private_key, encrypt_key)
        if key is None:
            # rsa_decrypt returns its None sentinel on failure; do not hand
            # a NULL key to the native library.
            return None
        res = None
        msg_slice = self.new_slice()
        try:
            if self.so.DecryptData(key, encrypt_msg.encode('utf8'),
                                   msg_slice) == 0:
                res = self.get_content_from_slice(msg_slice)
        finally:
            self.free_slice(msg_slice)
        return res

    def get_media_data(self, index_buf, sdk_field, proxy, passwd, timeout,
                       media_data):
        """Download one chunk of a media file into *media_data*.

        *index_buf* is the continuation marker returned by the previous
        chunk (empty for the first one). Returns the SDK result code;
        0 means success.
        """
        return self.so.GetMediaData(self.sdk,
                                    self._to_bytes(index_buf),
                                    self._to_bytes(sdk_field),
                                    self._to_bytes(proxy),
                                    self._to_bytes(passwd),
                                    timeout, media_data)

    def new_slice(self):
        """Allocate a slice handle; release it with free_slice()."""
        return self.so.NewSlice()

    def get_content_from_slice(self, slice):
        """Return the content of *slice* as bytes (up to the first NUL)."""
        return self.so.GetContentFromSlice(slice)

    def free_slice(self, slice):
        """Release a slice obtained from new_slice()."""
        self.so.FreeSlice(slice)

    def new_media_data(self):
        """Allocate a media-data handle; release it with free_media_data()."""
        return self.so.NewMediaData()

    def get_data(self, media_data):
        """Return the bytes of the chunk currently held by *media_data*."""
        ptr = self.so.GetData(media_data)
        size = self.get_data_len(media_data)
        if not ptr or size <= 0:
            # Nothing to copy; reading from a NULL address would crash.
            return b''
        return ctypes.string_at(ptr, size)

    def get_data_len(self, media_data):
        """Return the length in bytes of the chunk held by *media_data*."""
        return self.so.GetDataLen(media_data)

    def get_out_index_buf(self, media_data):
        """Return the continuation marker (bytes) for the next chunk."""
        return self.so.GetOutIndexBuf(media_data)

    def is_media_data_finish(self, media_data):
        """Return the SDK's finished flag; the caller treats 1 as "done"."""
        return self.so.IsMediaDataFinish(media_data)

    def free_media_data(self, media_data):
        """Release a handle obtained from new_media_data()."""
        self.so.FreeMediaData(media_data)

    def rsa_decrypt(self, private_key, text):
        """RSA-decrypt (PKCS#1 v1.5) the base64 string *text*.

        Returns the plaintext bytes, or None (the sentinel passed to the
        cipher) when decryption fails.
        """
        pri_key = RSA.import_key(private_key)
        cipher = PKCS1_v1_5.new(pri_key)
        return cipher.decrypt(base64.b64decode(text), None)


class Backuper:

    def __init__(self):
        """Read the configuration, open the database connection and set up
        the media-type lookup tables used while saving messages."""
        self.config = self._read_config()
        settings = self.config['database']
        connect_kwargs = {
            name: settings[name]
            for name in ('host', 'port', 'database', 'user', 'password')
        }
        self.conn = psycopg2.connect(**connect_kwargs)
        self.logger = logging.getLogger(__name__)
        # Message types whose payload is downloaded as a media file.
        self.media_types = ['image', 'video', 'voice', 'emotion', 'file']
        # Fixed file extension for media types that do not carry their own.
        self.exts = {'image': 'jpg', 'video': 'mp4', 'voice': 'amr'}

    def backup(self):
        """Fetch one batch of chat records and store them.

        The highest ``sequance`` already in the database is passed to the
        SDK as the starting point. Every record that decrypts is appended
        to the daily text file, its media (if any) is downloaded, and a row
        is written to the ``messages`` table. The batch is committed at the
        end.

        The SDK object and the database connection are always released,
        including on early exit and on unexpected exceptions, so this
        instance cannot be reused after the call.
        """
        weixin = self.config['weixin']
        sdk = None
        sdk_created = False
        try:
            seq = self._get_sequance()
            sdk = FinanceSdk(self.config['backup']['root_path'])
            init_result = sdk.init_sdk(weixin['cropid'], weixin['secret'])
            # Only after init_sdk() has returned is there an SDK object that
            # destory_sdk() can release.
            sdk_created = True
            if init_result != 0:
                self.logger.error('SDK init error')
                return

            res = sdk.get_chat_data(seq, weixin['limit'],
                                    weixin['proxy_url'],
                                    weixin['proxy_password'],
                                    weixin['timeout'])
            if res is None:
                self.logger.error('SDK getCharData error')
                return

            for item in json.loads(res)['chatdata']:
                plain = sdk.decrypt_data(item['encrypt_random_key'],
                                         weixin['private_key'],
                                         item['encrypt_chat_msg'])
                if plain is None:
                    self.logger.error('SDK decryptData error: ' +
                                      str(item['seq']))
                    continue
                data = json.loads(plain)
                self.logger.info('SDK decryptData: ' +
                                 str(item['seq']) + '-' + data['msgtype'])
                # save_message() also stores the sequence number in
                # data['seq'], which save_record() reads.
                self.save_message(data, item['seq'])
                file_path = ''
                if data['msgtype'] in self.media_types:
                    file_path = self.save_media_data(sdk, data)
                self.save_record(data, file_path)

            try:
                self.conn.commit()
            except Exception as e:
                self.conn.rollback()
                self.logger.error('db error: ' + str(e))
        finally:
            try:
                if sdk_created:
                    sdk.destory_sdk()
            finally:
                self.conn.close()

    def save_message(self, data, seq):
        backup = self.config['backup']
        path = backup['path']
        month_path = self._format_timestamp(data['msgtime'], '%Y%m')
        path = path + month_path + '/'
        if not os.path.exists(path):
            os.makedirs(path)
        filename = self._format_timestamp(data['msgtime'], '%Y%m%d') + '.txt'
        file = open(path + filename, 'a')
        data['seq'] = seq
        file.write(json.dumps(data) + '\r\n')
        file.close()

    def save_media_data(self, sdk, data):
        """Download the media file attached to *data* in chunks.

        The file is written to ``<backup path>/<YYYYMM>/media/`` as
        ``<msgtype>_<msgid><.ext>``. The extension comes from the payload's
        ``fileext`` if present, otherwise from the per-type defaults, and
        for ``emotion`` from its ``type`` field (1 -> gif, otherwise png).

        Returns the full path of the saved file, or ``''`` when the download
        failed. In that case the partial file is removed, so no path to a
        missing file is handed to the caller.
        """
        weixin = self.config['weixin']
        msgtype = data['msgtype']
        raw_data = data[msgtype]

        ext = ''
        if 'fileext' in raw_data:
            ext = raw_data['fileext']
        elif msgtype in self.exts:
            ext = self.exts[msgtype]
        elif msgtype == 'emotion':
            ext = 'gif' if raw_data['type'] == 1 else 'png'
        if ext:
            ext = '.' + ext

        media_dir = (self.config['backup']['path'] +
                     self._format_timestamp(data['msgtime'], '%Y%m') +
                     '/media/')
        os.makedirs(media_dir, exist_ok=True)
        target = media_dir + msgtype + '_' + data['msgid'] + ext

        finished = False
        index_buf = ''
        try:
            # 'wb' truncates a file left over from an earlier attempt.
            with open(target, 'wb') as out:
                while True:
                    media_data = sdk.new_media_data()
                    try:
                        res = sdk.get_media_data(index_buf,
                                                 raw_data['sdkfileid'],
                                                 weixin['proxy_url'],
                                                 weixin['proxy_password'],
                                                 weixin['timeout'],
                                                 media_data)
                        if res != 0:
                            self.logger.error('SDK getMediaData error: ' +
                                              json.dumps(data))
                            break
                        out.write(sdk.get_data(media_data))
                        if sdk.is_media_data_finish(media_data) == 1:
                            finished = True
                            break
                        # Marker telling the SDK where the next chunk starts.
                        index_buf = str(sdk.get_out_index_buf(media_data),
                                        encoding='utf8')
                    finally:
                        # Release the native buffer on every path, including
                        # break and exceptions.
                        sdk.free_media_data(media_data)
        finally:
            if not finished and os.path.exists(target):
                os.remove(target)

        if not finished:
            return ''
        self.logger.info('media file saved: ' + target)
        return target

    def save_record(self, data, file_path):
        """Insert (or replace) the ``messages`` row for one message.

        *file_path* is the media file path, or ``''``. When it is longer
        than the configured backup path, that many leading characters are
        cut off so only the part below the backup directory is stored.

        Values are passed as query parameters rather than interpolated into
        the SQL text, so message content containing quotes is stored
        correctly and cannot alter the statement.

        Errors are logged and not raised, so one bad message does not stop
        the batch. The transaction is not rolled back here.
        """
        backup_path_len = len(self.config['backup']['path'])
        if len(file_path) > backup_path_len:
            file_path = file_path[backup_path_len:]

        cursor = self.conn.cursor()
        try:
            msgtype = data['msgtype']
            content = ''
            raw_json = ''
            if msgtype in data:
                raw_data = data[msgtype]
                raw_json = json.dumps(raw_data)
                # Only text messages have searchable content.
                if msgtype == 'text':
                    content = raw_data['content']

            params = (
                data['msgid'],
                str(data['seq']),
                data['action'],
                data['from'],
                ','.join(data['tolist']),
                data['roomid'] if 'roomid' in data else '',
                self._format_time(data['msgtime']),
                msgtype,
                content,
                raw_json,
                file_path,
                self._format_time(int(time.time() * 1000)),
            )
            # Delete first so that re-processing a message replaces its row.
            cursor.execute("delete from messages where id=%s",
                           (data['msgid'],))
            cursor.execute(
                "insert into messages (id, sequance, action, from_user, "
                "to_user_list, room_id, message_time, message_type, "
                "content, raw_data, file_path, created_at) values ("
                "%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)",
                params)
        except Exception:
            self.logger.exception('failed to save record')
        finally:
            cursor.close()

    def _read_config(self):
        path = os.getcwd()
        if not path.endswith('/'):
            path = path + '/'
        config_file = path + 'config.cfg'
        cp = ConfigParser()
        cp.read(config_file)
        config = {}
        for section in cp.sections():
            config[section] = {}
            for item in cp.items(section):
                config[section][item[0]] = item[1]
        private_key_file = path + config['weixin']['private_key']
        file = open(private_key_file)
        config['weixin']['private_key'] = file.read()
        file.close()
        backup_path = config['backup']['path']
        if not backup_path.startswith('/'):
            backup_path = path + backup_path
        if not backup_path.endswith('/'):
            backup_path = backup_path + '/'
        config['backup']['root_path'] = path
        config['backup']['path'] = backup_path
        config['database']['port'] = int(config['database']['port'])
        config['weixin']['limit'] = int(config['weixin']['limit'])
        config['weixin']['timeout'] = int(config['weixin']['timeout'])
        config['backup']['interval'] = int(config['backup']['interval'])
        return config

    def _get_sequance(self):
        seq = 0
        cursor = self.conn.cursor()
        try:
            sql = 'select max(sequance) from messages'
            cursor.execute(sql)
            rs = cursor.fetchall()
            if len(rs) > 0:
                seq = rs[0][0]
                seq = 0 if seq == None else seq
        except:
            pass
        finally:
            cursor.close()
        return seq

    def _format_timestamp(self, timestamp, format):
        time_array = time.localtime(timestamp / 1000)
        return time.strftime(format, time_array)

    def _format_time(self, timestamp):
        time_array = time.localtime(timestamp / 1000)
        milliseconds = str(timestamp % 1000)
        return time.strftime('%Y-%m-%d %H:%M:%S.', time_array) + milliseconds


if __name__ == '__main__':
    logging.basicConfig(
            filename='out.log', filemode='a', level=logging.INFO,
            format='%(asctime)s [%(levelname)s] %(message)s',
            datefmt='%Y-%m-%d %H:%M:%S')
    log = logging.getLogger(__name__)

    # Fail fast at start-up: a broken config or an unreachable database
    # should stop the process before it enters the loop.
    backuper = Backuper()
    interval = backuper.config['backup']['interval']
    while True:
        try:
            # backup() closes the database connection, so every run after
            # the first needs a fresh Backuper.
            if backuper is None:
                backuper = Backuper()
            backuper.backup()
        except Exception:
            # One failed run must not end the long-running process; log it
            # and try again after the usual interval.
            log.exception('backup run failed')
        backuper = None
        # Wait `interval` seconds, counted from the end of the run.
        time.sleep(interval)

我用的是postgresql,就建了一个表。

tables.sql

-- Schema for the single table used by chatbackuper.py: one row per message.
-- WARNING: the DROP below deletes an existing "messages" table and its data;
-- run this script only when setting up a new database.
DROP TABLE IF EXISTS "messages";
CREATE TABLE "messages" (
  -- Message id (msgid); the script deletes any row with the same id before inserting.
  "id" varchar(50) PRIMARY KEY NOT NULL,
  -- Sequence number of the record; the script resumes from max(sequance).
  "sequance" int8 NOT NULL,
  "action" varchar(20) NOT NULL,
  "from_user" varchar(50) NOT NULL,
  -- Comma-joined list of recipients.
  "to_user_list" text NOT NULL,
  -- Empty string when the message has no roomid.
  "room_id" varchar(50) NOT NULL,
  -- Message time, written by the script as a local-time string.
  "message_time" timestamp(6) NOT NULL,
  "message_type" varchar(50) NOT NULL,
  -- Text of the message; filled only for message_type 'text', otherwise ''.
  "content" text NOT NULL,
  -- JSON dump of the type-specific part of the message, or ''.
  "raw_data" text NOT NULL,
  -- Path of the downloaded media file below the backup directory, or ''.
  "file_path" varchar(200) NOT NULL,
  -- Time the row was written, as a local-time string.
  "created_at" timestamp(6) NOT NULL
);
CREATE INDEX "sequance_idx" ON "messages"("sequance");
CREATE INDEX "action_idx" ON "messages"("action");
CREATE INDEX "from_user_idx" ON "messages"("from_user");
CREATE INDEX "room_id_idx" ON "messages"("room_id");
CREATE INDEX "message_time_idx" ON "messages"("message_time");
CREATE INDEX "message_type_idx" ON "messages"("message_type");

下面是配置文件。

config.cfg

[weixin]
# Credentials passed to the SDK Init call. The option is spelled "cropid"
# because chatbackuper.py reads it under exactly that name.
cropid = wwd54b5e1234567890
secret = PO5xxxxxxx-xxxxxxxx_xxxxxxxxx-xxxxxxxxxxxxx
# Passed to GetChatData as the limit argument.
limit = 500
# Passed to GetChatData and GetMediaData as the timeout argument.
timeout = 60
# Optional proxy settings; an empty value is passed to the SDK as an empty string.
proxy_url =
proxy_password =
# RSA private key file, resolved against the working directory.
private_key = private_key.pem

[database]
# Connection settings handed to psycopg2.connect().
host = localhost
port = 5432
database = chatdata
user = chatdata
password = password

[backup]
# Seconds to wait between two backup runs.
interval = 60
# Backup directory; a path not starting with "/" is resolved against the working directory.
path = backup

[server]
# NOTE(review): chatbackuper.py as shown does not use this section;
# presumably it belongs to a companion web application -- confirm.
host = http://localhost
port = 9001
session_expires = 3600
session_path = sessions
download_path = downloads
sso_client_id = xxxxxxxxxx1234567890
sso_client_secret = xxxxxxxxxx1234567890
sso_authorization_uri = http://test1.xxx.com/oauth/authorize
sso_logout_uri = http://test1.xxx.com/logout
sso_token_uri = http://test1.xxx.com/oauth/token
user_info_uri = http://test1.xxx.com/api/me

libWeWorkFinanceSdk_C.so 就不提供了,20多M呢。


【2020-05-29】补充

关于get_media_data的返回值,我手头的文档写得很简略:0表示成功,非0表示失败。根据我记录的日志,大概有以下几种情形:
10001/10002:似乎是网络原因,稍后重试一般可以下载成功
10010:消息记录超过3天,无法拉取媒体数据内容
10003:只见过一次,chatrecord类型消息中的媒体文件,看消息时间是近一年以前,也和10010类似?


【2021-01-15】补充

企业微信会话内容存档官方文档

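最近又部署了一下这个程序,调用获取会话内容接口(GetChatData)时Python总提示“段错误”,Python版本是3.7.0,后来重新装了3.6.7就好了。(一个可能的原因:上面的代码把SDK返回的指针句柄按c_int处理,在64位系统上指针可能被截断成32位;把相关函数的restype/argtypes声明为c_void_p会更稳妥,换Python版本能否解决可能只是巧合。)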

你可能感兴趣的:(金融版企业微信聊天记录备份)