Python 找出两个 JSON 文件的差异

需求:
比较新旧两个 JSON 文件的差异,有三种情况:

  • 新的可能会新增
  • 新的可能会减少
  • 既未新增也未减少,只是修改了其中某一条数据,导致内容发生变化

基于上面三种情况,找出两个文件之间的差异。

数据格式如下:

[
  {
    "ID": 1607,
    "Badge": 7116,
    "CompID": 6330,
    "ReportTo": 254,
    "Remark1": "D47",
    "Remark2": "D2243",
    "Remark3": 1702,
    "Remark4": null,
    "CompID1": null,
    "ReportTo1": 1928,
    "aRemark1": null,
    "aRemark2": null,
    "aRemark3": null,
    "aRemark4": null,
    "BeginDate": "2014-09-16T00:00:00",
    "Reason": null,
    "ClosedTime": "2014-09-16T00:00:00"
  },
  {
    "ID": 5242,
    "Badge": 6970,
    "CompID": 5173,
    "ReportTo": 254,
    "Remark1": "D47",
    "Remark2": "D2243",
    "Remark3": 1702,
    "Remark4": null,
    "CompID1": null,
    "ReportTo1": 1928,
    "aRemark1": null,
    "aRemark2": null,
    "aRemark3": null,
    "aRemark4": null,
    "BeginDate": "2014-09-16T00:00:00",
    "Reason": null,
    "ClosedTime": "2014-09-16T00:00:00"
  }]
import json
import time
# Decorator used to see how long a function takes on different data sizes.
def count_time(func):
    """Wrap *func* so that each call prints its wall-clock duration.

    The wrapper forwards all positional and keyword arguments to *func*
    and returns whatever *func* returns. The elapsed time is printed only
    when *func* completes normally; an exception propagates untouched.

    Args:
        func: The callable to time.

    Returns:
        A wrapper with the same name and docstring as *func*.
    """
    # Imported locally so the block does not depend on a module-level
    # ``functools`` import.
    from functools import wraps

    @wraps(func)
    def int_time(*args, **kwargs):
        # perf_counter is monotonic, so the measurement cannot go negative
        # if the system clock is adjusted while func runs.
        start_time = time.perf_counter()
        result = func(*args, **kwargs)
        total_time = time.perf_counter() - start_time
        print('程序共计%s秒' % total_time)
        return result
    return int_time


def diff_set(past_json, latest_json, past_json_num=None, latest_json_num=None):
    """Return the IDs of records that were removed, added or modified.

    Records are matched by their ``'ID'`` key. An ID present only in
    *past_json* counts as removed, an ID present only in *latest_json*
    counts as added, and an ID present in both whose records compare
    unequal counts as modified. Neither input list is mutated.

    If several records in one list share the same ID, only the last one
    is considered.

    Args:
        past_json: List of record dicts from the previous snapshot.
        latest_json: List of record dicts from the newest snapshot.
        past_json_num: Unused; accepted for backward compatibility.
        latest_json_num: Unused; accepted for backward compatibility.

    Returns:
        A list of IDs: removed IDs (in *past_json* order), then added IDs,
        then modified IDs (both in *latest_json* order).

    Raises:
        KeyError: If a record has no ``'ID'`` key.
    """
    # Index both snapshots by ID so every lookup below is O(1) instead of
    # a linear scan through a list.
    past_by_id = {record['ID']: record for record in past_json}
    latest_by_id = {record['ID']: record for record in latest_json}

    removed_ids = [rid for rid in past_by_id if rid not in latest_by_id]
    added_ids = [rid for rid in latest_by_id if rid not in past_by_id]
    modified_ids = [
        rid
        for rid, record in latest_by_id.items()
        if rid in past_by_id and past_by_id[rid] != record
    ]

    if removed_ids:
        # Removals are still reported on stdout with the existing message
        # (a data-loss warning), but now list just the affected IDs.
        print('当前减少,出现异常, 之后在做处理', removed_ids)

    return removed_ids + added_ids + modified_ids


def json_diff(past_json, latest_json):
    """Return the IDs of records in *latest_json* with no equal record in *past_json*.

    Both arguments are lists of dicts (``[{...}, {...}, ...]``). Each
    record in *past_json* can match at most one record in *latest_json*,
    so a duplicated latest record is reported if the past snapshot holds
    fewer copies of it. The caller's lists are left untouched.

    Args:
        past_json: List of record dicts from the previous snapshot.
        latest_json: List of record dicts from the newest snapshot.

    Returns:
        A list of the ``'ID'`` values of the unmatched latest records, in
        *latest_json* order.

    Raises:
        KeyError: If an unmatched record has no ``'ID'`` key.
    """
    # Work on a shallow copy so the caller's list is not consumed.
    unmatched_past = list(past_json)
    changed_ids = []
    for record in latest_json:
        try:
            # Dropping each matched record keeps later scans short, and
            # remove() finds and deletes in one pass over the list.
            unmatched_past.remove(record)
        except ValueError:
            # No equal record in the past snapshot: new or modified.
            changed_ids.append(record['ID'])
    return changed_ids

@count_time
def query_json():
    """Load the two hard-coded JSON files and print the IDs that changed.

    Each file is parsed with ``json.load`` and passed on as-is; the helper
    functions expect a list of record dicts. The printed line is the
    number of changed IDs followed by the list of IDs.

    Raises:
        OSError: If either file cannot be opened.
        json.JSONDecodeError: If either file is not valid JSON.
    """
    # Paths of the two files to compare.
    past_path = r'C:\Users\xuan.li\Desktop\generated.json'
    latest_path = r'C:\Users\xuan.li\Desktop\generated -1.json'

    # json.load parses a whole file object in one call. The with-blocks
    # make sure each handle is closed even if parsing fails.
    with open(past_path, encoding='UTF-8') as past_json_obj:
        past_json = json.load(past_json_obj)
    with open(latest_path, encoding='UTF-8') as latest_json_obj:
        latest_json = json.load(latest_json_obj)

    # Number of records in each file.
    past_json_num = len(past_json)
    latest_json_num = len(latest_json)

    if past_json_num == latest_json_num:
        # Equal counts: treat the change as in-place modifications only.
        change_uid = json_diff(past_json, latest_json)
    else:
        # Different counts: records were added or removed.
        change_uid = diff_set(past_json, latest_json, past_json_num, latest_json_num)

    # IDs of the records that changed.
    print(len(change_uid), change_uid)


# Run the comparison only when this file is executed as a script, not when
# it is imported as a module.
if __name__ == '__main__':
    query_json()


结果  
3 [1607, 5242, 985]
程序共计1.01554536819458秒

你可能感兴趣的:(python,json,python,开发语言)