需求:
比较新旧两个 JSON 文件的差异,共有三种情况:新增记录、删除记录、修改记录。
数据格式如下:
[
{
"ID": 1607,
"Badge": 7116,
"CompID": 6330,
"ReportTo": 254,
"Remark1": "D47",
"Remark2": "D2243",
"Remark3": 1702,
"Remark4": null,
"CompID1": null,
"ReportTo1": 1928,
"aRemark1": null,
"aRemark2": null,
"aRemark3": null,
"aRemark4": null,
"BeginDate": "2014-09-16T00:00:00",
"Reason": null,
"ClosedTime": "2014-09-16T00:00:00"
},
{
"ID": 5242,
"Badge": 6970,
"CompID": 5173,
"ReportTo": 254,
"Remark1": "D47",
"Remark2": "D2243",
"Remark3": 1702,
"Remark4": null,
"CompID1": null,
"ReportTo1": 1928,
"aRemark1": null,
"aRemark2": null,
"aRemark3": null,
"aRemark4": null,
"BeginDate": "2014-09-16T00:00:00",
"Reason": null,
"ClosedTime": "2014-09-16T00:00:00"
}]
import json
import time
# Decorator used to report how long a call takes for different data volumes.
def count_time(func):
    """Wrap ``func`` so that each successful call prints its elapsed time.

    Positional and keyword arguments are forwarded to ``func`` and its
    return value is handed back to the caller.

    Args:
        func: The callable to time.

    Returns:
        A wrapper with the same name and docstring as ``func``.
    """
    # Imported locally so the decorator does not rely on a file-level import.
    from functools import wraps

    @wraps(func)  # keep the wrapped function's __name__ / __doc__
    def int_time(*args, **kwargs):
        # perf_counter is monotonic, so the measurement cannot be skewed by
        # system clock adjustments.
        start_time = time.perf_counter()
        # The original called func() with no arguments and dropped the result.
        result = func(*args, **kwargs)
        total_time = time.perf_counter() - start_time
        print('程序共计%s秒' % total_time)
        return result

    return int_time
def diff_set(past_json, latest_json, past_json_num, latest_json_num):
    """Return the IDs that differ between two record lists of unequal size.

    Args:
        past_json: Records (dicts with an ``'ID'`` key) from the previous file.
        latest_json: Records (dicts with an ``'ID'`` key) from the latest file.
        past_json_num: Number of records in ``past_json``.
        latest_json_num: Number of records in ``latest_json``.

    Returns:
        A list of IDs: first the IDs found only in ``past_json`` (removed),
        then the IDs found only in ``latest_json`` (added), then whatever
        ``json_diff`` reports for the records whose ID exists on both sides.

    Raises:
        KeyError: If a record has no ``'ID'`` key.

    Neither input list is modified; the original removed items from the very
    list it was iterating, which silently skipped records.
    """
    # Sets give constant-time membership tests instead of scanning a list.
    past_ids = {record['ID'] for record in past_json}
    latest_ids = {record['ID'] for record in latest_json}

    # IDs present on one side only, in their original file order.
    removed_ids = [r['ID'] for r in past_json if r['ID'] not in latest_ids]
    added_ids = [r['ID'] for r in latest_json if r['ID'] not in past_ids]

    # Fresh lists holding only the records whose ID exists on both sides.
    common_past = [r for r in past_json if r['ID'] in latest_ids]
    common_latest = [r for r in latest_json if r['ID'] in past_ids]

    if latest_json_num < past_json_num:
        # Records disappeared; kept as a diagnostic until it is handled properly.
        print('当前减少,出现异常, 之后在做处理', common_past)

    # Among the shared IDs, find the records whose content changed.
    change_json = json_diff(common_past, common_latest)
    return removed_ids + added_ids + change_json
def json_diff(past_json, latest_json):
    """Return the IDs of records in ``latest_json`` with no equal record in ``past_json``.

    Both arguments are lists of dicts carrying an ``'ID'`` key.  Each past
    record can match at most one latest record, so a record duplicated in
    ``latest_json`` but present once in ``past_json`` is reported once.

    Args:
        past_json: Records from the previous file.
        latest_json: Records from the latest file.

    Returns:
        A list of ``'ID'`` values, in ``latest_json`` order, for records that
        are new or whose content differs.

    Raises:
        KeyError: If a record has no ``'ID'`` key.

    ``past_json`` is left untouched; the original removed matched records
    from the caller's list.
    """
    # Bucket the past records by ID.  Equal records always share an ID, so
    # searching one bucket gives the same answer as scanning the whole list
    # while avoiding the quadratic cost.
    unmatched = {}
    for record in past_json:
        unmatched.setdefault(record['ID'], []).append(record)

    new_json = []
    for record in latest_json:
        candidates = unmatched.get(record['ID'], [])
        if record in candidates:
            # Consume the match so it cannot be paired with another record.
            candidates.remove(record)
        else:
            # No identical record on the past side: new or modified.
            new_json.append(record['ID'])
    return new_json
@count_time
def query_json():
    """Load the two JSON files and print the IDs of the records that changed.

    Prints the number of changed IDs followed by the list of IDs.
    """
    # Paths of the two files to compare.
    past_path = r'C:\Users\xuan.li\Desktop\generated.json'
    latest_path = r'C:\Users\xuan.li\Desktop\generated -1.json'

    # json.load() parses a whole document straight from an open file, whereas
    # json.loads() parses a string already in memory.  Each file is expected
    # to contain a list of dicts.  The with-blocks close the handles, which
    # the original never did.
    with open(past_path, encoding='UTF-8') as past_json_obj:
        past_json = json.load(past_json_obj)
    with open(latest_path, encoding='UTF-8') as latest_json_obj:
        latest_json = json.load(latest_json_obj)

    # Record counts decide which comparison is used.
    past_json_num = len(past_json)
    latest_json_num = len(latest_json)

    if past_json_num == latest_json_num:
        # Same count: treated as "modifications only".
        change_uid = json_diff(past_json, latest_json)
    else:
        # Different counts: records were added or removed.
        change_uid = diff_set(past_json, latest_json, past_json_num, latest_json_num)

    # IDs that changed.
    print(len(change_uid), change_uid)
# Run the comparison only when this file is executed as a script.
if __name__ == "__main__":
    query_json()
结果
3 [1607, 5242, 985]
程序共计1.01554536819458秒