【记录】Python3|json文件处理相关的操作

文章目录

    • json分割
    • json.gz 转换成 jsonl.gz
    • json格式化显示
    • 遍历目录及子目录,对某种类型的文件内容查找是否有指定字符串

json分割

主要使用json.loads、json.dumps。

import json

def jsonSplit(path, path2, num):
    """Split a JSON Lines file into up to ``num`` JSON-array files.

    Every line of ``path`` is parsed as one JSON value. The parsed values
    are grouped into consecutive chunks, and each chunk is written as a
    single JSON array to ``path2 + <cumulative line count> + '.json'``.
    For example, 5 lines with ``num=2`` produce ``<path2>2.json`` and
    ``<path2>5.json``.

    Args:
        path: Path of the source file (one JSON value per line, UTF-8).
        path2: Prefix of the output file paths.
        num: Number of chunks to produce. Fewer files are written when
            the source has fewer lines than ``num``.

    Raises:
        json.JSONDecodeError: If a line is not valid JSON.
    """
    # First pass: count the lines so the chunk boundaries can be computed.
    # Plain 'r' is used because the 'U' mode flag was removed in
    # Python 3.11; text mode already applies universal newlines.
    with open(path, 'r', encoding='utf-8') as source:
        total = sum(1 for _ in source)

    # Cumulative line numbers at which a chunk ends (set for O(1) lookup).
    boundaries = {total * i // num for i in range(1, num + 1)}

    # Second pass: parse line by line and flush a chunk at each boundary.
    chunk = []
    with open(path, 'r', encoding='utf-8') as source:
        for current_lines, line in enumerate(source, start=1):
            chunk.append(json.loads(line))
            if current_lines in boundaries:
                file_name = path2 + str(current_lines) + '.json'
                with open(file_name, 'w', encoding='utf-8') as out:
                    out.write(json.dumps(chunk))
                chunk = []

json.gz 转换成 jsonl.gz

主要使用gzip、jsonlines。

import os
import gzip
import jsonlines
import json

def jsonTojsonlGZ(path, path2):
    """Convert a gzipped JSON file into a gzipped JSON Lines file.

    The JSON document in ``path`` is loaded in full and each element
    obtained by iterating over it is written as one line to a temporary
    ``path2 + '.jsonl'`` file. That file is then compressed to
    ``path2 + '.jsonl.gz'`` and the temporary file is removed.

    Args:
        path: Path of the source ``.json.gz`` file (read as UTF-8 text).
            Presumably the top-level JSON value is an array; iterating
            over a JSON object would write only its keys.
        path2: Output path without extension.
    """
    import shutil  # local import: only needed by this function

    # Load the whole JSON document; the input is closed before writing.
    with gzip.open(path, 'rt', encoding='utf-8') as pf:
        all_data = json.load(pf)

    filename = path2 + '.jsonl'
    try:
        # Mode 'w' (not 'a') so a stale temporary file left by an earlier
        # failed run cannot leak old records into the result.
        with jsonlines.open(filename, mode='w') as writer:
            for item in all_data:
                writer.write(item)
        # Both handles are context-managed: closing the gzip stream is
        # what flushes the final block and trailer of the archive.
        with open(filename, 'rb') as f_in, \
                gzip.open(filename + '.gz', 'wb') as f_gzip:
            shutil.copyfileobj(f_in, f_gzip)
    finally:
        # Remove the intermediate .jsonl file even if a step above failed.
        if os.path.exists(filename):
            os.remove(filename)

json格式化显示

import json

# Sample data to display.
dic = {'a': 1, 'b': 2, 'c': 3}
# indent=4 produces the multi-line, indented ("formatted") output; without
# it json.dumps emits a compact single line. ensure_ascii=False keeps
# non-ASCII text readable instead of escaping it as \uXXXX.
js = json.dumps(dic, indent=4, ensure_ascii=False)
print(js)

遍历目录及子目录,对某种类型的文件内容查找是否有指定字符串

# coding:utf-8
import re
import os
import gzip
import sys

# Command-line arguments (all three are required; a missing one raises
# IndexError):
#   1. file_behind: file extension to match, compared against the result of
#      os.path.splitext, so it includes the leading dot (e.g. ".gz")
#   2. dirname: directory passed to searchInDir
#   3. tofind: pattern passed to re.findall for every matching file
file_behind=sys.argv[1]
dirname=sys.argv[2]
tofind=sys.argv[3]

def searchInDir(dirname, suffix=None, pattern=None):
    """Print the regex matches found in every matching file under a directory.

    Walks ``dirname`` and all of its sub-directories. For each file whose
    extension equals ``suffix``, the content is read as UTF-8 text and one
    line is printed: the file path followed by the list returned by
    ``findall`` for ``pattern``. Files with the ``.gz`` suffix are
    decompressed with gzip before searching.

    Args:
        dirname: Root directory to search.
        suffix: File extension including the leading dot (e.g. ``".gz"``).
            Defaults to the module-level ``file_behind``.
        pattern: Regular expression to look for. Defaults to the
            module-level ``tofind``.
    """
    if suffix is None:
        suffix = file_behind
    if pattern is None:
        pattern = tofind

    regex = re.compile(pattern)
    # gzip.open and open take the same (path, mode, encoding) arguments in
    # text mode, so choose the opener once instead of duplicating the read.
    opener = gzip.open if suffix == ".gz" else open

    # os.walk already descends into every sub-directory. An explicit
    # recursive call per directory would use a bare name resolved against
    # the current working directory and could report files twice.
    for root, _dirs, files in os.walk(dirname):
        for filename in files:
            if os.path.splitext(filename)[1] != suffix:
                continue
            path = os.path.join(root, filename)
            with opener(path, "rt", encoding='utf-8') as f:
                content = f.read()
            print(path, regex.findall(content))

# Script entry point: search the directory given on the command line.
searchInDir(dirname)

效果:
【记录】Python3|json文件处理相关的操作_第1张图片

你可能感兴趣的:(代码,#,琐碎小记录,python,json)