python: txt与xls代码处理示例

txt与xls代码处理示例

1. 背景

最近因为服务器训练时间不稳定的问题,LZ就想找出具体时间消耗的原因,但是训练平台只能保存成txt,才能下载,于是LZ把所有的数据都保存成txt的形式,当然如果
小伙伴在保存文本时,使用’\t’来保存,这样下载对应的文本后,就可以直接手动复制到excel表格中,这也是一种方式。

2. 环境配置

conda install  xlrd
conda install  xlwt

2. txt2excel示例

主要显示了怎么使用xlwt建立新的workbook,增加sheet,传入数据,并且保存对应的excel

def txt2xls():
    epoch_lists = glob.glob(log_path_root + "*.txt")
    print("the length of epoch lists: ", len(epoch_lists))

    wb = xlwt.Workbook(encoding='ascii')
    total_sheet = wb.add_sheet("total_part")
    for epoch_log_path in epoch_lists:
        f = open(epoch_log_path, 'r')
        # 获取对应的测试名称,并且增加对应sheet
        test_name = epoch_log_path.split("/")[-1].split(".")[0]
        part_sheet = wb.add_sheet(test_name + "_epoch")
        part_sheet.write(0, 0, label='num_epoch')
        part_sheet.write(0, 1, label='data_time(min)')
        part_sheet.write(0, 2, label='batch_time(min)')
        part_sheet.write(0, 3, label='read_img(min)')
        part_sheet.write(0, 4, label='hsv_aug(min)')
        part_sheet.write(0, 5, label='resize_img(min)')
        part_sheet.write(0, 6, label='random_affine(min)')
        part_sheet.write(0, 7, label='contiguousarray(min)')
        part_sheet.write(0, 8, label='transforms(min)')
        part_sheet.write(0, 9, label='det_gt(min)')

        # 获得对应行的数据,并进行分析
        lines = f.readlines()
        row_count = 1
        for line in lines:
            split_list = line.strip().split("|")
            # print(len(split_list))
            # print(split_list[6:-1])
            col_count = 0
            part_sheet.write(row_count, col_count, label=row_count - 1)

            for split_chunk in split_list[6:-1]:
                tmp = split_chunk.split(" ")[2]
                col_count += 1
                part_sheet.write(row_count, col_count, label=tmp)
            row_count += 1
        f.close()

    iter_lists = glob.glob(log_path_root + "*.log")
    print("the length of epoch lists: ", len(iter_lists))
    for iter_log_path in iter_lists:
        f = open(iter_log_path, 'r')
        # 获取对应的测试名称,并且增加对应sheet
        test_name = iter_log_path.split("/")[-1].split(".")[0].split("_")

        print(test_name)
        part_sheet = wb.add_sheet(test_name[-2] + "_" + test_name[-1] + "_iter")
        part_sheet.write(0, 0, label='num_iter')
        part_sheet.write(0, 1, label='read_img(s)')
        part_sheet.write(0, 2, label='hsv_aug(s)')
        part_sheet.write(0, 3, label='resize_img(s)')
        part_sheet.write(0, 4, label='random_affine(s)')
        part_sheet.write(0, 5, label='contiguousarray(s)')
        part_sheet.write(0, 6, label='transforms(s)')
        part_sheet.write(0, 7, label='det_gt(s)')

        # 获得对应行的数据,并进行分析
        lines = f.readlines()
        print("the length of lines: ", len(lines))
        # 在这个里LZ做了一个随机采样,去掉前1000次打迭代,等训练稳定以后,在随机采样时间
        random_select_1000 = random.sample(range(1000, len(lines) - 2), 1002)
        # print(random_select_10000)
        row_count = 1
        for idx in random_select_1000:
            line = lines[idx]
            split_list = line.strip().split("|")
            if len(split_list) < 5:#防止出现一些不符合要求的输出,这个只是做个简单判断
                continue
            # print(len(split_list))
            # print(split_list[6:-1])
            col_count = 0
            label_tmp = "{}(idex:{})".format(row_count - 1, idx)
            part_sheet.write(row_count, col_count, label=label_tmp)

            for split_chunk in split_list[11:]:
                tmp = split_chunk.split(" ")[2][0:-1]
                # print(tmp)
                col_count += 1
                part_sheet.write(row_count, col_count, label=tmp)
            row_count += 1
        f.close()

    wb.save(log_path_root + 'timeconsuming.xls')
    print("Done!")

数据保存形式,epoch的文档格式

epoch: 1 |loss -1.482269 | hm_loss 0.239190 | wh_loss 0.390505 | off_loss 0.110499 | id_loss 0.029421 | data_time 261.601979 | time 384.200000 | read_img 1610.162382 | hsv_aug 1167.812609 | resize_img 29.079759 | random_affine 33.654764 | contiguousarray 9.320365 | transforms 16.844176 | det_gt 19.504466 | 

log的文档格式

test_CPU0_419| train: [1][0/10157]|Tot: 0:00:49 |ETA: 0:00:00 |loss 0.2126 |hm_loss 0.2379 |wh_loss 0.3552 |off_loss 0.1091 |id_loss 0.3125 |Data 34.718s(34.718s) |Net 49.731s | read_img 13.838s | hsv_aug 18.123s | resize_img 0.624s | random_affine 0.662s | contiguousarray 0.068s | transforms 0.152s | det_gt 0.144s

3. 解析excel

主要显示了怎么使用xlrd打开workbook, 读取对应sheet names,获得sheet里面的数据并进行处理

def parseXls():
    workbook = xlrd.open_workbook(log_path_root + 'timeconsuming.xls')
    wb_parse = xlwt.Workbook(encoding='ascii')
    # 配置epoch_parse表头
    epoch_parse = wb_parse.add_sheet("epoch_parse")
    epoch_parse.write(0, 0, label='test_name')
    epoch_parse.write(0, 1, label='data_time(min)')
    epoch_parse.write(0, 2, label='batch_time(min)')
    epoch_parse.write(0, 3, label='read_img(min)')
    epoch_parse.write(0, 4, label='hsv_aug(min)')
    epoch_parse.write(0, 5, label='resize_img(min)')
    epoch_parse.write(0, 6, label='random_affine(min)')
    epoch_parse.write(0, 7, label='contiguousarray(min)')
    epoch_parse.write(0, 8, label='transforms(min)')
    epoch_parse.write(0, 9, label='det_gt(min)')
    # 配置iter_parse表头
    iter_parse = wb_parse.add_sheet("iter_parse")
    iter_parse.write(0, 0, label='test_name')
    iter_parse.write(0, 1, label='read_img(s)')
    iter_parse.write(0, 2, label='hsv_aug(s)')
    iter_parse.write(0, 3, label='resize_img(s)')
    iter_parse.write(0, 4, label='random_affine(s)')
    iter_parse.write(0, 5, label='contiguousarray(s)')
    iter_parse.write(0, 6, label='transforms(s)')
    iter_parse.write(0, 7, label='det_gt(s)')
    print(workbook.sheet_names())
    row_epoch = 1
    row_iter = 1
    for sheet_name in workbook.sheet_names():
        # for sheet_name in ["CPU0_138_iter"]:
        print(sheet_name)

        sheet = workbook.sheet_by_name(sheet_name)  # 获取对应sheetname的数据
        print(sheet.name, sheet.nrows, sheet.ncols)
        if 'epoch' in sheet_name:
            epoch_parse.write(row_epoch, 0, label=sheet_name)
            for col_idx in range(1, sheet.ncols):
                col_values = sheet.col_values(col_idx)[1:]
                tmp = np.array([float(item) for item in col_values])
                print(tmp, np.mean(tmp), np.min(tmp), np.max(tmp))
                # label_new = "{:.02f}(min: {:.02f}, max:{:.02f}, std:{:.02f})".format(np.mean(tmp), np.min(tmp),
                #                                                                      np.max(tmp), np.std(tmp))
                # label_new = "{:.02f}".format(np.mean(tmp))
                epoch_parse.write(row_epoch, col_idx, np.mean(tmp))
            row_epoch += 1
        elif 'iter' in sheet_name:
            iter_parse.write(row_iter, 0, label=sheet_name)
            for col_idx in range(1, sheet.ncols):
                col_values = sheet.col_values(col_idx)[1:]
                tmp = np.array([float(item) for item in col_values])
                print(np.mean(tmp), np.min(tmp), np.max(tmp))
                # label_new = "{:.02f}(min: {:.02f}, max:{:.02f}, std:{:.02f})".format(np.mean(tmp), np.min(tmp),
                #                                                                      np.max(tmp), np.std(tmp))
                # label_new = "{:.02f}".format(np.mean(tmp))
                iter_parse.write(row_iter, col_idx, np.mean(tmp))
            row_iter += 1

    wb_parse.save(log_path_root + 'timeconsuming_parse.xls')
    print("Done!")
    print("XLS file parse successfully!")

PS:小学的课本上,有袁隆平爷爷弯腰研究杂交水稻的照片。2021年5月22日他走了。看到很多公众号,朋友圈里都在缅怀这一位老人,lz也是打心眼里佩服他,一心为了人类的事业,奉献出自己的一生,是行动的巨人,时代的楷模,这才是真正的偶像吧!

你可能感兴趣的:(数据处理,python,python,深度学习,人工智能,机器学习,数据挖掘)