Python 多进程数据处理(一)——将 DICOM 数据转化为 JPG 文件并输出头文件信息

初始化设置

# common utils 
import os
import math
import itertools
import warnings
import numpy as np
import pandas as pd
from collections import Counter
import multiprocessing
import pydicom # 用于读取dcm文件
import glob 
import scipy.misc 
import functools 
# pytorch utils
import torch
import torchvision
from torch.utils.data import DataLoader,Dataset
import torch.nn as nn 
import cv2
from torchvision import models
import torch.nn.functional as F 
import pytorch_lightning as pl
from sklearn.preprocessing import StandardScaler,MinMaxScaler
from sklearn.model_selection import train_test_split

# others 
import matplotlib.pyplot as plt
from PIL import Image
%matplotlib inline
plt.rcParams['figure.figsize'] = (3.5, 2.5)
%config InlineBackend.figure_format = 'jpg'
# Suppress every warning for the rest of the session.
warnings.filterwarnings("ignore")
# NOTE(review): `end` is not referenced in the visible code — confirm it is
# still needed before removing it.
end = ''
# Root directory of the input dataset; the (commented-out) discovery step
# globs it with the pattern p*/p*/p*/s*/*.dcm.
DataPath = '/media/yuansh/14THHD/mimic_cxr/dataset/'
# Output directory; the cached path list `dcm_list.npy` is loaded from here.
OutPath = '/media/yuansh/14THHD/mimic_cxr/out_path/'

流程

获取所有样本图片路径并保存

# os.listdir(DataPath + '/p15/p15/p15610631/s59150383')
# 
# # 获取所有样本图片路径并保存
# dcm_list = glob.glob(DataPath + '/p*/p*/p*/s*/*.dcm')
# dcm_list = np.array(dcm_list)
# np.save(OutPath+'dcm_list.npy',dcm_list)
# Load the cached list of DICOM paths as plain Python strings.
dcm_list = np.load(OutPath + 'dcm_list.npy').tolist()
# Target JPG path for each DICOM file. Only the extension is swapped: a blanket
# str.replace('dcm', 'jpg') would also rewrite any "dcm" that happens to occur
# inside a directory or file name.
jpg_list = [os.path.splitext(path)[0] + '.jpg' for path in dcm_list]
# File name without directory and extension, used as the sample identifier.
file_id = [os.path.splitext(os.path.basename(path))[0] for path in dcm_list]

读取文件并可视化

from skimage.io import imread

# Preview the first eight images on a 2x4 grid, each titled with its index.
f, ax = plt.subplots(2, 4, figsize=(12, 4))
for i, panel in enumerate(ax.flat):  # row-major, i.e. ax[i // 4, i % 4]
    img = imread(dcm_list[i])
    panel.imshow(img, cmap='gray')
    panel.set_title(str(i))
    panel.axis('off')
    panel.set_aspect('auto')
plt.show()

![output_5_0.jpg](output_5_0.jpg)

- 将dicom文件转化为jpg图片,并提取对应的头文件信息

# 初始化信息字典

# Header attributes copied from each DICOM file, and the placeholder stored
# when a file does not carry the attribute.
_HEADER_FIELDS = ("PatientID", "StudyID", "StudyDate", "StudyTime")
_MISSING_VALUE = -999


def _to_uint8(pixels):
    """Linearly rescale *pixels* to the 0-255 range and return it as uint8."""
    pixels = np.asarray(pixels)
    if pixels.dtype == np.uint8:
        return pixels
    pixels = pixels.astype(np.float64)
    low, high = pixels.min(), pixels.max()
    if high == low:
        # Flat image: nothing to stretch, and this avoids dividing by zero.
        return np.zeros(pixels.shape, dtype=np.uint8)
    return ((pixels - low) * (255.0 / (high - low)) + 0.5).astype(np.uint8)


def getDicomInfoAndSaveJpg(filename, file_id, out_path):
    """Convert one DICOM file to an image file and collect its header fields.

    The function is deliberately not memoised: its main job is the side effect
    of writing ``out_path``, every call receives a different file, and a cache
    hit would silently skip the write.

    Args:
        filename: Path of the DICOM file to read.
        file_id: Identifier returned unchanged so the caller can label the row.
        out_path: Destination path of the converted image; the image format is
            chosen by Pillow from the file extension.

    Returns:
        A ``(file_id, info)`` tuple where ``info`` maps ``PatientID``,
        ``StudyID``, ``StudyDate`` and ``StudyTime`` to the value stored in the
        file, or to ``-999`` when the attribute is absent.
    """
    from PIL import Image

    # `dcmread` is the current name of the deprecated `read_file` alias.
    dcm = pydicom.dcmread(filename)

    # Header information.
    info = {
        field: getattr(dcm, field, _MISSING_VALUE) for field in _HEADER_FIELDS
    }

    # Pixel data. `scipy.misc.imsave` no longer exists in current SciPy, so the
    # array is rescaled to 8 bits explicitly and written with Pillow.
    Image.fromarray(_to_uint8(dcm.pixel_array)).save(out_path)
    return file_id, info
# One (dicom_path, file_id, jpg_path) tuple per image, in the positional order
# expected by getDicomInfoAndSaveJpg(filename, file_id, out_path).
args_list = list(zip(dcm_list, file_id, jpg_list))

# Process the files in parallel worker processes. starmap blocks until every
# file is done and returns the results in input order.
p = multiprocessing.Pool(28)
try:
    infos = p.starmap(getDicomInfoAndSaveJpg, args_list)
finally:
    # Always release the workers, even if one of the conversions failed.
    p.close()
    p.join()

- 将所有的jpg文件移到另外一个硬盘中(浪费了点时间)

# if not os.path.exists('/media/yuansh/1THHD/Yuansh_Share/mimic_cxr/jpg'):
#     os.makedirs('/media/yuansh/1THHD/Yuansh_Share/mimic_cxr/jpg')
# import shutil
# args_list = [
#     ('/media/yuansh/1THHD/Yuansh_Share/mimic_cxr/jpg/' + file_id[i] + '.jpg',
#      jpg_list[i]) for i in range(len(jpg_list))
# ]
# 
# def CopyJPG(to_file, jpg_list):
#     shutil.copy(jpg_list, to_file)
#     
#     
# p = multiprocessing.Pool(28)
# p.starmap(CopyJPG, args_list)
# p.close() # 记得关闭
# p.join()

- 将所有的头文件合并然后输出

# `infos` holds one (file_id, header_dict) pair per converted file.
Name_info = [name for name, _ in infos]
targ_info = [header for _, header in infos]

# Pull each header field into its own column.
PatientID = [header['PatientID'] for header in targ_info]
StudyID = [header['StudyID'] for header in targ_info]
StudyDate = [header['StudyDate'] for header in targ_info]
StudyTime = [header['StudyTime'] for header in targ_info]

# One row per file, indexed by file id, written out as CSV.
df = pd.DataFrame(
    {
        'PatientID': PatientID,
        'StudyID': StudyID,
        'StudyDate': StudyDate,
        'StudyTime': StudyTime,
    },
    index=Name_info,
)
df.to_csv('/media/yuansh/1THHD/Yuansh_Share/mimic_cxr/infos/Sample_infos.csv')

你可能感兴趣的:(生物医学深度学习实战)