python 多进程数据处理——处理图片获取时间,提取正面xray图片

初始化设置

# common utils 
import os
import math
import itertools
import warnings
import numpy as np
import pandas as pd
from collections import Counter
import multiprocessing
import pydicom # 用于读取dcm文件
import glob 
import scipy.misc 
import functools 
# pytorch utils
import torch
import torchvision
from torch.utils.data import DataLoader,Dataset
import torch.nn as nn 
import cv2
from torchvision import models
import torch.nn.functional as F 
import pytorch_lightning as pl
from sklearn.preprocessing import StandardScaler,MinMaxScaler
from sklearn.model_selection import train_test_split
from skimage.io import imread

# others 
import matplotlib.pyplot as plt
from PIL import Image
%matplotlib inline
plt.rcParams['figure.figsize'] = (3.5, 2.5)
%config InlineBackend.figure_format = 'svg'
# Silence every Python warning for the rest of the session.
warnings.filterwarnings("ignore")
end = ''  # empty-string constant; not referenced in the visible part of this file
# Dataset root (ends with a slash); not referenced in the visible part of this file.
DataPath = '/media/yuansh/14THHD/mimic_cxr/dataset/'
# Output directory; file names are appended directly (OutPath + name), so the
# trailing slash is required.
OutPath = '/media/yuansh/14THHD/mimic_cxr/out_path/'
# Metadata directory; no trailing slash because paths are built as
# info_dir + '/meta-info/...'.
info_dir = '/media/yuansh/14THHD/1-MIMIC/dataset/mimic-iv-0.4'

# Load the input files (each file is read exactly once).
dcm_list = list(np.load(OutPath+'dcm_list.npy'))  # paths of all DICOM files
# NOTE: str.replace swaps every 'dcm' occurrence in the path, not only the extension.
jpg_list = [i.replace('dcm','jpg') for i in dcm_list]  # paths of the matching JPG files
file_id = [os.path.split(i)[-1][:-4] for i in dcm_list]  # base name minus its last 4 characters
img_info = pd.read_csv(OutPath + 'Sample_infos.csv',index_col=0)  # per-image header table
sepsis_info = pd.read_csv(info_dir + '/meta-info/mimic_iv_sepsis3.csv')  # sepsis patient table
shock_info = pd.read_csv(info_dir + '/meta-info/mimic_iv_septic_shock.csv')  # septic-shock table
# Header rows whose PatientID appears among the sepsis subject_ids.
# .copy() makes this an independent frame, so the column assignment below does
# not write into a slice of img_info (pandas SettingWithCopyWarning).
use_imgs = img_info[img_info.PatientID.isin(sepsis_info.subject_id)].copy()
sepsis_imgs = list(np.load(OutPath+'imgs_sepsis.npy'))  # paths of the sepsis patients' images

# Store StudyTime as strings so it can be parsed digit by digit later.
use_imgs['StudyTime'] = use_imgs['StudyTime'].astype(str)

dcm_list : Dicom文件的具体路径

jpg_list : jpg文件的具体路径

file_id :dicom或者jpg文件的前缀

img_info : dicom文件的头文件信息

sepsis_info :脓毒症患者信息

shock_info :休克患者信息的studyid以及休克时间

use_imgs : 脓毒症患者图片

sepsis_imgs:脓毒症患者图片路径

# Cohort summary: each count is computed, then printed with its original label.
n_sepsis_patients = len(set(sepsis_info.subject_id))  # distinct sepsis subject_ids
print("脓毒症患者数量: ", n_sepsis_patients)
n_sepsis_images = len(sepsis_imgs)  # number of sepsis image paths
print("脓毒症患者图片数量: ", n_sepsis_images)
n_patients_with_images = len(set(use_imgs.PatientID))  # distinct PatientIDs in use_imgs
print("有图片的脓毒症患者数量: ", n_patients_with_images)
脓毒症患者数量:  10375
脓毒症患者图片数量:  77677
有图片的脓毒症患者数量:  4929
  • 定义函数
# Data augmentation / preprocessing pipelines
def DataTransforms(phase=None):
    """Build the torchvision transform pipeline for one dataset split.

    Args:
        phase: 'train' for the pipeline with random augmentation, or
            'test' / 'val' for the deterministic pipeline.

    Returns:
        A ``torchvision.transforms.Compose``. Every pipeline resizes to
        224x224, converts to a tensor and normalises; the 'train' pipeline
        additionally applies a random rotation and a colour jitter between
        the resize and the tensor conversion.

    Raises:
        ValueError: if ``phase`` is not 'train', 'test' or 'val'. Previously
            this case (including the default ``None``) failed with an
            UnboundLocalError at the ``return`` statement.
    """
    # Validate first so an unsupported phase fails with a clear message.
    if phase not in ('train', 'test', 'val'):
        raise ValueError(
            "phase must be 'train', 'test' or 'val', got {!r}".format(phase))

    transforms = torchvision.transforms
    steps = [transforms.Resize(size=(224, 224))]
    if phase == 'train':
        # Augmentation is applied to training data only.
        steps.append(transforms.RandomRotation(degrees=(-20, +20)))
        # (1.2, 1.5) is passed as ColorJitter's first positional argument.
        steps.append(transforms.ColorJitter((1.2, 1.5)))
    steps.append(transforms.ToTensor())
    # Three-channel mean / std.
    # NOTE(review): these look like the usual ImageNet statistics - confirm
    # the inputs really are 3-channel images.
    steps.append(transforms.Normalize([0.485, 0.456, 0.406],
                                      [0.229, 0.224, 0.225]))
    return transforms.Compose(steps)
  • 数据可视化
# Preview the first eight JPG images on a 2x4 grid; each panel is titled
# with the image's index in jpg_list.
f, ax = plt.subplots(2,4, figsize=(12,4))
for i, panel in enumerate(ax.flat):  # row-major order: same as ax[i//4, i%4]
    img = imread(jpg_list[i])
    panel.imshow(img, cmap='gray')
    panel.set_title(str(i))
    panel.axis('off')
    panel.set_aspect('auto')
plt.show()

[外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传(img-V6syrF6I-1619081821173)(output_9_0.svg)]

# 所有的脓毒症患者
# sample_info.head()
# 脓毒症休克
# sample_info[sample_info.stay_id.isin(shock_info.stay_id)].head()
# 脓毒症有图片的
# sample_info[sample_info.subject_id.isin(img_info.PatientID)].head()
  • 提取数据

脓毒症且有图片的数据

# imgs_index = list(use_imgs.index)
# 
# def get_imgs(i):
#     if pd.Series(os.path.split(i)[-1][:-4]).isin(imgs_index).all():
#         return i
# 
# # 多进程处理
# p = multiprocessing.Pool(24)
# imgs_list = p.map(get_imgs, dcm_list)
# p.close() # 记得关闭
# p.join()
# imgs_list = list(filter(None, imgs_list))
# np.save( OutPath+'imgs_sepsis.npy',imgs_list)
  • StudyTime

193516.234 代表 19:35:16
44401.953 代表 4:44:01

# Normalise the header StudyTime values.
# Step 1: keep only the part before the decimal point, e.g. "193516.234"
# becomes "193516", and write that back to the table.
check_times = np.array([str(i).split('.')[0] for i in use_imgs.StudyTime])
use_imgs['StudyTime'] = check_times

# Step 2: left-pad with zeros to six characters (HHMMSS), so "44401" becomes
# "044401". str.zfill replaces the previous hand-written chain of length
# checks and pads every short value, including an empty one, to six digits.
check_times = [i.zfill(6) for i in use_imgs.StudyTime]

# Sanity checks: every value should now be 6 characters long, and the parsed
# fields should fall inside valid clock ranges.
ids = [len(i) == 6 for i in check_times]
secend = [int(i[-2:]) for i in check_times]   # seconds: last two digits
minut = [int(i[-4:-2]) for i in check_times]  # minutes: the two digits before the seconds
hour = [int(i[:-4]) for i in check_times]     # hours: everything before the minutes

max(secend)
max(minut)
max(hour)
59






59






23
  • 先读取几张图片看一下头文件的信息(猜测)

如果患者为正面体位则 PatientOrientation == “L”,“R”

如果患者为侧面体位则 PatientOrientation == “A”,“P”

# python 多进程处理for循环
1. 定义迭代器 for i in ...
2. 将要做的事情封装成函数,最后output成一个值,如果需要多值的output可以设置成字典类型或者tuple类型
3. 使用多进程
初始化迭代器
items = list(range(len(sepsis_imgs)))  # one integer index per entry of sepsis_imgs

定义函数
def check_PatientOrientation(i):
    """Return the PatientOrientation header value of the i-th sepsis image.

    Args:
        i: index into the module-level ``sepsis_imgs`` list of DICOM paths.

    Returns:
        The dataset's ``PatientOrientation`` value, or the placeholder
        ``["yuansh", "yuansh"]`` when the file has no such attribute.
    """
    # Only a header attribute is needed, so stop reading before the pixel
    # data instead of loading the whole image.
    ds = pydicom.dcmread(sepsis_imgs[i], stop_before_pixels=True)
    return getattr(ds, 'PatientOrientation', ["yuansh", "yuansh"])

# Run the header check over every image in parallel worker processes.
# The with-block releases the pool even if map() raises; map() returns only
# after all results have been collected.
with multiprocessing.Pool(26) as p:
    check_list_only = p.map(check_PatientOrientation, items)
# The results can be ragged (two-element sequences mixed with empty strings),
# so build an object array explicitly; recent NumPy refuses to infer one.
np.save( OutPath+'PatientOrientation.npy', np.array(check_list_only, dtype=object))
# Reload the saved PatientOrientation values (an object array needs allow_pickle).
PatientOrientation = list(np.load(OutPath+'PatientOrientation.npy',allow_pickle=True)) #PatientOrientation
# Split each entry into its first (ids1) and second (ids2) component.
ids1 = []
ids2 = []
for i in PatientOrientation:
    # An empty string gets the same placeholder as a missing attribute.
    # The isinstance guard keeps the comparison unambiguous when an entry
    # is an array row rather than a string.
    if isinstance(i, str) and i == '':
        ids1.append('yuansh')
        ids2.append('yuansh')
    else :
        ids1.append(i[0])
        ids2.append(i[1])


Counter(ids2)
Counter(ids1)
Counter({'F': 75512, 'yuansh': 2158, 'R': 1, 'A': 1, 'L': 4, 'FP': 1})






Counter({'A': 9092,
         'L': 56863,
         'R': 4748,
         'P': 4805,
         'yuansh': 2158,
         'F': 3,
         'H': 3,
         'LP': 4,
         'LA': 1})
# Spot-check: display eight randomly chosen images whose first
# PatientOrientation value is 'A', to see which view these images show.

import random


ids = np.where([i == 'A' for i in ids1])[0]
# randrange excludes its upper bound. randint(0, len(ids)) could return
# len(ids) itself and index one past the end of ids.
ids = [ids[random.randrange(len(ids))] for i in range(8)]  # 8 random picks from the 'A' group (repeats possible)

f, ax = plt.subplots(2,4, figsize=(12,4))
for i in range(8):
    img = imread(sepsis_imgs[ids[i]])
    ax[i//4, i%4].imshow(img, cmap='gray')
    ax[i//4, i%4].set_title(str(i))
    ax[i//4, i%4].axis('off')
    ax[i//4, i%4].set_aspect('auto')
plt.show()

[外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传(img-tqZvQr81-1619081821174)(output_19_0.svg)]

# Spot-check: display eight randomly chosen images whose first
# PatientOrientation value is 'P', to see which view these images show.

import random


ids = np.where([i == 'P' for i in ids1])[0]
# randrange excludes its upper bound. randint(0, len(ids)) could return
# len(ids) itself and index one past the end of ids.
ids = [ids[random.randrange(len(ids))] for i in range(8)]  # 8 random picks from the 'P' group (repeats possible)

f, ax = plt.subplots(2,4, figsize=(12,4))
for i in range(8):
    img = imread(sepsis_imgs[ids[i]])
    ax[i//4, i%4].imshow(img, cmap='gray')
    ax[i//4, i%4].set_title(str(i))
    ax[i//4, i%4].axis('off')
    ax[i//4, i%4].set_aspect('auto')
plt.show()

[外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传(img-gTnfuaew-1619081821175)(output_20_0.svg)]

# Spot-check: display eight randomly chosen images whose first
# PatientOrientation value is 'R', to check that they are frontal views.

import random


ids = np.where([i == 'R' for i in ids1])[0]
# randrange excludes its upper bound. randint(0, len(ids)) could return
# len(ids) itself and index one past the end of ids.
ids = [ids[random.randrange(len(ids))] for i in range(8)]  # 8 random picks from the 'R' group (repeats possible)

f, ax = plt.subplots(2,4, figsize=(12,4))
for i in range(8):
    img = imread(sepsis_imgs[ids[i]])
    ax[i//4, i%4].imshow(img, cmap='gray')
    ax[i//4, i%4].set_title(str(i))
    ax[i//4, i%4].axis('off')
    ax[i//4, i%4].set_aspect('auto')
plt.show()

[外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传(img-gTMe2RrB-1619081821176)(output_21_0.svg)]

# Spot-check: display eight randomly chosen images whose first
# PatientOrientation value is 'L', to check that they are frontal views.

import random


ids = np.where([i == 'L' for i in ids1])[0]
# randrange excludes its upper bound. randint(0, len(ids)) could return
# len(ids) itself and index one past the end of ids.
ids = [ids[random.randrange(len(ids))] for i in range(8)]  # 8 random picks from the 'L' group (repeats possible)

f, ax = plt.subplots(2,4, figsize=(12,4))
for i in range(8):
    img = imread(sepsis_imgs[ids[i]])
    ax[i//4, i%4].imshow(img, cmap='gray')
    ax[i//4, i%4].set_title(str(i))
    ax[i//4, i%4].axis('off')
    ax[i//4, i%4].set_aspect('auto')
plt.show()

[外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传(img-q8vKDAUS-1619081821177)(output_22_0.svg)]

你可能感兴趣的:(生物医学深度学习实战)