R语言:利用rhdf5包分别创建单组学,多组学.h5文件

最近在跑的方法需要 .h5 文件作为输入,于是学习了一下如何用 rhdf5 包创建 .h5 文件。原有数据结构如下:
R语言:利用rhdf5包分别创建单组学,多组学.h5文件_第1张图片
R语言:利用rhdf5包分别创建单组学,多组学.h5文件_第2张图片

生成单组学.h5文件的代码如下:

library(rhdf5)
library(dplyr)
library(patchwork)
# Load the RNA and ATAC count matrices from their 10X-style directories.
# Read10X() is a Seurat function and this script only attaches rhdf5, dplyr
# and patchwork, so the calls are namespace-qualified to work without
# library(Seurat).
rna.data <- Seurat::Read10X(
  data.dir = "Processed_dataset/RNA",
  gene.column = 1
)
atac.data <- Seurat::Read10X(
  data.dir = "Processed_dataset/ATAC",
  gene.column = 1
)

# Print the dimensions of each matrix as a quick sanity check.
dim(rna.data)
dim(atac.data)
### Build the single-modality ATAC .h5 file
# Layout written below: a "matrix" group holding barcodes, data, indices,
# indptr and shape, plus a "matrix/features" sub-group holding the
# per-feature annotations.
atac_h5_path <- "SNARE_cellline_atac.h5"
h5createFile(atac_h5_path)

# Saving matrix information.
h5createGroup(atac_h5_path, "matrix")
h5write(colnames(atac.data), atac_h5_path, "matrix/barcodes")
h5write(atac.data@x, atac_h5_path, "matrix/data")
dim(atac.data)

# Per-feature annotations: one entry per row of the matrix.
h5createGroup(atac_h5_path, "matrix/features")
h5write("genome", atac_h5_path, "matrix/features/_all_tag_keys")
atac_feature_ids <- rownames(atac.data)
Peaks <- rep("Peaks", length(atac_feature_ids))
h5write(Peaks, atac_h5_path, "matrix/features/feature_type")
Genome <- rep("hg38", length(atac_feature_ids))
h5write(Genome, atac_h5_path, "matrix/features/genome")
# The same row names are stored as both feature id and feature name.
h5write(atac_feature_ids, atac_h5_path, "matrix/features/id")
h5write(atac_feature_ids, atac_h5_path, "matrix/features/name")

# Sparse-matrix slots of the column-compressed matrix.
h5write(atac.data@i, atac_h5_path, "matrix/indices") # already zero-indexed.
h5write(atac.data@p, atac_h5_path, "matrix/indptr")
h5write(dim(atac.data), atac_h5_path, "matrix/shape")
h5closeAll()

# Re-open the finished file and list its structure without loading the data.
atac_file <- H5Fopen(atac_h5_path)
atac_h5 <- h5dump(atac_file, load = FALSE)
# Close the handle once the listing is captured so it does not leak; an open
# handle may keep other programs from opening the file.
H5Fclose(atac_file)
##############################################################################
### Build the single-modality RNA .h5 file
# Layout written below: a "matrix" group holding barcodes, data, indices,
# indptr and shape, plus a "matrix/features" sub-group holding the
# per-feature annotations.
rna_h5_path <- "SNARE_cellline_rna.h5"
h5createFile(rna_h5_path)

# Saving matrix information.
h5createGroup(rna_h5_path, "matrix")
h5write(colnames(rna.data), rna_h5_path, "matrix/barcodes")
h5write(rna.data@x, rna_h5_path, "matrix/data")

# Per-feature annotations: one entry per row of the matrix.
h5createGroup(rna_h5_path, "matrix/features")
h5write("genome", rna_h5_path, "matrix/features/_all_tag_keys")
rna_feature_ids <- rownames(rna.data)
Genes <- rep("Gene Expression", length(rna_feature_ids))
h5write(Genes, rna_h5_path, "matrix/features/feature_type")
Genome <- rep("hg38", length(rna_feature_ids))
h5write(Genome, rna_h5_path, "matrix/features/genome")
# The same row names are stored as both feature id and feature name.
h5write(rna_feature_ids, rna_h5_path, "matrix/features/id")
h5write(rna_feature_ids, rna_h5_path, "matrix/features/name")

# Sparse-matrix slots of the column-compressed matrix.
h5write(rna.data@i, rna_h5_path, "matrix/indices") # already zero-indexed.
h5write(rna.data@p, rna_h5_path, "matrix/indptr")
h5write(dim(rna.data), rna_h5_path, "matrix/shape")
h5closeAll()

# Re-open the finished file and list its structure without loading the data.
rna_file <- H5Fopen(rna_h5_path)
rna_h5 <- h5dump(rna_file, load = FALSE)
# Close the handle once the listing is captured so it does not leak; an open
# handle may keep other programs from opening the file.
H5Fclose(rna_file)

创建好的 .h5 文件结构如下,与 10X 提供的 .h5 文件结构一致:
R语言:利用rhdf5包分别创建单组学,多组学.h5文件_第3张图片
生成多组学.h5文件的代码如下:

library(rhdf5)
library(dplyr)
library(Seurat)
library(patchwork)
library(reticulate)
# Load both modalities from their 10X-style directories.
rna.data <- Read10X(data.dir = "Processed_dataset/RNA", gene.column = 1)
atac.data <- Read10X(data.dir = "Processed_dataset/ATAC", gene.column = 1)

length(colnames(rna.data))
cell_name <- colnames(rna.data)

# Only barcodes present in BOTH matrices can be paired, so sample from the
# intersection. When the two barcode sets are identical this is the same
# vector as cell_name.
shared_cells <- intersect(cell_name, colnames(atac.data))
if (length(shared_cells) == 0) {
  stop("RNA and ATAC matrices share no cell barcodes.", call. = FALSE)
}

# Keep a random 20% of the cells. The draw is unseeded; call set.seed()
# beforehand if a reproducible subset is needed.
new_cell_name <- sample(
  shared_cells,
  size = floor(0.2 * length(shared_cells))
)

# Subset both matrices by the SAME ordered name vector (RNA column order) so
# that column j refers to the same cell in each matrix. Filtering each matrix
# independently with %in% would leave them in their own, possibly different,
# column orders. drop = FALSE keeps a matrix even if one cell is selected.
keep_cells <- shared_cells[shared_cells %in% new_cell_name]
rna.data <- rna.data[, keep_cells, drop = FALSE]
atac.data <- atac.data[, keep_cells, drop = FALSE]
##########################################################################################
# Stack RNA features on top of ATAC features. rbind() keeps one set of column
# names without cross-checking them, so fail early if the two matrices are
# not column-aligned.
stopifnot(identical(colnames(rna.data), colnames(atac.data)))
multi.data <- rbind(rna.data, atac.data)

### Build the multi-omics .h5 file
# Layout written below: a "matrix" group holding barcodes, data, indices,
# indptr and shape, plus a "matrix/features" sub-group holding the
# per-feature annotations.
multi_h5_path <- "SNARE_cellline_train.h5"
h5createFile(multi_h5_path)

# Saving matrix information.
h5createGroup(multi_h5_path, "matrix")
h5write(colnames(multi.data), multi_h5_path, "matrix/barcodes")
h5write(multi.data@x, multi_h5_path, "matrix/data")

h5createGroup(multi_h5_path, "matrix/features")
# NOTE(review): "interval" is declared as a tag key, but no
# "matrix/features/interval" dataset is written below. Confirm whether the
# downstream reader requires it.
key <- c('genome', 'interval')
h5write(key, multi_h5_path, "matrix/features/_all_tag_keys")

# Feature types follow the rbind() order: all RNA rows first, then ATAC rows.
Genes <- rep('Gene Expression', length(rownames(rna.data)))
Peaks <- rep("Peaks", length(rownames(atac.data)))
Features <- c(Genes, Peaks)
h5write(Features, multi_h5_path, "matrix/features/feature_type")

multi_feature_ids <- rownames(multi.data)
Genome <- rep("GRCh38", length(multi_feature_ids))
h5write(Genome, multi_h5_path, "matrix/features/genome")
# The same row names are stored as both feature id and feature name.
h5write(multi_feature_ids, multi_h5_path, "matrix/features/id")
# cc <- c()
# h5write(cc, multi_h5_path, "matrix/features/interval")
h5write(multi_feature_ids, multi_h5_path, "matrix/features/name")

# Sparse-matrix slots of the column-compressed matrix.
h5write(multi.data@i, multi_h5_path, "matrix/indices") # already zero-indexed.
h5write(multi.data@p, multi_h5_path, "matrix/indptr")
h5write(dim(multi.data), multi_h5_path, "matrix/shape")
h5closeAll()

# Re-open the finished file and list its structure without loading the data.
multi_file <- H5Fopen(multi_h5_path)
multi_h5 <- h5dump(multi_file, load = FALSE)
# Close the handle once the listing is captured so it does not leak; an open
# handle may keep other programs from opening the file.
H5Fclose(multi_file)

你可能感兴趣的:(R,数据分析,r语言,生物信息)