经过前几天的学习,终于到了文本处理完结篇,就是站在前人的肩膀上处理自己的数据,如下黄框中的数据(GSE178341),处理数据的过程艰辛异常,主要是没有R语言的思维,代码看得懂,但是不会用,只能照葫芦画瓢。
打开GEO界面,一共181个样本,所有的样本都在1个hd5文件中,从181个样本中统计其中8个样本,问题:1、怎样读取hd5文件 2、拆分逻辑(和上篇有异曲同工之妙)
OK,让我们开始代码之旅吧!
下载和读取数据
1、下载数据
wget https://ftp.ncbi.nlm.nih.gov/geo/series/GSE178nnn/GSE178341/suppl/GSE178341_crc10x_full_c295v4_submit.h5
2、读取hd5文件
tips:关于hdf5r包的安装,之前无论用install还是BiocManager都缺少文件,后来险中求胜,用了conda安装,引用了下述的安装网址
R Hdf5R :: Anaconda.org
上篇有说过,在 Seurat 中,对于 Cellranger 数据的导入,它提供了两个函数 Read10X 和 Read10X_h5。前者用于读取 Cellranger 生成的 3 个文件:barcodes.tsv,features.tsv,matrix.mtx,后者则用于读取生成的 xxxxxx.h5文件,这里我们应用Read10X_h5函数就行
R
library(Seurat)
a <- Read10X_h5("/home/shpc_a98ef85f8f/SingleronTest/wuxuan/Jun/GSE178341_crc10x_full_c295v4_submit.h5")
a <- CreateSeuratObject(a)
head([email protected], 8)
orig.ident nCount_RNA nFeature_RNA
C103_T_1_1_0_c1_v2_id-AAACCTGCATGCTAGT C103 15093 2953
C103_T_1_1_0_c1_v2_id-AAACCTGGTAGCCTAT C103 1156 407
C103_T_1_1_0_c1_v2_id-AAACCTGGTTGTCGCG C103 50243 6522
C103_T_1_1_0_c1_v2_id-AAACCTGTCATGTGGT C103 16254 3363
C103_T_1_1_0_c1_v2_id-AAACCTGTCCTTGGTC C103 42007 6151
C103_T_1_1_0_c1_v2_id-AAACCTGTCGGATGTT C103 1937 898
C103_T_1_1_0_c1_v2_id-AAACCTGTCGTACGGC C103 1812 659
C103_T_1_1_0_c1_v2_id-AAACCTGTCTTGCAAG C103 6816 1300
tail([email protected], 8)
orig.ident nCount_RNA nFeature_RNA
C173_T_0_0_0_c1_v3_id-TTTGGAGAGGGCGAGA C173 1400 570
C173_T_0_0_0_c1_v3_id-TTTGGAGAGTGATAAC C173 1676 860
C173_T_0_0_0_c1_v3_id-TTTGGAGCAGCACGAA C173 10529 2727
C173_T_0_0_0_c1_v3_id-TTTGGAGTCATCGGGC C173 10047 3009
C173_T_0_0_0_c1_v3_id-TTTGGAGTCTAGTGTG C173 19533 4219
C173_T_0_0_0_c1_v3_id-TTTGTTGCAGCAATTC C173 1723 772
C173_T_0_0_0_c1_v3_id-TTTGTTGGTTCTGAGT C173 1267 506
C173_T_0_0_0_c1_v3_id-TTTGTTGTCGTTCCCA C173 39137 5151
table([email protected]$orig.ident) #查看细胞数?
C103 C104 C105 C106 C107 C109 C110 C111 C112 C113 C114 C115 C116
6096 2886 3281 3392 7215 4244 5033 5044 6425 5088 4553 1995 5234
C118 C119 C122 C123 C124 C125 C126 C129 C130 C132 C133 C134 C135
3211 1534 7060 14572 16945 12660 12914 12577 11327 7570 3853 7675 5621
C136 C137 C138 C139 C140 C142 C143 C144 C145 C146 C147 C149 C150
6609 5774 3408 10468 6179 5140 6359 3899 3091 4731 3801 4015 2418
C151 C152 C153 C154 C155 C156 C157 C158 C159 C160 C161 C162 C163
3873 3224 4860 3049 6448 3975 8152 2352 2437 3826 3755 22587 4717
C164 C165 C166 C167 C168 C169 C170 C171 C172 C173
3760 4524 5527 1443 2552 2245 12522 16199 1542 2649
拆分数据
1、利用tidyverse函数拆分数据
library(tidyverse)
[email protected]$BC <- rownames([email protected]) #增加一列BC
head([email protected])
orig.ident nCount_RNA nFeature_RNA
C103_T_1_1_0_c1_v2_id-AAACCTGCATGCTAGT C103 15093 2953
C103_T_1_1_0_c1_v2_id-AAACCTGGTAGCCTAT C103 1156 407
C103_T_1_1_0_c1_v2_id-AAACCTGGTTGTCGCG C103 50243 6522
C103_T_1_1_0_c1_v2_id-AAACCTGTCATGTGGT C103 16254 3363
C103_T_1_1_0_c1_v2_id-AAACCTGTCCTTGGTC C103 42007 6151
C103_T_1_1_0_c1_v2_id-AAACCTGTCGGATGTT C103 1937 898
BC
C103_T_1_1_0_c1_v2_id-AAACCTGCATGCTAGT C103_T_1_1_0_c1_v2_id-AAACCTGCATGCTAGT
C103_T_1_1_0_c1_v2_id-AAACCTGGTAGCCTAT C103_T_1_1_0_c1_v2_id-AAACCTGGTAGCCTAT
C103_T_1_1_0_c1_v2_id-AAACCTGGTTGTCGCG C103_T_1_1_0_c1_v2_id-AAACCTGGTTGTCGCG
C103_T_1_1_0_c1_v2_id-AAACCTGTCATGTGGT C103_T_1_1_0_c1_v2_id-AAACCTGTCATGTGGT
C103_T_1_1_0_c1_v2_id-AAACCTGTCCTTGGTC C103_T_1_1_0_c1_v2_id-AAACCTGTCCTTGGTC
C103_T_1_1_0_c1_v2_id-AAACCTGTCGGATGTT C103_T_1_1_0_c1_v2_id-AAACCTGTCGGATGTT
2、继续拆分
str_split(a$BC,'_id-',simplify=T)
[49995,] "C114_N_1_1_2_c1_v2" "TAAGCGTTCGTACGGC"
[49996,] "C114_N_1_1_2_c1_v2" "TAAGTGCCACGGTAAG"
[49997,] "C114_N_1_1_2_c1_v2" "TAAGTGCCAGCTGTAT"
[49998,] "C114_N_1_1_2_c1_v2" "TAAGTGCGTTACGTCA"
[49999,] "C114_N_1_1_2_c1_v2" "TAAGTGCTCATCTGCC"
[ reached getOption("max.print") -- omitted 320116 rows ]
str_split(a$BC,'_id-',simplify=T) %>% head()
[,1] [,2]
[1,] "C103_T_1_1_0_c1_v2" "AAACCTGCATGCTAGT"
[2,] "C103_T_1_1_0_c1_v2" "AAACCTGGTAGCCTAT"
[3,] "C103_T_1_1_0_c1_v2" "AAACCTGGTTGTCGCG"
[4,] "C103_T_1_1_0_c1_v2" "AAACCTGTCATGTGGT"
[5,] "C103_T_1_1_0_c1_v2" "AAACCTGTCCTTGGTC"
[6,] "C103_T_1_1_0_c1_v2" "AAACCTGTCGGATGTT"
a$Sample<- str_split(a$BC,'_id-',simplify=T)[,1]
a$rawBC <- str_split(a$BC,'_id-',simplify=T)[,2]
head([email protected])
orig.ident nCount_RNA nFeature_RNA
C103_T_1_1_0_c1_v2_id-AAACCTGCATGCTAGT C103 15093 2953
C103_T_1_1_0_c1_v2_id-AAACCTGGTAGCCTAT C103 1156 407
C103_T_1_1_0_c1_v2_id-AAACCTGGTTGTCGCG C103 50243 6522
C103_T_1_1_0_c1_v2_id-AAACCTGTCATGTGGT C103 16254 3363
C103_T_1_1_0_c1_v2_id-AAACCTGTCCTTGGTC C103 42007 6151
C103_T_1_1_0_c1_v2_id-AAACCTGTCGGATGTT C103 1937 898
BC
C103_T_1_1_0_c1_v2_id-AAACCTGCATGCTAGT C103_T_1_1_0_c1_v2_id-AAACCTGCATGCTAGT
C103_T_1_1_0_c1_v2_id-AAACCTGGTAGCCTAT C103_T_1_1_0_c1_v2_id-AAACCTGGTAGCCTAT
C103_T_1_1_0_c1_v2_id-AAACCTGGTTGTCGCG C103_T_1_1_0_c1_v2_id-AAACCTGGTTGTCGCG
C103_T_1_1_0_c1_v2_id-AAACCTGTCATGTGGT C103_T_1_1_0_c1_v2_id-AAACCTGTCATGTGGT
C103_T_1_1_0_c1_v2_id-AAACCTGTCCTTGGTC C103_T_1_1_0_c1_v2_id-AAACCTGTCCTTGGTC
C103_T_1_1_0_c1_v2_id-AAACCTGTCGGATGTT C103_T_1_1_0_c1_v2_id-AAACCTGTCGGATGTT
Sample rawBC
C103_T_1_1_0_c1_v2_id-AAACCTGCATGCTAGT C103_T_1_1_0_c1_v2 AAACCTGCATGCTAGT
C103_T_1_1_0_c1_v2_id-AAACCTGGTAGCCTAT C103_T_1_1_0_c1_v2 AAACCTGGTAGCCTAT
C103_T_1_1_0_c1_v2_id-AAACCTGGTTGTCGCG C103_T_1_1_0_c1_v2 AAACCTGGTTGTCGCG
C103_T_1_1_0_c1_v2_id-AAACCTGTCATGTGGT C103_T_1_1_0_c1_v2 AAACCTGTCATGTGGT
C103_T_1_1_0_c1_v2_id-AAACCTGTCCTTGGTC C103_T_1_1_0_c1_v2 AAACCTGTCCTTGGTC
C103_T_1_1_0_c1_v2_id-AAACCTGTCGGATGTT C103_T_1_1_0_c1_v2 AAACCTGTCGGATGTT
Leo <- SplitObject(a,split.by='Sample')
Leo
$C172_T_0_0_0_c1_v3
An object of class Seurat
43113 features across 1542 samples within 1 assay
Active assay: RNA (43113 features, 0 variable features)
$C173_T_0_0_0_c1_v3
An object of class Seurat
43113 features across 2649 samples within 1 assay
Active assay: RNA (43113 features, 0 variable features)
至此,数据拆分结束
统计数据
Leo$C162_N_0_0_0_c2_v2 -> GSM5388125
median(GSM5388125 @meta.data$nFeature_RNA)
[1] 944
median(GSM5388125 @meta.data$nCount_RNA)
[1] 3715.5
GSM5388125[["percent.mt"]] <- PercentageFeatureSet( GSM5388125, pattern = "^MT-")
median([email protected]$percent.mt)
[1] 5.834534
MT5 <- subset(GSM5388125,percent.mt < 5)
MT5
An object of class Seurat
43113 features across 1357 samples within 1 assay
Active assay: RNA (43113 features, 0 variable features)
MT30 <- subset(GSM5388125,percent.mt < 30)
MT30
An object of class Seurat
43113 features across 2880 samples within 1 assay
Active assay: RNA (43113 features, 0 variable features)
终于统计完了 这周KPI圆满结束 ,下周继续更新富集分析
周末愉快