一个样例数据:
> df
gene_id gene_symbol gene_class
1 ENSG00000000003.9 TSPAN6 coding
2 ENSG00000000003.9 TSPAN6 coding
41 ENSG00000000005.5 TNMD coding
79 ENSG00000000457.8 SCYL3 coding
microrna seed_pos seed_type
1 miR-132/212/212-3p chrX:99884666 8-mer
2 miR-133abc chrX:99884907 7-mer-A1
41 miR-93/93a/105/106a/291a-3p/294/295/302abcde/372/373/428/519a/520be/520acd-3p/1378/1420ac chrX:99840338 7-mer-m8
79 let-7/98/4458/4500 chr1:169822790 7-mer-A1
repeat. total_cons_. primates_cons_. mammals_cons_. vertebrates_cons_. tr_region
1 0 13 67 0 0 3pUTR,
2 0 11 33 9 0 3pUTR,
41 0 42 89 39 15 CDS,
79 0 76 100 91 31 3pUTR,
目标是将microrna的id进行拆分,得到下面的:
df_res$MicroRNA
[1] "miR-132-3p" "miR-212-3p" "miR-133abc" "miR-93-3p-3p"
[5] "miR-93-3p-1378" "miR-93-3p-1420ac" "miR-93-294-3p" "miR-93-294-1378"
[9] "miR-93-294-1420ac" "miR-93-295-3p" "miR-93-295-1378" "miR-93-295-1420ac"
[13] "miR-93-302abcde-3p" "miR-93-302abcde-1378" "miR-93-302abcde-1420ac" "miR-93-372-3p"
[17] "miR-93-372-1378" "miR-93-372-1420ac" "miR-93-373-3p" "miR-93-373-1378"
[21] "miR-93-373-1420ac" "miR-93-428-3p" "miR-93-428-1378" "miR-93-428-1420ac"
[25] "miR-93-519a-3p" "miR-93-519a-1378" "miR-93-519a-1420ac" "miR-93-520be-3p"
[29] "miR-93-520be-1378" "miR-93-520be-1420ac" "miR-93-520acd-3p" "miR-93-520acd-1378"
[33] "miR-93-520acd-1420ac" "miR-93a-3p-3p" "miR-93a-3p-1378" "miR-93a-3p-1420ac"
[37] "miR-93a-294-3p" "miR-93a-294-1378" "miR-93a-294-1420ac" "miR-93a-295-3p"
[41] "miR-93a-295-1378" "miR-93a-295-1420ac" "miR-93a-302abcde-3p" "miR-93a-302abcde-1378"
[45] "miR-93a-302abcde-1420ac" "miR-93a-372-3p" "miR-93a-372-1378" "miR-93a-372-1420ac"
[49] "miR-93a-373-3p" "miR-93a-373-1378" "miR-93a-373-1420ac" "miR-93a-428-3p"
[53] "miR-93a-428-1378" "miR-93a-428-1420ac" "miR-93a-519a-3p" "miR-93a-519a-1378"
[57] "miR-93a-519a-1420ac" "miR-93a-520be-3p" "miR-93a-520be-1378" "miR-93a-520be-1420ac"
[61] "miR-93a-520acd-3p" "miR-93a-520acd-1378" "miR-93a-520acd-1420ac" "miR-105-3p-3p"
[65] "miR-105-3p-1378" "miR-105-3p-1420ac" "miR-105-294-3p" "miR-105-294-1378"
[69] "miR-105-294-1420ac" "miR-105-295-3p" "miR-105-295-1378" "miR-105-295-1420ac"
[73] "miR-105-302abcde-3p" "miR-105-302abcde-1378" "miR-105-302abcde-1420ac" "miR-105-372-3p"
[77] "miR-105-372-1378" "miR-105-372-1420ac" "miR-105-373-3p" "miR-105-373-1378"
[81] "miR-105-373-1420ac" "miR-105-428-3p" "miR-105-428-1378" "miR-105-428-1420ac"
[85] "miR-105-519a-3p" "miR-105-519a-1378" "miR-105-519a-1420ac" "miR-105-520be-3p"
[89] "miR-105-520be-1378" "miR-105-520be-1420ac" "miR-105-520acd-3p" "miR-105-520acd-1378"
[93] "miR-105-520acd-1420ac" "miR-106a-3p-3p" "miR-106a-3p-1378" "miR-106a-3p-1420ac"
[97] "miR-106a-294-3p" "miR-106a-294-1378" "miR-106a-294-1420ac" "miR-106a-295-3p"
[101] "miR-106a-295-1378" "miR-106a-295-1420ac" "miR-106a-302abcde-3p" "miR-106a-302abcde-1378"
[105] "miR-106a-302abcde-1420ac" "miR-106a-372-3p" "miR-106a-372-1378" "miR-106a-372-1420ac"
[109] "miR-106a-373-3p" "miR-106a-373-1378" "miR-106a-373-1420ac" "miR-106a-428-3p"
[113] "miR-106a-428-1378" "miR-106a-428-1420ac" "miR-106a-519a-3p" "miR-106a-519a-1378"
[117] "miR-106a-519a-1420ac" "miR-106a-520be-3p" "miR-106a-520be-1378" "miR-106a-520be-1420ac"
[121] "miR-106a-520acd-3p" "miR-106a-520acd-1378" "miR-106a-520acd-1420ac" "miR-291a-3p-3p"
[125] "miR-291a-3p-1378" "miR-291a-3p-1420ac" "miR-291a-294-3p" "miR-291a-294-1378"
[129] "miR-291a-294-1420ac" "miR-291a-295-3p" "miR-291a-295-1378" "miR-291a-295-1420ac"
[133] "miR-291a-302abcde-3p" "miR-291a-302abcde-1378" "miR-291a-302abcde-1420ac" "miR-291a-372-3p"
[137] "miR-291a-372-1378" "miR-291a-372-1420ac" "miR-291a-373-3p" "miR-291a-373-1378"
[141] "miR-291a-373-1420ac" "miR-291a-428-3p" "miR-291a-428-1378" "miR-291a-428-1420ac"
[145] "miR-291a-519a-3p" "miR-291a-519a-1378" "miR-291a-519a-1420ac" "miR-291a-520be-3p"
[149] "miR-291a-520be-1378" "miR-291a-520be-1420ac" "miR-291a-520acd-3p" "miR-291a-520acd-1378"
[153] "miR-291a-520acd-1420ac" "let-7" "let-98" "let-4458"
[157] "let-4500"
方案:
- 先按 `-` 拆分为多列
- 再按 `/` 拆分为多行
- 然后合并列,用 `-` 分隔
代码:
library(tidyverse)
# Split every microRNA identifier on "-" into a character matrix, one column
# per piece (simplify = TRUE returns a matrix rather than a list).
pieces <- str_split(df$microrna, pattern = "-", simplify = TRUE)

# Name the columns V1..Vn explicitly: converting an unnamed matrix with
# as_tibble() only produces these names through a deprecated compatibility
# path that emits a warning.
colnames(pieces) <- paste0("V", seq_len(ncol(pieces)))

# Put the piece columns in front of the original columns.
df <- as_tibble(pieces) %>%
  bind_cols(df)
#' Expand delimiter-packed cells into one row per value.
#'
#' Every column of `df` whose name matches `pattern` is inspected. If any cell
#' of that column contains `sep`, the column is split on `sep` and each piece
#' gets its own row, with the other columns repeated. Columns are processed one
#' after another, so a row with packed values in several columns ends up as
#' every combination of those values.
#'
#' @param df A data frame or tibble.
#' @param pattern Regular expression selecting, by name, the columns to expand.
#' @param sep Regular expression used as the delimiter inside a cell.
#' @return `df` with the selected columns expanded into rows; returned
#'   unchanged when no selected column contains `sep`.
iter_split <- function(df, pattern = "^V", sep = "/") {
  iter_cols <- grep(pattern, colnames(df), value = TRUE)
  res <- df
  for (col in iter_cols) {
    # na.rm = TRUE: str_detect() gives NA for NA cells, and any() over only
    # NA/FALSE would be NA, which `if` cannot evaluate.
    if (any(stringr::str_detect(df[[col]], pattern = sep), na.rm = TRUE)) {
      # Pass `sep` through so only the requested delimiter is split on; the
      # tidyr default would split on any run of non-alphanumeric characters.
      res <- tidyr::separate_rows(res, tidyr::all_of(col), sep = sep)
    }
  }
  res
}
# Show the table with the "-"-split piece columns attached.
df

# Expand the "/"-packed piece columns into one row per combination.
df_res <- iter_split(df)
df_res

# Rebuild one identifier per row: take the row's V* cells, discard the empty
# strings, and join what is left with "-".
df_res$MicroRNA <- apply(
  df_res[grepl("^V", colnames(df_res))],
  1,
  function(pieces) {
    kept <- pieces[pieces != ""]
    paste(kept, collapse = "-")
  }
)

# Keep only the first occurrence of each fully identical row.
df_res <- df_res[!duplicated(df_res), , drop = FALSE]

# Alternative using tidyr::unite(), kept for reference and not run. Unlike the
# apply() call above, it does not drop empty cells before joining.
# df_res %>%
#   unite(MicroRNA, starts_with("V"), sep = "-") %>%
#   pull(MicroRNA)
上面的方案搞错了:只有斜线(/)才是分隔符,连字符(-)是名称的一部分,不应拆分。下面补充正确的处理方式。
# Treat only "/" as a separator. The first three characters of `microrna` are
# taken as the prefix and everything from the fifth character on as the list
# of "/"-separated members; each member gets its own row and is then
# re-attached to the prefix with "-".
df <- df %>%
  mutate(
    Prefix = str_sub(microrna, 1, 3),
    V2 = str_sub(microrna, 5)
  ) %>%
  separate_rows(V2, sep = "/") %>%
  mutate(res = paste0(Prefix, "-", V2))
> head(df)
gene_id gene_symbol gene_class microrna
1 ENSG00000000003.9 TSPAN6 coding miR-132/212/212-3p
2 ENSG00000000003.9 TSPAN6 coding miR-132/212/212-3p
3 ENSG00000000003.9 TSPAN6 coding miR-132/212/212-3p
4 ENSG00000000003.9 TSPAN6 coding miR-133abc
5 ENSG00000000005.5 TNMD coding miR-93/93a/105/106a/291a-3p/294/295/302abcde/372/373/428/519a/520be/520acd-3p/1378/1420ac
6 ENSG00000000005.5 TNMD coding miR-93/93a/105/106a/291a-3p/294/295/302abcde/372/373/428/519a/520be/520acd-3p/1378/1420ac
seed_pos seed_type repeat. total_cons_. primates_cons_. mammals_cons_. vertebrates_cons_. tr_region Prefix V2 res
1 chrX:99884666 8-mer 0 13 67 0 0 3pUTR, miR 132 miR-132
2 chrX:99884666 8-mer 0 13 67 0 0 3pUTR, miR 212 miR-212
3 chrX:99884666 8-mer 0 13 67 0 0 3pUTR, miR 212-3p miR-212-3p
4 chrX:99884907 7-mer-A1 0 11 33 9 0 3pUTR, miR 133abc miR-133abc
5 chrX:99840338 7-mer-m8 0 42 89 39 15 CDS, miR 93 miR-93
6 chrX:99840338 7-mer-m8 0 42 89 39 15 CDS, miR 93a miR-93a