【r<-方案】数据框ID拆分

一个样例数据:

> df
             gene_id gene_symbol gene_class
1  ENSG00000000003.9      TSPAN6     coding
2  ENSG00000000003.9      TSPAN6     coding
41 ENSG00000000005.5        TNMD     coding
79 ENSG00000000457.8       SCYL3     coding
                                                                                    microrna       seed_pos seed_type
1                                                                         miR-132/212/212-3p  chrX:99884666     8-mer
2                                                                                 miR-133abc  chrX:99884907  7-mer-A1
41 miR-93/93a/105/106a/291a-3p/294/295/302abcde/372/373/428/519a/520be/520acd-3p/1378/1420ac  chrX:99840338  7-mer-m8
79                                                                        let-7/98/4458/4500 chr1:169822790  7-mer-A1
   repeat. total_cons_. primates_cons_. mammals_cons_. vertebrates_cons_. tr_region
1        0           13              67              0                  0    3pUTR,
2        0           11              33              9                  0    3pUTR,
41       0           42              89             39                 15      CDS,
79       0           76             100             91                 31    3pUTR,

目标是将 microrna 列中用“/”合并在一起的 ID 拆分成单独的 ID。下面是第一种方案得到的结果(该方案把“-”也当作分隔符,后文已说明这是错误的):

 df_res$MicroRNA
  [1] "miR-132-3p"               "miR-212-3p"               "miR-133abc"               "miR-93-3p-3p"            
  [5] "miR-93-3p-1378"           "miR-93-3p-1420ac"         "miR-93-294-3p"            "miR-93-294-1378"         
  [9] "miR-93-294-1420ac"        "miR-93-295-3p"            "miR-93-295-1378"          "miR-93-295-1420ac"       
 [13] "miR-93-302abcde-3p"       "miR-93-302abcde-1378"     "miR-93-302abcde-1420ac"   "miR-93-372-3p"           
 [17] "miR-93-372-1378"          "miR-93-372-1420ac"        "miR-93-373-3p"            "miR-93-373-1378"         
 [21] "miR-93-373-1420ac"        "miR-93-428-3p"            "miR-93-428-1378"          "miR-93-428-1420ac"       
 [25] "miR-93-519a-3p"           "miR-93-519a-1378"         "miR-93-519a-1420ac"       "miR-93-520be-3p"         
 [29] "miR-93-520be-1378"        "miR-93-520be-1420ac"      "miR-93-520acd-3p"         "miR-93-520acd-1378"      
 [33] "miR-93-520acd-1420ac"     "miR-93a-3p-3p"            "miR-93a-3p-1378"          "miR-93a-3p-1420ac"       
 [37] "miR-93a-294-3p"           "miR-93a-294-1378"         "miR-93a-294-1420ac"       "miR-93a-295-3p"          
 [41] "miR-93a-295-1378"         "miR-93a-295-1420ac"       "miR-93a-302abcde-3p"      "miR-93a-302abcde-1378"   
 [45] "miR-93a-302abcde-1420ac"  "miR-93a-372-3p"           "miR-93a-372-1378"         "miR-93a-372-1420ac"      
 [49] "miR-93a-373-3p"           "miR-93a-373-1378"         "miR-93a-373-1420ac"       "miR-93a-428-3p"          
 [53] "miR-93a-428-1378"         "miR-93a-428-1420ac"       "miR-93a-519a-3p"          "miR-93a-519a-1378"       
 [57] "miR-93a-519a-1420ac"      "miR-93a-520be-3p"         "miR-93a-520be-1378"       "miR-93a-520be-1420ac"    
 [61] "miR-93a-520acd-3p"        "miR-93a-520acd-1378"      "miR-93a-520acd-1420ac"    "miR-105-3p-3p"           
 [65] "miR-105-3p-1378"          "miR-105-3p-1420ac"        "miR-105-294-3p"           "miR-105-294-1378"        
 [69] "miR-105-294-1420ac"       "miR-105-295-3p"           "miR-105-295-1378"         "miR-105-295-1420ac"      
 [73] "miR-105-302abcde-3p"      "miR-105-302abcde-1378"    "miR-105-302abcde-1420ac"  "miR-105-372-3p"          
 [77] "miR-105-372-1378"         "miR-105-372-1420ac"       "miR-105-373-3p"           "miR-105-373-1378"        
 [81] "miR-105-373-1420ac"       "miR-105-428-3p"           "miR-105-428-1378"         "miR-105-428-1420ac"      
 [85] "miR-105-519a-3p"          "miR-105-519a-1378"        "miR-105-519a-1420ac"      "miR-105-520be-3p"        
 [89] "miR-105-520be-1378"       "miR-105-520be-1420ac"     "miR-105-520acd-3p"        "miR-105-520acd-1378"     
 [93] "miR-105-520acd-1420ac"    "miR-106a-3p-3p"           "miR-106a-3p-1378"         "miR-106a-3p-1420ac"      
 [97] "miR-106a-294-3p"          "miR-106a-294-1378"        "miR-106a-294-1420ac"      "miR-106a-295-3p"         
[101] "miR-106a-295-1378"        "miR-106a-295-1420ac"      "miR-106a-302abcde-3p"     "miR-106a-302abcde-1378"  
[105] "miR-106a-302abcde-1420ac" "miR-106a-372-3p"          "miR-106a-372-1378"        "miR-106a-372-1420ac"     
[109] "miR-106a-373-3p"          "miR-106a-373-1378"        "miR-106a-373-1420ac"      "miR-106a-428-3p"         
[113] "miR-106a-428-1378"        "miR-106a-428-1420ac"      "miR-106a-519a-3p"         "miR-106a-519a-1378"      
[117] "miR-106a-519a-1420ac"     "miR-106a-520be-3p"        "miR-106a-520be-1378"      "miR-106a-520be-1420ac"   
[121] "miR-106a-520acd-3p"       "miR-106a-520acd-1378"     "miR-106a-520acd-1420ac"   "miR-291a-3p-3p"          
[125] "miR-291a-3p-1378"         "miR-291a-3p-1420ac"       "miR-291a-294-3p"          "miR-291a-294-1378"       
[129] "miR-291a-294-1420ac"      "miR-291a-295-3p"          "miR-291a-295-1378"        "miR-291a-295-1420ac"     
[133] "miR-291a-302abcde-3p"     "miR-291a-302abcde-1378"   "miR-291a-302abcde-1420ac" "miR-291a-372-3p"         
[137] "miR-291a-372-1378"        "miR-291a-372-1420ac"      "miR-291a-373-3p"          "miR-291a-373-1378"       
[141] "miR-291a-373-1420ac"      "miR-291a-428-3p"          "miR-291a-428-1378"        "miR-291a-428-1420ac"     
[145] "miR-291a-519a-3p"         "miR-291a-519a-1378"       "miR-291a-519a-1420ac"     "miR-291a-520be-3p"       
[149] "miR-291a-520be-1378"      "miR-291a-520be-1420ac"    "miR-291a-520acd-3p"       "miR-291a-520acd-1378"    
[153] "miR-291a-520acd-1420ac"   "let-7"                    "let-98"                   "let-4458"                
[157] "let-4500"  

方案:

  • 先按-拆分为多列
  • 再按/拆分为多行
  • 然后合并列,用-分隔

代码:

library(tidyverse)
# Split every microrna ID on "-" into a character matrix (one column per
# piece; shorter IDs are padded with ""), then prepend those pieces to df.
microrna_parts <- str_split(df$microrna, pattern = "-", simplify = TRUE)
# Name the columns V1, V2, ... explicitly: converting an unnamed matrix with
# as_tibble() only produces these names through a deprecated compatibility
# path that warns, and the later steps select the piece columns by their
# "V" prefix.
colnames(microrna_parts) <- paste0("V", seq_len(ncol(microrna_parts)))
df <- microrna_parts %>%
    as_tibble() %>%
    bind_cols(df)


# Expand delimiter-joined values into one row per value.
#
# For every column of `df` whose name matches `pattern`, values containing
# `sep` are split on `sep` and the row is repeated once per piece. Columns
# are expanded one after another, so a row with several multi-valued columns
# ends up as the cross product of their pieces.
#
# Args:
#   df:      Data frame (or tibble) to expand.
#   pattern: Regular expression selecting the columns to expand, by name.
#   sep:     Regular expression for the delimiter between values.
#
# Returns:
#   `df` with the matching columns expanded into rows; `df` unchanged when
#   no matching column contains `sep`.
iter_split <- function(df, pattern = "^V", sep = "/") {
    iter_cols <- grep(pattern, colnames(df), value = TRUE)

    for (col in iter_cols) {
        # na.rm = TRUE: a column with NA and no match would otherwise make
        # the condition NA, which `if` rejects with an error.
        if (any(str_detect(df[[col]], pattern = sep), na.rm = TRUE)) {
            # Forward `sep` to the split. Without it the split uses the
            # default delimiter (any run of characters that are neither
            # alphanumeric nor "."), so `sep` would only affect detection.
            df <- separate_rows(df, all_of(col), sep = sep)
        }
    }

    df
}

# Show the table before expansion.
df
# Expand the "/"-joined pieces into rows and show the expanded table.
df_res <- iter_split(df)
df_res


# Rebuild one ID per row: take the V* piece columns, drop the "" padding
# left by the "-" split, and join the remaining pieces with "-".
df_res$MicroRNA <- apply(
    df_res[startsWith(colnames(df_res), "V")],
    MARGIN = 1,
    FUN = function(pieces) paste(pieces[pieces != ""], collapse = "-")
)

# Keep each distinct row only once.
df_res <- unique(df_res)

# Alternative kept for reference (NOTE: unite() pastes every V* column, so it
# would presumably keep the "" padding instead of dropping it):
# df_res %>% 
#     unite(MicroRNA, starts_with("V"), sep = "-") %>% 
#     pull(MicroRNA)

上面的方案有误:只有斜线(/)才是分隔符,连字符(-)是 ID 的一部分,不应拆分。下面记录正确的处理方式。

# Corrected approach: only "/" separates alternative IDs. The text before
# the first "-" is the family prefix (e.g. "miR", "let") shared by every
# alternative; everything after it is the "/"-joined list of suffixes.
#
# The prefix is located by the first "-" rather than by fixed character
# positions, so prefixes that are not exactly three characters long are
# handled too. An ID without any "-" keeps its full text as `res`.
# A missing microrna yields NA in `res`.
#
# NOTE(review): this adds Prefix, V2 and res to df and would overwrite an
# existing V2 column — presumably meant to run on the original df, not on
# the "-"-split table built earlier; confirm.
df <- df %>%
  mutate(
    Prefix = str_extract(microrna, "^[^-]*"),
    V2 = str_remove(microrna, "^[^-]*-?")
  ) %>%
  separate_rows(V2, sep = "/") %>%
  # Do not append a dangling "-" when there is no suffix.
  mutate(res = if_else(V2 == "", Prefix, paste(Prefix, V2, sep = "-")))

> head(df)
            gene_id gene_symbol gene_class                                                                                  microrna
1 ENSG00000000003.9      TSPAN6     coding                                                                        miR-132/212/212-3p
2 ENSG00000000003.9      TSPAN6     coding                                                                        miR-132/212/212-3p
3 ENSG00000000003.9      TSPAN6     coding                                                                        miR-132/212/212-3p
4 ENSG00000000003.9      TSPAN6     coding                                                                                miR-133abc
5 ENSG00000000005.5        TNMD     coding miR-93/93a/105/106a/291a-3p/294/295/302abcde/372/373/428/519a/520be/520acd-3p/1378/1420ac
6 ENSG00000000005.5        TNMD     coding miR-93/93a/105/106a/291a-3p/294/295/302abcde/372/373/428/519a/520be/520acd-3p/1378/1420ac
       seed_pos seed_type repeat. total_cons_. primates_cons_. mammals_cons_. vertebrates_cons_. tr_region Prefix     V2        res
1 chrX:99884666     8-mer       0           13              67              0                  0    3pUTR,    miR    132    miR-132
2 chrX:99884666     8-mer       0           13              67              0                  0    3pUTR,    miR    212    miR-212
3 chrX:99884666     8-mer       0           13              67              0                  0    3pUTR,    miR 212-3p miR-212-3p
4 chrX:99884907  7-mer-A1       0           11              33              9                  0    3pUTR,    miR 133abc miR-133abc
5 chrX:99840338  7-mer-m8       0           42              89             39                 15      CDS,    miR     93     miR-93
6 chrX:99840338  7-mer-m8       0           42              89             39                 15      CDS,    miR    93a    miR-93a

你可能感兴趣的:(【r<-方案】数据框ID拆分)