这里是网页版获取DNA序列,下载保存后可以用read.fasta
打开
##########################
用SeqinR包获取序列并进行统计
##########################
比如,在NCBI获取NC_001477登革病毒的基因组序列,
安装加载seqinr
包
install.packages("seqinr")
library(seqinr)
choosebank()#list all the sub-database in ACNUC
> choosebank()
[1] "genbank" "embl" "emblwgs" "swissprot"
[5] "ensembl" "hogenom7" "hogenom" "hogenomdna"
[9] "hovergendna" "hovergen" "hogenom5" "hogenom5dna"
[13] "hogenom4" "hogenom4dna" "homolens" "homolensdna"
[17] "hobacnucl" "hobacprot" "phever2" "phever2dna"
[21] "refseq" "refseq16s" "greviews" "bacterial"
[25] "archaeal" "protozoan" "ensprotists" "ensfungi"
[29] "ensmetazoa" "ensplants" "ensemblbacteria" "mito"
[33] "polymorphix" "emglib" "refseqViruses" "ribodb"
[37] "taxodb"
"genebank"包含DNA和RNA序列数据库
“refseq”包含"Refseq”中DNA和RNA
"refseqViruses”包含Refseq中病毒的DNA,RNA和蛋白序列
更详细的见http://doua.prabi.fr/databases/acnuc
比如要获取DEN-1登革病毒基因组序列,accesion number NC_001477
1 构造一个函数,由Accession number直接下载所需要的序列
getncbiseq <- function(accession)
{
require("seqinr") # this function requires the SeqinR R package
# first find which ACNUC database the accession is stored in:
dbs <- c("genbank","refseq","refseqViruses","bacterial")
numdbs <- length(dbs)
for (i in 1:numdbs)
{
db <- dbs[i]
choosebank(db)
# check if the sequence is in ACNUC database 'db':
resquery <- try(query(".tmpquery", paste("AC=", accession)), silent = TRUE)
if (!(inherits(resquery, "try-error")))
{
queryname <- "query2"
thequery <- paste("AC=",accession,sep="")
query2 <- query(queryname, thequery)
# see if a sequence was retrieved:
seq <- getSequence(query2$req[[1]])
closebank()
return(seq)
}
closebank()
}
print(paste("ERROR: accession",accession,"was not found"))
}
2.根据accession number下载序列
dengueseq <- getncbiseq("NC_001477")
> dengueseq
[1] "a" "g" "t" "t" "g" "t" "t" "a" "g" "t" "c" "t" "a" "c" "g" "t" "g" "g" "a"
[20] "c" "c" "g" "a" "c" "a" "a" "g" "a" "a" "c" "a" "g" "t" "t" "t" "c" "g" "a"
[39] "a" "t" "c" "g" "g" "a" "a" "g" "c" "t" "t" "g" "c" "t" "t" "a" "a" "c" "g"
[58] "t" "a" "g" "t" "t" "c" "t" "a" "a" "c" "a" "g" "t" "t" "t" "t" "t" "t" "a"
[77] "t" "t" "a" "g" "a" "g" "a" "g" "c" "a" "g" "a" "t" "c" "t" "c" "t" "g" "a"
[96] "t" "g" "a" "a" "c" "a" "a" "c" "c" "a" "a" "c" "g" "g" "a" "a" "a" "a" "a"
3 输出fasta格式文件
write.fasta(names="DEN-1", sequences=dengueseq, file.out="den1.fasta")
4读入,如果通过网页直接下载序列,可以直接从这一步开始进行导入
dengue <- read.fasta(file = "den1.fasta")
dengueseq <- dengue[[1]]
5.1DNA length
> length(dengueseq)
[1] 10735
5.2Base composition
> table(dengueseq)
dengueseq
a c g t
3426 2240 2770 2299
等同于
count(dengueseq,1)
a c g t
3426 2240 2770 2299
5.3GC Content
> GC(dengueseq)
[1] 0.4666977
5.4 DNA words
> count(dengueseq,2)
aa ac ag at ca cc cg ct ga gc gg gt ta tc tg tt
1108 720 890 708 901 523 261 555 976 500 787 507 440 497 832 529
5.5写一个函数计算AT含量
AT <- function(inputseq)
{
mytable <- count(inputseq,1)#make a table with the count of As,Cs,Gs,Ts
mylength <- length(inputseq)#find the length of the whole sequence
myAs <- mytable[[1]]#number of As in the sequence
myTs <- mytable[[4]]#nmmber of Ts in the sequence
myAT <- (myAs+myTs)/mylength
return(myAT)
}
计算denggueseq的AT含量
> AT(dengueseq)
[1] 0.5333023
5.6统计三联体密码一共多少个
> count(dengueseq,3) %>% sum
[1] 10733
参考文章:A Little Book of R For Bioinformatics, Release 0.1