测序后直接对fastq进行分析

##### Base frequency plot in R #####

library(Biostrings)

# Read the FASTQ file as a DNAStringSet.
fastq <- readDNAStringSet("Sg2-30-filter-reverse-R2.fastq", format = "fastq")

## Base frequency at each read position.
## as.prob = TRUE requests proportions instead of raw counts. With
## baseOnly = TRUE the matrix is expected to hold rows A, C, G, T plus a
## fifth catch-all row (NOTE(review): confirm against the Biostrings docs).
afmc <- consensusMatrix(fastq, baseOnly = TRUE, as.prob = TRUE)

# Transpose so that every letter is a column; matplot() draws one line per
# column.
tafmc <- t(afmc)

# Drop the fifth column and derive the line styles from what is actually
# plotted, so the plot and its legend always agree.
plot_data <- tafmc[, -5]
line_styles <- seq_len(ncol(plot_data))

matplot(
  plot_data,
  main = "Sg2-30-R2",
  type = "l",
  lwd = 2,
  col = line_styles,
  lty = line_styles,
  xlab = "Read Length",
  ylab = "Base frequency at each position",
  cex.lab = 1.1
)

legend(
  "top",
  legend = colnames(plot_data),
  col = line_styles,
  lty = line_styles,
  lwd = 2
)

#### Method 2: seqTools ####

library(seqTools)

# Read the gzipped FASTQ file with fastqq().
fq <- fastqq("Sg2-30_L2_P707505.R1.clean.fastq.gz")

# Plot the nucleotide frequencies; the second argument presumably selects
# the first (and only) file held in fq -- TODO confirm.
plotNucFreq(fq, 1)

#### end ####

## Trim from the 5' end ##
# Removes the 5' adapter ACGGCG (minimum overlap 3) using 15 cores.

cutadapt --cores 15 --front ACGGCG --overlap 3 Sg2-0_L2_P706504.R1.clean.fastq > Sg2-0-cutleft.fastq

## Trim from the 3' end ##
# Removes the 3' adapter AGATCGGAAGA (minimum overlap 3) from the output of
# the previous step.

cutadapt --cores 15 --adapter AGATCGGAAGA --overlap 3 Sg2-0-cutleft.fastq > Sg2-0-cut-R1.fastq

### Filter reads by length ###
# Consumes the FASTQ four lines at a time and keeps only the records whose
# sequence line is between 2 and 25 characters long (inclusive).

awk '
  BEGIN { OFS = "\n" }
  {
    id = $0
    getline bases
    getline plus
    getline quals
    if (length(bases) >= 2 && length(bases) <= 25) {
      print id, bases, plus, quals
    }
  }
' < your.fastq > filtered.fastq

## Reverse-complement the reads ##
# -r reverses and -p complements each sequence. The flags must be separated
# from the input file name by a space; otherwise "-r" becomes part of the
# file name and the reverse step is silently lost.

seqkit seq -r -p Sg2-30-filter-R2.fastq > Sg2-30-filter-reverse-R2.fastq

### Remove a fixed number of bases from either end of each read ###
# A positive --cut value trims from the start of the read, a negative value
# trims from the end; here the last 2 bases are removed.

cutadapt --cores 18 --cut=-2 --output findmotif-sg2-30-R2.fastq motif-sg2-30-R2.fastq

#### Convert a FASTQ file to FASTA ####
# The -o flag must be separated from the input file name by a space;
# without it the input name becomes "findmotif-sg2-5-R2.fastq-o" and the
# output file is treated as a second input.

seqkit fq2fa findmotif-sg2-5-R2.fastq -o findmotif-sg2-5-R2.fa

#### Intersection of two files (duplicates kept) ####
# Every line of file1.txt is used as a fixed string; lines of file2.txt
# containing any of them are written to overlap.txt.

grep -Ff file1.txt file2.txt > overlap.txt

##### Remove duplicate lines #####
# Sorts overlap.txt and keeps one copy of each distinct line.

sort -u overlap.txt > result.txt

##### Count how often a specific string occurs in a file #####
# grep -o prints every match on its own line, so wc -l counts matches
# (a line containing the string twice is counted twice).

grep -o 'TTCAGCCGCTACCCC' findmotif-sg2-0-R1.fa | wc -l

##### Delete every line containing a given string #####
# Edits grep5-10-30.txt in place.

sed -i '/TTCAGCCGCTACCCC/d' grep5-10-30.txt

##### Insert the string JJC as a new line after every N-th line #####
# N is a placeholder: replace it with the desired interval before running.
# The first~step address form (0~N) is a GNU sed extension.

sed '0~N s/$/\nJJC/' file.txt > result.txt

##### Append JJC to the end of every N-th line #####
# N is a placeholder here as well.

sed '0~N s/$/JJC/' inputfile > outputfile

##### Put a generated header line in front of every line (typically to build a FASTA file) #####
# Headers are ">ENLISH" followed by a counter that starts at 1000 and grows
# by 2 per input line.

awk '
  BEGIN { id = 1000 }
  {
    print ">ENLISH" id
    print $0
    id += 2
  }
' chr.fa > chr.all.fa

##### Find motifs with MEME #####
# Searches 5-10-30.fa (DNA alphabet) for 2 motifs of width >= 4 with the
# "oops" model and writes the results to the directory 5-10-30.
# The -dna flag must be separated from the input file name by a space;
# otherwise MEME looks for a file called "5-10-30.fa-dna".

meme 5-10-30.fa -dna -o 5-10-30 -nmotifs 2 -mod oops -minw 4

#### Delete the last two characters of every line ####
# Reads standard input; lines shorter than two characters are left as-is.

sed 's/..$//'

#### Delete the first two characters of every line (each "." matches one character) ####
# -i edits the file in place, so a file operand is required; file.txt is a
# placeholder for the file to modify. The g flag was dropped because a
# pattern anchored with ^ can match at most once per line.

sed -i 's/^..//' file.txt

你可能感兴趣的:(测序后直接对fastq进行分析)