在任意文件夹下面创建 folder1~5这5个文件夹,然后每个文件夹下面继续创建 folder1~5这5个文件夹
zy@VM-0-15-ubuntu:~/tmp$ mkdir -p folder_{1..5}/folder_{1..5}
zy@VM-0-15-ubuntu:~$ tree tmp
tmp
├── folder_1
│ ├── folder_1
│ ├── folder_2
│ ├── folder_3
│ ├── folder_4
│ └── folder_5
├── folder_2
│ ├── folder_1
│ ├── folder_2
│ ├── folder_3
│ ├── folder_4
│ └── folder_5
├── folder_3
│ ├── folder_1
│ ├── folder_2
│ ├── folder_3
│ ├── folder_4
│ └── folder_5
├── folder_4
│ ├── folder_1
│ ├── folder_2
│ ├── folder_3
│ ├── folder_4
│ └── folder_5
└── folder_5
├── folder_1
├── folder_2
├── folder_3
├── folder_4
└── folder_5
在第五题创建的每一个文件夹下面都 创建第二题文本文件 me.txt ,内容也要一样。
zy@VM-0-15-ubuntu:~$ vim txt.sh
#!/bin/bash
#
# txt.sh - write the same me.txt into every folder below ~/tmp:
# each first-level folder_N (N = 1..5) and each nested folder_N/folder_M
# (M = 1..5), 30 files in total.
#
# Notes on the implementation:
#   * `echo -e:"..."` is not a valid option string, so bash echo printed the
#     literal text "-e:" and left the \n escapes unexpanded. printf '%b\n'
#     expands the escapes reliably and ends the file with a newline.
#   * The first-level folders get a me.txt as well, not only the nested ones.
#   * Files are written through their full path instead of `cd`-ing around,
#     so a missing directory can no longer redirect the output into the
#     current working directory.

base_dir=~/tmp

# Message text kept exactly as originally written (including the spaces
# around the \n escapes); %b below turns each \n into a real line break.
me_text="Go to: http://www.biotrainee.com/ \n I love bioinfomatics.\n And you ?"

for i in {1..5}; do
  # First entry is the first-level folder itself, followed by its 5 subfolders.
  for target in "${base_dir}/folder_${i}" "${base_dir}/folder_${i}"/folder_{1..5}; do
    # -p: no error if the folder already exists; create it if it does not.
    mkdir -p -- "${target}" || exit 1
    printf '%b\n' "${me_text}" > "${target}/me.txt" || exit 1
  done
done
##按下Esc :wq
zy@VM-0-15-ubuntu:~$ tree tmp
tmp
├── folder_1
│ ├── folder_1
│ │ └── me.txt
│ ├── folder_2
│ │ └── me.txt
│ ├── folder_3
│ │ └── me.txt
│ ├── folder_4
│ │ └── me.txt
│ ├── folder_5
│ │ └── me.txt
│ └── me.txt
├── folder_2
│ ├── folder_1
│ │ └── me.txt
│ ├── folder_2
│ │ └── me.txt
│ ├── folder_3
│ │ └── me.txt
│ ├── folder_4
│ │ └── me.txt
│ ├── folder_5
│ │ └── me.txt
│ └── me.txt
├── folder_3
│ ├── folder_1
│ │ └── me.txt
│ ├── folder_2
│ │ └── me.txt
│ ├── folder_3
│ │ └── me.txt
│ ├── folder_4
│ │ └── me.txt
│ ├── folder_5
│ │ └── me.txt
│ └── me.txt
├── folder_4
│ ├── folder_1
│ │ └── me.txt
│ ├── folder_2
│ │ └── me.txt
│ ├── folder_3
│ │ └── me.txt
│ ├── folder_4
│ │ └── me.txt
│ ├── folder_5
│ │ └── me.txt
│ └── me.txt
└── folder_5
├── folder_1
│ └── me.txt
├── folder_2
│ └── me.txt
├── folder_3
│ └── me.txt
├── folder_4
│ └── me.txt
├── folder_5
│ └── me.txt
└── me.txt
#如何查看echo的帮助信息
/bin/echo --help
下载 http://www.biotrainee.com/jmzeng/igv/test.bed 文件,后在里面选择含有 H3K4me3 的那一行是第几行,该文件总共有几行。
zy@VM-0-15-ubuntu:~/tmp$ wget -c http://www.biotrainee.com/jmzeng/igv/test.bed
zy@VM-0-15-ubuntu:~/tmp$ grep -n H3K4me3 test.bed
8:chr1 9810 10438 ID=SRX387603;Name=H3K4me3%20(@%20HMLE);Title=GSM1280527:%20HMLE%20Twist3D%20H3K4me3%20rep2%3B%20Homo%20sapiens%3B%20ChIP-Seq;Cell%20group=Breast;
source_name=HMLE_Twist3D_H3K4me3;cell%20type=human%20mammary%20epithelial%20cells;transfected%20with=Twist1;culture%20type=sphere;chip%20antibody=H3K4me3;chip%20antibody%20vendor=Millipore; 222 . 9810 10438 0,226,255
zy@VM-0-15-ubuntu:~/tmp$ wc -l test.bed
10 test.bed
下载 http://www.biotrainee.com/jmzeng/rmDuplicate.zip 文件,并且解压,查看里面的文件夹结构
zy@VM-0-15-ubuntu:~/tmp$ wget -c http://www.biotrainee.com/jmzeng/rmDuplicate.zip
zy@VM-0-15-ubuntu:~/tmp$ unzip rmDuplicate.zip
zy@VM-0-15-ubuntu:~/tmp$ ls
rmDuplicate rmDuplicate.zip test.bed
zy@VM-0-15-ubuntu:~/tmp$ tree rmDuplicate
rmDuplicate
├── picard
│ ├── paired
│ │ ├── readme.txt
│ │ ├── tmp.header
│ │ ├── tmp.MarkDuplicates.log
│ │ ├── tmp.metrics
│ │ ├── tmp.rmdup.bai
│ │ ├── tmp.rmdup.bam
│ │ ├── tmp.sam
│ │ └── tmp.sorted.bam
│ └── single
│ ├── readme.txt
│ ├── tmp.header
│ ├── tmp.MarkDuplicates.log
│ ├── tmp.metrics
│ ├── tmp.rmdup.bai
│ ├── tmp.rmdup.bam
│ ├── tmp.sam
│ └── tmp.sorted.bam
└── samtools
├── paired
│ ├── readme.txt
│ ├── tmp.header
│ ├── tmp.rmdup.bam
│ ├── tmp.rmdup.vcf.gz
│ ├── tmp.sam
│ ├── tmp.sorted.bam
│ └── tmp.sorted.vcf.gz
└── single
├── readme.txt
├── tmp.header
├── tmp.rmdup.bam
├── tmp.rmdup.vcf.gz
├── tmp.sam
├── tmp.sorted.bam
└── tmp.sorted.vcf.gz
进入 rmDuplicate/samtools/single 文件夹里面,查看后缀为 .sam 的文件,搞清楚 生物信息学里面的SAM/BAM 定义是什么。
zy@VM-0-15-ubuntu:~$ cd ~/tmp/rmDuplicate/samtools/single
zy@VM-0-15-ubuntu:~/tmp/rmDuplicate/samtools/single$ less -S tmp.sam
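# SAM 文件是序列比对结果的标准文本格式;BAM 是 SAM 的二进制压缩版本,占用的磁盘空间更小。SAM 包括头部注释信息(以 @ 开头的行)和比对记录,每条比对记录有 11 个必需字段(其后可以跟可选的 TAG 字段),以 Tab 键分隔。各列依次为:QNAME、FLAG、RNAME、POS、MAPQ、CIGAR、RNEXT、PNEXT、TLEN、SEQ、QUAL。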
打开 后缀为BAM 的文件,找到产生该文件的命令。
zy@VM-0-15-ubuntu:~/tmp/rmDuplicate/samtools/single$ samtools view tmp.rmdup.bam
zy@VM-0-15-ubuntu:~/tmp/rmDuplicate/samtools/single$ ls
readme.txt tmp.header tmp.rmdup.bam tmp.rmdup.vcf.gz tmp.sam tmp.sorted.bam tmp.sorted.vcf.gz
# 找到tmp.header查看
zy@VM-0-15-ubuntu:~/tmp/rmDuplicate/samtools/single$ tail -n 3 tmp.header
@SQ SN:chrY LN:57227415
@SQ SN:chrY_KI270740v1_random LN:37240
@PG ID:bowtie2 PN:bowtie2 VN:2.2.9 CL:"/home/jianmingzeng/biosoft/bowtie/bowtie2-2.2.9/bowtie2-align-s --wrapper basic-0 -p 20 -x /home/jianmingzeng/reference/index/bowtie/hg38 -S /home/jianmingzeng/data/public/allMouse/alignment/WT_rep2_Input.sam -U /tmp/41440.unp"
# 其中CL字样处就是要找的命令
根据上面的命令,找到我使用的参考基因组 /home/jianmingzeng/reference/index/bowtie/hg38 具体有多少条染色体。
# 头文件的 @SQ的注释信息就是参考基因组的注释信息。
zy@VM-0-15-ubuntu:~/tmp/rmDuplicate/samtools/single$ grep -o -E "SN:chr[0-9]+|SN:chr[a-z A-Z]+" tmp.header|sort |uniq
zy@VM-0-15-ubuntu:~/tmp/rmDuplicate/samtools/single$ grep -o -E "SN:chr[0-9]+|SN:chr[a-z A-Z]+" tmp.header|sort |uniq|wc -l
26
上面的后缀为BAM 的文件的第二列,只有 0 和 16 两个数字,用 cut/sort/uniq等命令统计它们的个数。
zy@VM-0-15-ubuntu:~/tmp/rmDuplicate/samtools/single$ samtools view tmp.rmdup.bam |cut -f 2 |sort -n |uniq -dc
16 0
12 16
重新打开 rmDuplicate/samtools/paired 文件夹下面的后缀为BAM 的文件,再次查看第二列,并且统计
zy@VM-0-15-ubuntu:~/tmp/rmDuplicate/samtools/paired$ samtools view tmp.rmdup.bam |cut -f 2 |sort -n |uniq -dc
2 83
2 97
8 99
7 147
2 163
下载 http://www.biotrainee.com/jmzeng/sickle/sickle-results.zip 文件,并且解压,查看里面的文件夹结构
zy@VM-0-15-ubuntu:~/tmp$ wget -c http://www.biotrainee.com/jmzeng/sickle/sickle-results.zip
zy@VM-0-15-ubuntu:~/tmp$ ls -lh
total 2.4M
drwxrwxr-x 4 zy zy 4.0K Nov 12 2016 rmDuplicate
-rw-rw-r-- 1 zy zy 103K Nov 12 2016 rmDuplicate.zip
-rw-rw-r-- 1 zy zy 2.3M Oct 6 2016 sickle-results.zip
-rw-rw-r-- 1 zy zy 3.1K May 18 2017 test.bed
解压 sickle-results/single_tmp_fastqc.zip 文件,并且进入解压后的文件夹,找到 fastqc_data.txt 文件,并且搜索该文本文件以 >>开头的有多少行?
zy@VM-0-15-ubuntu:~/tmp/sickle-results$ unzip single_tmp_fastqc.zip
zy@VM-0-15-ubuntu:~/tmp/sickle-results$ cd single_tmp_fastqc/
zy@VM-0-15-ubuntu:~/tmp/sickle-results/single_tmp_fastqc$ ls
fastqc_data.txt fastqc.fo fastqc_report.html Icons Images summary.txt
zy@VM-0-15-ubuntu:~/tmp/sickle-results/single_tmp_fastqc$ grep ^\>\> fastqc_data.txt |cat -n
1 >>Basic Statistics pass
2 >>END_MODULE
3 >>Per base sequence quality pass
4 >>END_MODULE
5 >>Per tile sequence quality pass
6 >>END_MODULE
7 >>Per sequence quality scores pass
8 >>END_MODULE
9 >>Per base sequence content fail
10 >>END_MODULE
11 >>Per sequence GC content warn
12 >>END_MODULE
13 >>Per base N content pass
14 >>END_MODULE
15 >>Sequence Length Distribution warn
16 >>END_MODULE
17 >>Sequence Duplication Levels pass
18 >>END_MODULE
19 >>Overrepresented sequences warn
20 >>END_MODULE
21 >>Adapter Content pass
22 >>END_MODULE
23 >>Kmer Content warn
24 >>END_MODULE
下载 http://www.biotrainee.com/jmzeng/tmp/hg38.tss 文件,去NCBI找到TP53/BRCA1等自己感兴趣的基因对应的 refseq数据库 ID,然后找到它们在 hg38.tss 文件的哪一行。(https://www.ncbi.nlm.nih.gov/gene/7157)
点击NM_001126113.2跳转到新页面
zy@VM-0-15-ubuntu:~/tmp$ grep NM_001126113 hg38.tss
NM_001126113 chr17 7685550 7689550 1
解析hg38.tss 文件,统计每条染色体的基因个数。
zy@VM-0-15-ubuntu:~/tmp$ cat hg38.tss |head
NR_046018 chr1 9874 13874 0
NR_024540 chr1 27370 31370 1
NR_104148 chr7 64664083 64668083 0
NR_111960 chrX 44871175 44875175 0
NR_028458 chr14 92104621 92108621 1
NR_028459 chr14 92104621 92108621 1
NR_026818 chr1 34081 38081 1
NR_026820 chr1 34081 38081 1
NR_026822 chr1 34081 38081 1
NM_001005484 chr1 67091 71091 0
zy@VM-0-15-ubuntu:~/tmp$ cat hg38.tss |cut -f2 >chr.txt
zy@VM-0-15-ubuntu:~/tmp$ less chr.txt
wc -l chr.txt
wc -l chr2.txt
less chr2.txt
zy@VM-0-15-ubuntu:~/tmp$ sort chr2.txt |uniq -dc
6157 chr1
2838 chr10
3577 chr11
3014 chr12
1133 chr13
1982 chr14
2377 chr15
2696 chr16
3794 chr17
883 chr18
5880 chr19
4090 chr2
1692 chr20
895 chr21
1410 chr22
3395 chr3
2277 chr4
2821 chr5
5782 chr6
2785 chr7
2221 chr8
2310 chr9
2 chrM
32 chrUn
2561 chrX
414 chrY
解析hg38.tss 文件,统计NM和NR开头的序列,了解NM和NR开头的含义。
zy@VM-0-15-ubuntu:~/tmp$ grep -o -E "NM|NR" hg38.tss |sort |uniq -dc
51064 NM
15954 NR
# NM_ 开头的是 RefSeq 中编码蛋白质的 mRNA 转录本序列; NR_ 开头的是非编码 RNA 转录本序列