生信常用基础语法及技巧

我喜欢的配色
常规作图
一、移动目录下所有文件

# Move every BAM, BAI and gzipped FASTQ file below the current directory into
# its ChIP-seq folder. The targets are created first: when a target directory
# is missing, `mv FILE target` renames FILE to `target`, so each file would
# overwrite the previous one. The trailing slash makes mv fail instead.
mkdir -p ~/ChIP_seq/bam ~/ChIP_seq/fastq
find . -name "*.bam" -type f -exec mv {} ~/ChIP_seq/bam/ \;
find . -name "*.bai" -type f -exec mv {} ~/ChIP_seq/bam/ \;
find . -name "*.fastq.gz" -type f -exec mv {} ~/ChIP_seq/fastq/ \;

二、批量重命名

# Replace the trailing "_all_indel.bed" with "_all_indel.vcf".
# Globs are iterated directly (no parsing of `ls`) and every expansion is
# quoted, so names containing spaces or glob characters are handled.
for id in *_all_indel.bed; do
  [ -e "$id" ] || continue  # the glob matched nothing
  mv -- "$id" "${id%_all_indel.bed}_all_indel.vcf"
done
# Strip everything up to and including the first underscore.
for id in *fastq.gz; do
  [ -e "$id" ] || continue
  mv -- "$id" "${id#*_}"
done
# Rename the .fq.gz extension to .fastq.gz.
for id in *.fq.gz; do
  [ -e "$id" ] || continue
  mv -- "$id" "${id%.fq.gz}.fastq.gz"
done

# Delete the gzipped files of samples YM-1_S1 ... YM-24_S24.
for i in {1..24}; do
  # The glob stays outside the quotes so the shell still expands it.
  rm "YM-${i}_S${i}"*.gz
done

三、awk筛选提取大文件
替换fasta header

# Rewrite FASTA headers: split each ">" line on "|", turn the underscores of
# the first field into spaces and drop the first character of the second
# field. Sequence lines are copied through unchanged.
# NOTE: the output name keeps the original spelling "renmae" so that anything
# referring to that file keeps working.
awk '
  /^>/ {
    split($0, a, "|")
    gsub(/_/, " ", a[1])
    printf("%s|%s\n", a[1], substr(a[2], 2))
    next
  }
  { print }
' "${mouse_fa_dir}/mouse.lncRNA.fa" > "${mouse_fa_dir}/mouse.lncRNA.renmae.fa"

提取特定的reads,并替换bam文件的header

# Split a mixed mouse/human BAM by species and strip the species prefix from
# the contig names.
# NOTE(review): the \" and \$ escapes are kept from the original; they look
# intended for embedding these commands in a double-quoted string (e.g. a
# generated job script) - drop the backslashes to run them directly.
# The alternation is grouped, /^mouse_chr([1-9]|X|Y|M)/. The previous
# /^mouse_chr[1-9]|X|Y|M/ matched any third field that merely contained X, Y
# or M, which let through the other species' chrX/chrY/chrM reads and printed
# header lines such as @RG twice.
samtools view -h -@ ${thread} ${dir}/${i} | awk 'BEGIN{FS=OFS=\"\t\"} (/^@/ && !/@SQ/){print \$0} \$2~/^SN:mouse_chr([1-9]|X|Y|M)/{print \$0}  \$3~/^mouse_chr([1-9]|X|Y|M)/{print \$0} ' | sed 's/mouse_chr/chr/g' | samtools view -bS - >${mouse_dir}/${i%.bam*}.mouse.bam
samtools view -h -@ ${thread} ${dir}/${i} | awk 'BEGIN{FS=OFS=\"\t\"} (/^@/ && !/@SQ/){print \$0} \$2~/^SN:human_chr([1-9]|X|Y|M)/{print \$0}  \$3~/^human_chr([1-9]|X|Y|M)/{print \$0} ' | sed 's/human_chr/chr/g' | samtools view -bS - >${human_dir}/${i%.bam*}.human.bam

四、wget 用法
你应该了解的所有wget命令
五、压缩与解压缩
压缩与解压缩
六、文件夹大小
Linux查看文件和文件夹大小

# Print human-readable sizes, descending at most one directory level.
du -h --max-depth=1

七、 统计文件数目
统计文件数目2

ls -l | grep "^-" | wc -l  # number of regular files in the current directory
# Line, word and byte counts of file1. The options must follow the dash
# directly: "wc - lcw file1" reads stdin and looks for a file named "lcw".
wc -lcw file1

八、csv文件转换
删除quote

ml csvkit
# Convert the CSV to tab-delimited output (-T). Write to a new file: the shell
# truncates a redirection target before the command starts, so redirecting
# onto the input file would empty it before csvformat reads it.
csvformat -T human_pc_cor.csv > human_pc_cor.tsv

九、linux for 循环

例子1
for i in Fed1 Fed2 Fed3 Fasting5 Fasting6 Fasting7
do
  # Sort each BED file by the first column, then numerically by the second.
  # sort reads the file itself, so no cat or echo wrapper is needed.
  sort -k1,1 -k2,2n "${data_dir}/${i}/${i}.m6A_output.bed" > "${out_dir}/${i}.m6a.sorted.bed"
done

例子2
# Write and submit one Slurm job per consecutive sample pair
# (S1/S2, S3/S4, ... S15/S16).
for i in {1..16..2}
do
  j=$((i + 1))
  job_file="${job_dir}/S${i}_S${j}.footprinting_diff.job"

  # Unquoted heredoc: ${...} is expanded now, while \${...} is written out as
  # a literal ${...} and expanded when the job runs. A backslash at the end of
  # a line joins it with the next one in the generated file.
  cat > "$job_file" <<EOF
#!/bin/bash
#SBATCH --job-name=S${i}_S${j}.footprinting_diff
#SBATCH --output=${log_dir}/S${i}_S${j}.footprinting_diff.out
#SBATCH --time=10:00:00
#SBATCH --gres=lscratch:20
#SBATCH --cpus-per-task=${threads}
#SBATCH --mem=100g
module load rgt
diff_out_dir_sample=$diff_out_dir/S${i}_S${j}
mkdir -p \${diff_out_dir_sample}
rgt-hint differential --organism=mm10 --bc --nc 30 --mpbs-files=${match_out_dir}/CJ5439_S${i}_mpbs.bed,${match_out_dir}/CJ5439_S${j}_mpbs.bed \
--reads-files=${bam_dir}/CJ5439_S${i}.sort.bam,${bam_dir}/CJ5439_S${j}.sort.bam --conditions=CJ5439_S${i},CJ5439_S${j} \
--output-location=\${diff_out_dir_sample} --output-prefix=S${i}_S${j}
EOF
  sbatch "$job_file"
done

各类文件转换

R语言常用

R语言使用的一些技巧
R语言数据科学

Bugs

1.org.Hs.eg.db
Error: package or namespace load failed for ‘org.Hs.eg.db’:
 .onLoad failed in loadNamespace() for 'org.Hs.eg.db', details:
  call: l$contains
  error: $ operator is invalid for atomic vectors

加上options(connectionObserver = NULL)就ok

Error in UseMethod("select") : 
  no applicable method for 'select' applied to an object of class "c('OrgDb', 'AnnotationDb', 'envRefClass', '.environment', 'refClass', 'environment', 'refObject', 'AssayData')"
使用:
AnnotationDbi::select(org.Hs.eg.db, keys=as.character(human_fraction_df_sig_genes), columns=c("SYMBOL","ENTREZID"), keytype="SYMBOL") 

# NOTE: rm(list = ls()) only removes objects from the global environment;
# attached packages and options survive, so restart R for a truly clean session.
rm(list = ls())
# Spell out FALSE: F is an ordinary variable that can be reassigned.
options(stringsAsFactors = FALSE)
options(scipen = 200) # avoid scientific notation when printing numbers

# Converting between variable names and values
get()# returns the value of the variable whose name is given as a string
assign()# assigns a value to the variable named by a string, e.g. assign(x_name, read.table(file_name))
substitute()# captures its argument unevaluated; deparse(substitute(x)) gives the variable name as a string
# String replacement
gsub("Efg", "AAA", text) # replace "Efg" with "AAA" (case-sensitive)

# Capitalize the first letter of each word
suppressMessages(library(stringr))
str_to_title(your_data, locale = "")
# Remove the first two and the last two characters
gsub('^..|..$', '', x)

1、获取目录下文件

# Read every .tsv file of the working directory into the global environment,
# one data frame per file, named after the file without its extension.
# `pattern` is a regular expression, not a shell glob, so the dot is escaped
# and the match is anchored to the end of the file name.
mytsvfile <- list.files(pattern = "\\.tsv$")
# File names with the extension removed.
files <- sub("\\.tsv$", "", mytsvfile)
list2env(
  lapply(
    setNames(mytsvfile, make.names(files)),
    read.table,
    header = TRUE, row.names = 1, check.names = FALSE, skip = 1
  ),
  envir = .GlobalEnv
)

2、随机取数

# NOTE(review): combn(x, m) enumerates all size-m combinations of x; it does
# not draw at random - sample() does that.
combn()

3、批量运算内置文件输出时,input为数字(等待优化)

library(parallel)
# Apply poly_met to every index of com_list on a 40-worker cluster.
cl <- makeCluster(40)
results <- tryCatch(
  # seq_along() yields an empty sequence for an empty com_list, whereas
  # 1:length() would yield c(1, 0).
  parSapply(cl, seq_along(com_list), poly_met),
  # Release the workers even when poly_met fails.
  finally = stopCluster(cl)
)
message("Finish")

4、分割

# Keep the part of each identifier before the first ".".
# vapply() always returns a character vector, even for empty input.
human_overlap_lnc <- vapply(
  genelist,
  function(x) strsplit(x, split = ".", fixed = TRUE)[[1]][1],
  character(1),
  USE.NAMES = FALSE
)
# First "|"-separated field of `gene` (the stray empty argument is removed).
unlist(strsplit(gene, "\\|"))[1]

5、替换和截取字符

# Relabel "Fed" as "AL" in the sample names, then derive the group by dropping
# the last character of each (relabelled) sample name.
human_rnaseq_pick$sample <- gsub(
  "Fed", "AL", as.character(human_rnaseq_pick$sample)
)
human_rnaseq_pick$group <- substr(
  human_rnaseq_pick$sample, 1, nchar(human_rnaseq_pick$sample) - 1
)


5、ggplot2 相关
ggplot2——坐标系篇
截断

# 1. Wrap long axis labels with the scales package
suppressMessages(library(scales))
p <- ggplot(data = yourdata, aes(x = x, y = y))
p <- p + scale_y_discrete(labels = wrap_format(50)) # wrap y labels at 50 characters

# 2. Blank background. theme_classic() is a complete theme: adding it replaces
#    every theme() setting made before it, so it has to come before the axis
#    text tweaks below. theme_bw() + theme_classic() was redundant because the
#    last complete theme wins.
p <- p + theme_classic()

# 3. x-axis label size, font family, colour, face, justification and rotation
p <- p + theme(axis.text.x = element_text(size = 15, family = "myFont", color = "green", face = "bold", vjust = 1, hjust = 1, angle = 45))
#    Rotation only
p <- p + theme(axis.text.x = element_text(vjust = 1, hjust = 1, angle = 45))

# 4. Save
filename <- file.path(your_dir, "signature.pdf")
ggsave(filename, p, units = "in", width = 6, height = 4.5, device = "pdf")

# 5. Remove the x-axis title, labels and ticks (add this to a plot with `+`)
theme(
  axis.title.x = element_blank(),
  axis.text.x = element_blank(),
  axis.ticks.x = element_blank()
)
# 7. Broken y axis: draw the same box plot twice with different y windows and
#    stack the two panels.
# Freeze the factor levels in order of first appearance.
human_new_df$type <- factor(human_new_df$type, levels = unique(human_new_df$type))
human_new_df$treatment <- factor(human_new_df$treatment, levels = unique(human_new_df$treatment))
# Lower panel: y from 0 to 160, rotated x labels.
p1 <- ggboxplot(human_new_df,
  x = "type", y = "polya_length",
  palette = c("#4DBBD5FF", "#E64B35FF"), fill = "treatment", legend = "none",
  ylab = "Poly(A) length"
) +
  coord_cartesian(ylim = c(0, 160)) +
  theme(axis.text.x = element_text(vjust = 1, hjust = 1, angle = 45))
# Upper panel: y from 600 to 800, x-axis decorations removed.
# breaks = seq(600, 800, 100) gives ticks at 600, 700 and 800; the previous
# c(600, 800, 100) listed 100 as a break, which lies outside this window.
p2 <- ggboxplot(human_new_df,
  x = "type", y = "polya_length",
  palette = c("#4DBBD5FF", "#E64B35FF"), fill = "treatment"
) +
  labs(x = NULL, y = NULL, fill = NULL) +
  theme(axis.text.x = element_blank(), axis.ticks.x = element_blank(), axis.line.x = element_blank()) +
  coord_cartesian(ylim = c(600, 800)) +
  scale_y_continuous(breaks = seq(600, 800, 100))
ggarrange(p2, p1, heights = c(1 / 8, 7 / 8), ncol = 1, nrow = 2, common.legend = TRUE, legend = "top", align = "v")

6、ggpubr 相关问题
固定组别顺序,使出的图按照固定顺序排列

# Freeze the Group levels in order of first appearance so plots keep that order
signature$Group <- with(signature, factor(Group, levels = unique(Group)))

ggbarplot组别分开

ggpubr加p

ggpubr 统计相关问题
有关两组之间的比较如何加星号

生信常用基础语法及技巧_第1张图片
image.png

注意图中加深部位不能有双引号,特别在批量定义函数时候,要用原始值不能用函数
我的例子

例子1
  # Example 1 (uses i, j, data_dir and new_ccle_data from the surrounding code).
  # Reshape to long format: every column except pick_levels becomes a
  # (Gene_name, value) pair. pivot_longer() replaces the superseded gather().
  new_data <- new_ccle_data %>%
    pivot_longer(cols = -pick_levels, names_to = "Gene_name", values_to = "expression_levels")
  colnames(new_data) <- c("Levels", "Gene_name", "Relative_expression_levels")
  filename <- paste0(data_dir, "/", j, "_expression_levels_in_", i, "_Median.pdf")
  p <- ggboxplot(new_data,
    x = "Gene_name", y = "Relative_expression_levels",
    color = "Levels", palette = "jco",
    shape = "Levels", group = "Levels"
  )
  # Compare the Levels groups within each gene and label with significance stars.
  p <- p + stat_compare_means(aes(group = Levels), label = "p.signif", hide.ns = FALSE, paired = FALSE, method = "t.test")
  # ggsave() opens and closes the PDF device itself, so a plotting error
  # cannot leave a device open the way pdf() ... dev.off() can.
  ggsave(filename, p, width = 3.5, height = 5, units = "in", device = "pdf")

# Example 2: bar plot of Relative_Exp per New_name, split by Treatment
data <- left_join(degs, pcr, by = "New_name")
# Drop rows without a Relative_Exp value.
data <- data[!is.na(data$Relative_Exp), ]
p <- ggbarplot(
  data,
  x = "New_name", y = "Relative_Exp",
  color = "Treatment", fill = "Treatment", alpha = 0.2,
  palette = c("#4DBBD5", "#E64B35"),
  ylab = "Relative Expression Levels",
  position = position_dodge(0.9),
  add = c("mean_se", "jitter"), add.params = list(size = 0.5)
)
# Compare the Treatment groups within each New_name; label with significance stars.
p <- p + stat_compare_means(
  aes(group = Treatment),
  label = "p.signif", hide.ns = FALSE, paired = FALSE, method = "t.test"
)
p <- p + theme(
  axis.text.x = element_text(size = 7, vjust = 0.7, hjust = 0.7, angle = 45)
)

8、tidyverse使用
R语言长短数据转换

suppressMessages(library(tidyverse))

 #1.筛选
rt <- df %>% filter(pvalue < 0.05) # keep rows with pvalue below 0.05

# 2. Wide -> long: stack every column except the identifier column(s) into a
#    name column and a value column. The two output names must differ.
#    `不变的列` is a placeholder for the column(s) that stay fixed.
rt <- df %>% pivot_longer(cols = -不变的列, names_to = "New_name", values_to = "New_value")

# 3. Long -> wide: spread a name/value pair back into columns. Identifier
#    columns are passed through id_cols; spread() has no exclusion argument
#    and treated a third positional argument as `fill`.
rt <- df %>% pivot_wider(id_cols = 不变的列, names_from = "old_name", values_from = "old_value")

# 4. Join
combind <- to_df %>%
  left_join(df, by = "pathway")

# 5. Sort by several keys
out_tab <- arrange(out_tab, desc(year), desc(month), desc(day))
arrange(out_tab, match(year, c("C", "A", "B")), desc(Res), desc(Pop)) # custom order for `year`
# 6. Pipe summary statistics straight into a plot: the percentage of each
#    orig.ident within every seurat_clusters group.
data %>%
  count(seurat_clusters, orig.ident) %>%
  group_by(seurat_clusters) %>%
  mutate(percent = 100 * n / sum(n)) %>%
  ungroup() %>%
  ggbarplot(
    x = "seurat_clusters", y = "percent",
    fill = "orig.ident", color = "black", palette = "jco"
  )

9、获取GO Term的子孙后代

go_term <- read.csv("~/GO_BP_superfamily.csv", header = FALSE)
suppressMessages(library("plyr"))
suppressMessages(library(GO.db))
# Build one single-column data frame per row of go_term: column 2 holds the GO
# ID looked up in GOBPOFFSPRING, column 1 the label used as the column name.
# lapply() avoids growing a list in a loop and does not shadow base::list;
# seq_len() is safe when go_term has no rows.
offspring_tables <- lapply(seq_len(nrow(go_term)), function(i) {
  offspring <- as.list(GOBPOFFSPRING[as.character(go_term[i, 2])])
  term_col <- as.matrix(offspring[[1]])
  colnames(term_col) <- as.character(go_term[i, 1])
  data.frame(term_col)
})
# rbind.fill() stacks the tables and fills columns a table lacks with NA.
df <- rbind.fill(offspring_tables)
write.csv(df, "~/GO_BP_superfamily_Offspring.csv")

10、返回最大值列名
参考
参考2

library(tidyverse)
library(purrrlyr)

#' Names of the element(s) holding the maximum value
#'
#' @param row.data A named numeric vector or a one-row data frame.
#' @return A single string: the names of all elements equal to the maximum,
#'   joined by ",". Missing values are ignored; the result is "" when every
#'   value is missing or the input is empty.
row_handler <- function(row.data) {
  # Without this guard max(..., na.rm = TRUE) would warn and return -Inf.
  if (all(is.na(row.data))) {
    return("")
  }
  # na.rm = TRUE: a single NA used to turn the maximum into NA, which made the
  # comparison match nothing and silently returned "".
  index <- which(row.data == max(row.data, na.rm = TRUE))
  paste(names(row.data[index]), collapse = ",")
}
# Apply row_handler to every row and store the result in "major_pathway".
major_path <- by_row(
  nafld_new_df,
  ..f = row_handler,
  .collate = "rows",
  .to = "major_pathway"
)

11、批量设组

library(Hmisc)
# Derive the group from the sample name: keep "fed" or "fasting" and drop
# whatever follows it. (The original line had one closing parenthesis too
# many, which is a syntax error.)
group <- factor(
  gsub("(fed|fasting).*", "\\1", as.character(rt$Sample)),
  levels = c("fed", "fasting")
)
treatment <- as.character(group)
# capitalize() (from Hmisc) upper-cases the first letter.
data_pick$Treatment <- capitalize(treatment)

12、提取两字符间文字
如何在R中的两个字符之间提取文本
笨方法分割两次,比如说我要提取>和空格之间的数据,分两步切割,第一次提取空格之前的,第二步提取>之后的,写个函数就行

#' Extract the text between ">" and the first space
#'
#' Works in two passes over the first column of `rt`: keep what precedes the
#' first space, then take the piece that follows the first ">".
#'
#' @param rt A data frame or matrix whose first column holds the strings.
#' @return A character vector with one element per row; NA where the string
#'   contains no ">" before the first space. Always a character vector, also
#'   for input without rows.
get_list <- function(rt) {
  # n-th piece of each string after splitting on the literal separator `sep`.
  nth_piece <- function(x, sep, n) {
    vapply(
      as.character(x),
      function(s) strsplit(s, split = sep, fixed = TRUE)[[1]][n],
      character(1),
      USE.NAMES = FALSE
    )
  }
  before_space <- nth_piece(rt[, 1], " ", 1)
  nth_piece(before_space, ">", 2)
}

13、去除NA值

# Alternative to na.omit(): keep only the rows without NA in columns 1 to 5
rna_deg <- rna_deg[rowSums(is.na(rna_deg[, 1:5])) == 0, ]

14、统计方法
Mann-Whitney

15、overlap不使用intersect
intersect 做overlap会去掉重复值所以可以用:

b[is.element(b, a)] # elements of b that also occur in a (the closing bracket was missing)
# Elements of `b` that also occur in `a`; repeated values of `b` are kept.
get_list <- function(a, b) {
  b[b %in% a]
}

16、返回ENTREZID
suppressMessages(library(clusterProfiler))
suppressMessages(library(tidyverse))
suppressMessages(library(org.Hs.eg.db))
suppressMessages(library(scales))
#' Map gene symbols to Entrez IDs
#'
#' @param list Gene symbols (coerced to character); duplicates are ignored.
#' @return The Entrez IDs as a character vector, one per distinct symbol, with
#'   missing IDs removed by na.omit().
get_gene_list <- function(list) {
  genes <- unique(as.character(list))
  # Call AnnotationDbi::select() explicitly: with dplyr attached, a bare
  # select() can dispatch to dplyr::select() and fail with
  # "no applicable method for 'select'" on the OrgDb object.
  gene_list <- AnnotationDbi::select(
    org.Hs.eg.db,
    keys = genes, columns = c("SYMBOL", "ENTREZID"), keytype = "SYMBOL"
  )
  # Keep the first Entrez ID when a symbol maps to several.
  gene_list <- gene_list[!duplicated(gene_list$SYMBOL), ]
  na.omit(as.character(gene_list$ENTREZID))
}

Snakemake注意点
1.转义问题

生信常用基础语法及技巧_第2张图片
转义字符

你可能感兴趣的:(生信常用基础语法及技巧)