我喜欢的配色
常规作图
一、移动目录下所有文件
# Recursively find regular files whose name matches the pattern, starting from
# the current directory, and move each one (one mv call per file) into the
# given target directory.
find . -name "*.bam" -type f -exec mv {} ~/ChIP_seq/bam \;
find . -name "*.bai" -type f -exec mv {} ~/ChIP_seq/bam \;
find . -name "*.fastq.gz" -type f -exec mv {} ~/ChIP_seq/fastq \;
二、批量重命名
# Rename every *_all_indel.bed file in the current directory to *_all_indel.vcf.
# The glob is iterated directly instead of parsing `ls` output, and every
# expansion is quoted, so file names containing spaces or glob characters
# are handled correctly.
for id in *_all_indel.bed
do
  [ -e "$id" ] || continue  # the pattern matched nothing
  mv -- "$id" "${id%_all_indel.bed}_all_indel.vcf"
done
# Strip everything up to and including the first "_" from each *fastq.gz name.
# The glob is iterated directly instead of parsing `ls` output, and every
# expansion is quoted, so unusual file names are handled correctly.
for id in *fastq.gz
do
  [ -e "$id" ] || continue  # the pattern matched nothing
  mv -- "$id" "${id#*_}"
done
# Change the extension of every *.fq.gz file to .fastq.gz.
# The glob is iterated directly instead of parsing `ls` output, and every
# expansion is quoted, so unusual file names are handled correctly.
for id in *.fq.gz
do
  [ -e "$id" ] || continue  # the pattern matched nothing
  mv -- "$id" "${id%.fq.gz}.fastq.gz"
done
# Delete the files matching YM-<i>_S<i>*.gz for i = 1..24.
for i in {1..24}
do
  rm YM-${i}_S${i}*.gz
done
三、awk筛选提取大文件
替换fasta header
# Rewrite FASTA headers: split each ">" line on "|", turn the underscores of the
# first field into spaces, and drop the first character of the second field.
# Sequence lines are passed through unchanged.
# NOTE(review): "renmae" in the output name looks like a typo for "rename"; it is
# kept as-is because other steps may refer to this exact path.
awk '
  /^>/ {
    split($0, field, "|")
    gsub(/_/, " ", field[1])
    printf("%s|%s\n", field[1], substr(field[2], 2))
    next
  }
  { print $0 }
' ${mouse_fa_dir}/mouse.lncRNA.fa>${mouse_fa_dir}/mouse.lncRNA.renmae.fa
提取特定的reads,并替换bam文件的header
# Split a mixed-species BAM into per-species BAMs: keep the non-@SQ header lines,
# the @SQ lines of the wanted species and the reads aligned to its contigs, then
# strip the species prefix from the contig names.
# NOTE(review): the \" and \$ escapes suggest these lines are meant to live inside
# a double-quoted string (e.g. a generated job script) — confirm before running
# them directly in a shell.
# The chromosome alternatives are grouped as chr([1-9]|X|Y|M). Ungrouped, the
# "|X|Y|M" branches matched any RNAME that merely contains X, Y or M, so reads
# on the other species' chrX/chrY/chrM leaked into the output.
samtools view -h -@ ${thread} ${dir}/${i} | awk 'BEGIN{FS=OFS=\"\t\"} (/^@/ && !/@SQ/){print \$0} \$2~/^SN:mouse_chr([1-9]|X|Y|M)/{print \$0} \$3~/^mouse_chr([1-9]|X|Y|M)/{print \$0} ' | sed 's/mouse_chr/chr/g' | samtools view -bS - >${mouse_dir}/${i%.bam*}.mouse.bam
samtools view -h -@ ${thread} ${dir}/${i} | awk 'BEGIN{FS=OFS=\"\t\"} (/^@/ && !/@SQ/){print \$0} \$2~/^SN:human_chr([1-9]|X|Y|M)/{print \$0} \$3~/^human_chr([1-9]|X|Y|M)/{print \$0} ' | sed 's/human_chr/chr/g' | samtools view -bS - >${human_dir}/${i%.bam*}.human.bam
四、wget 用法
你应该了解的所有wget命令
五、压缩与解压缩
压缩与解压缩
六、文件夹大小
Linux查看文件和文件夹大小
# Human-readable disk usage of the current directory, descending one level only
du -h --max-depth=1
七、 统计文件数目
统计文件数目2
# Number of regular files in the current directory (ls -l lines starting with "-")
ls -l | grep "^-" | wc -l
# Line, word and byte counts of file1. The options must be attached to the dash:
# "wc - lcw file1" reads stdin and treats "lcw" as a file name.
wc -lcw file1
八、csv文件转换
删除quote
# Load csvkit (NOTE(review): `ml` is presumably the environment-modules shorthand
# for `module load` — confirm on your cluster).
ml csvkit
# Convert the CSV to tab-delimited output. Write to a different file: the shell
# truncates the redirect target before csvformat starts, so redirecting onto the
# input file would empty it before it is read.
csvformat -T human_pc_cor.csv > human_pc_cor.tsv
九、linux for 循环
例子1
# Sort each sample's m6A BED file by chromosome, then numerically by start.
# sort reads the file directly; the former `echo $(cat ... | sort ... > out)`
# wrapper only printed an empty line, because sort's output was redirected.
for i in Fed1 Fed2 Fed3 Fasting5 Fasting6 Fasting7
do
  sort -k1,1 -k2,2n "${data_dir}/${i}/${i}.m6A_output.bed" > "${out_dir}/${i}.m6a.sorted.bed"
done
例子2
# Generate and submit one SLURM job per consecutive sample pair
# (S1/S2, S3/S4, ..., S15/S16): i takes the odd values 1..15 and j = i + 1.
for i in {1..16..2}
do
j=`expr ${i} + 1`
job_file="${job_dir}/S${i}_S${j}.footprinting_diff.job"
# Write the job script. Unescaped ${...} are expanded now, while the script is
# being generated; the escaped \${diff_out_dir_sample} is left for the job
# itself to expand when it runs.
echo "#!/bin/bash
#SBATCH --job-name=S${i}_S${j}.footprinting_diff
#SBATCH --output=${log_dir}/S${i}_S${j}.footprinting_diff.out
#SBATCH --time=10:00:00
#SBATCH --gres=lscratch:20
#SBATCH --cpus-per-task=${threads}
#SBATCH --mem=100g
module load rgt
diff_out_dir_sample=$diff_out_dir/S${i}_S${j}
mkdir -p \${diff_out_dir_sample}
rgt-hint differential --organism=mm10 --bc --nc 30 --mpbs-files=${match_out_dir}/CJ5439_S${i}_mpbs.bed,${match_out_dir}/CJ5439_S${j}_mpbs.bed \
--reads-files=${bam_dir}/CJ5439_S${i}.sort.bam,${bam_dir}/CJ5439_S${j}.sort.bam --conditions=CJ5439_S${i},CJ5439_S${j} \
--output-location=\${diff_out_dir_sample} --output-prefix=S${i}_S${j}
" > $job_file
# Submit the generated script
sbatch $job_file
done
各类文件转换
R语言常用
R语言使用的一些技巧
R语言数据科学
Bugs
1.org.Hs.eg.db
Error: package or namespace load failed for ‘org.Hs.eg.db’:
.onLoad failed in loadNamespace() for 'org.Hs.eg.db', details:
call: l$contains
error: $ operator is invalid for atomic vectors
加上options(connectionObserver = NULL)就ok
Error in UseMethod("select") :
no applicable method for 'select' applied to an object of class "c('OrgDb', 'AnnotationDb', 'envRefClass', '.environment', 'refClass', 'environment', 'refObject', 'AssayData')"
使用:
AnnotationDbi::select(org.Hs.eg.db, keys=as.character(human_fraction_df_sig_genes), columns=c("SYMBOL","ENTREZID"), keytype="SYMBOL")
# Start from an empty workspace, then set the session options in one call:
# keep strings as character vectors and effectively disable scientific notation.
# NOTE(review): rm(list = ls()) wipes whatever the caller had in the workspace.
rm(list = ls())
options(stringsAsFactors = FALSE, scipen = 200)
# Converting between variable names (strings) and values
get()# returns the value of the variable whose name is the given string
assign()# assigns a value to the variable named by a string, e.g. assign(x_name, read.table(file_name))
substitute()# captures the unevaluated expression passed as an argument; wrap it in deparse() to get the variable name as a string
# Replace text
gsub("Efg", "AAA", text) # replace every "Efg" with "AAA" (case-sensitive)
# Capitalise the first letter of each word
suppressMessages(library(stringr))
str_to_title(your_data, locale = "")
# Drop the first two and the last two characters
gsub('^..|..$', '', x)
1、获取目录下文件
# Read every .tsv file in the working directory into the global environment,
# one data frame per file, named after the file (extension removed and passed
# through make.names()).
# list.files() and sub() take regular expressions, not shell globs: "*.tsv"
# leaves the dot unescaped and unanchored, so names that merely contain "tsv"
# would also match.
mytsvfile <- list.files(pattern = "\\.tsv$")
tsv_names <- sub("\\.tsv$", "", mytsvfile)
list2env(
  lapply(
    setNames(mytsvfile, make.names(tsv_names)),
    read.table,
    # skip = 1 drops the first line of each file before the header is read
    header = TRUE, row.names = 1, check.names = FALSE, skip = 1
  ),
  envir = .GlobalEnv
)
# File names without the .tsv extension
files <- tsv_names
2、随机取数
combn()
3、批量运算内置文件输出时,input为数字(等待优化)
library(parallel)
# Run poly_met over the indices of com_list on a 40-worker cluster.
# seq_along() yields an empty sequence for an empty com_list (1:length() would
# yield c(1, 0)), and `finally` stops the cluster even if a worker call fails.
cl <- makeCluster(40)
results <- tryCatch(
  parSapply(cl, seq_along(com_list), poly_met),
  finally = stopCluster(cl)
)
message("Finish")
4、分割
# For every entry of genelist keep the part before the first "." (literal match)
human_overlap_lnc <- unlist(lapply(genelist, function(id) {
  pieces <- strsplit(id, split = ".", fixed = TRUE)[[1]]
  pieces[1]
}))
# First element of gene after splitting on "|"
unlist(strsplit(gene, "\\|"))[1]
5、替换和截取字符
# Relabel "Fed" as "AL" in the sample names, then derive the group by dropping
# the last character of each (already relabelled) sample name.
human_rnaseq_pick$sample <- gsub("Fed", "AL", as.character(human_rnaseq_pick$sample))
human_rnaseq_pick$group <- substr(
  human_rnaseq_pick$sample,
  1,
  nchar(human_rnaseq_pick$sample) - 1
)
5、ggplot2 相关
ggplot2——坐标系篇
截断
# 1. Wrap (and rotate) long axis labels; the wrapping uses the scales package
suppressMessages(library(scales))
p<-ggplot(data=yourdata,aes(x = x, y = y))
p<-p+scale_y_discrete(labels = wrap_format(50))# wrap y-axis labels at 50 characters
# 2. Set x-axis text size, font, colour, rotation, etc.
p<-p+theme(axis.text.x = element_text(size = 15, family = "myFont", color = "green", face = "bold", vjust = 1, hjust = 1, angle = 45))
p<-p+theme(axis.text.x = element_text( vjust = 1, hjust = 1, angle = 45))
# 3. Remove the background / use a blank background
p<-p+theme_bw()+theme_classic()# NOTE(review): theme_classic() is added last, so it presumably overrides theme_bw() — confirm
# 4. Save the plot as a 6 x 4.5 inch PDF
filename=paste0(your_dir,"/signature.pdf")
ggsave(filename,p,units = "in",width=6,height = 4.5,device = "pdf")
# 5. Remove the x-axis title, tick labels and tick marks
# (this theme() call is a standalone template; add it to a plot with `+`)
theme(axis.title.x=element_blank(),
axis.text.x=element_blank(),
axis.ticks.x=element_blank())
# 7. Broken (truncated) y axis: draw the same boxplot twice with different
# y windows and stack the two panels.
# Fix the factor level order to the order of first appearance.
human_new_df$"type"<-factor(human_new_df$type, levels=unique(human_new_df$type))
human_new_df$"treatment"<-factor(human_new_df$treatment, levels=unique(human_new_df$treatment))
# Lower panel: y window 0-160, no legend, x labels rotated 45 degrees.
p1<-ggboxplot(human_new_df, x = "type", y = "polya_length",
palette =c("#4DBBD5FF","#E64B35FF"),fill = "treatment",legend = "none",
ylab = "Poly(A) length")+coord_cartesian(ylim = c(0,160))+
theme(axis.text.x = element_text( vjust = 1, hjust = 1, angle = 45))
# Upper panel: y window 600-800, with x axis text, ticks and line removed.
# NOTE(review): breaks = c(600,800,100) puts a break at 100, outside the 600-800
# window; seq(600,800,100) or c(600,800) may have been intended — confirm.
p2<-ggboxplot(human_new_df, x = "type", y = "polya_length",
palette =c("#4DBBD5FF","#E64B35FF"),fill = "treatment")+
labs(x=NULL,y=NULL,fill=NULL) +
theme(axis.text.x = element_blank(),axis.ticks.x = element_blank(),axis.line.x= element_blank())+
coord_cartesian(ylim = c(600,800))+scale_y_continuous(breaks = c(600,800,100))
# Stack p2 above p1 (1/8 and 7/8 of the height) with one shared legend on top.
ggarrange(p2,p1,heights=c(1/8, 7/8),ncol = 1, nrow = 2,common.legend = TRUE,legend="top",align = "v")
6、ggpubr 相关问题
固定组别顺序,使出的图按照固定顺序排列
# Fix the group order: use the order of first appearance as the factor levels
signature$Group <- factor(signature$Group, levels=unique(signature$Group))
ggbarplot组别分开
ggpubr加p
ggpubr 统计相关问题
有关两组之间的比较如何加星号
注意图中加深部位不能有双引号,特别在批量定义函数时候,要用原始值不能用函数
我的例子
例子1
# Example 1: reshape the expression table to long format (one row per
# pick_levels / gene pair), then save a boxplot per gene, coloured by level,
# with a t-test significance label comparing the levels.
new_data <- gather(
  new_ccle_data,
  key = Gene_name, value = expression_levels, -pick_levels
)
colnames(new_data) <- c("Levels", "Gene_name", "Relative_expression_levels")
filename <- paste0(data_dir, "/", j, "_expression_levels_in_", i, "_Median.pdf")
pdf(filename, width = 3.5, height = 5)
p <- ggboxplot(
  new_data,
  x = "Gene_name", y = "Relative_expression_levels",
  color = "Levels", palette = "jco",
  shape = "Levels", group = "Levels"
) +
  stat_compare_means(
    aes(group = Levels),
    label = "p.signif", hide.ns = FALSE, paired = FALSE, method = "t.test"
  )
plot(p)
dev.off()
# Example 2: join the DEG table with the qPCR table, drop rows without a
# Relative_Exp value, and draw a dodged bar plot (mean +/- SE with jittered
# points) with a t-test significance label between treatments.
data <- left_join(degs, pcr, by = "New_name")
data <- data[!is.na(data$Relative_Exp), ]
p <- ggbarplot(
  data,
  x = "New_name", y = "Relative_Exp",
  color = "Treatment", palette = c("#4DBBD5", "#E64B35"),
  ylab = "Relative Expression Levels", position = position_dodge(0.9),
  fill = "Treatment", alpha = 0.2,
  add = c("mean_se", "jitter"), add.params = list(size = 0.5)
) +
  stat_compare_means(
    aes(group = Treatment),
    label = "p.signif", hide.ns = FALSE, paired = FALSE, method = "t.test"
  ) +
  theme(axis.text.x = element_text(size = 7, vjust = 0.7, hjust = 0.7, angle = 45))
8、tidyverse使用
R语言长短数据转换
suppressMessages(library(tidyverse))
# 1. Filter rows
rt <- df %>% filter(pvalue < 0.05)
# 2. Wide -> long: stack every column except the fixed one(s) into a name/value
# pair. The two new columns need different names. (`不变的列` is a placeholder
# for the column(s) to keep as they are.)
rt <- df %>%
  pivot_longer(cols = -不变的列, names_to = "New_key", values_to = "New_value")
# 3. Long -> wide: one new column per distinct value of the key column. Columns
# not named here are kept as identifiers, so nothing has to be excluded
# (spread() has no such argument: the extra value was taken as `fill`).
rt <- df %>%
  pivot_wider(names_from = old_key, values_from = old_value)
# 4. Join
combind <- to_df %>%
  left_join(df, by = "pathway")
# 5. Multi-level sort
out_tab <- arrange(out_tab, desc(year), desc(month), desc(day))
arrange(out_tab, match(year, c("C", "A", "B")), desc(Res), desc(Pop)) # custom order for year
# 6. Count / plot in one pipeline: count cells per cluster and sample, convert
# the counts to a percentage within each cluster, and draw a bar plot.
data %>%
  count(seurat_clusters, orig.ident) %>%
  group_by(seurat_clusters) %>%
  mutate(percent = 100 * n / sum(n)) %>%
  ungroup() %>%
  ggbarplot(
    x = "seurat_clusters", y = "percent",
    fill = "orig.ident", color = "black", palette = "jco"
  )
9、获取GO Term的子孙后代
# Build a table of GO BP offspring terms: one column per row of the input CSV.
# Column 2 of the CSV is used as the GO ID to look up, column 1 as the name of
# the resulting column.
go_term <- read.csv("~/GO_BP_superfamily.csv", header = FALSE)
suppressMessages(library("plyr"))
suppressMessages(library(GO.db))
# lapply() over seq_len() handles an empty input and avoids both growing a list
# in a loop and shadowing base::list with a variable called `list`.
offspring_tables <- lapply(seq_len(nrow(go_term)), function(i) {
  offspring <- as.list(GOBPOFFSPRING[as.character(go_term[i, 2])])
  offspring_col <- as.matrix(offspring[[1]])
  colnames(offspring_col) <- as.character(go_term[i, 1])
  data.frame(offspring_col)
})
# Combine the one-column tables into a single data frame and write it out
df <- rbind.fill(offspring_tables)
write.csv(df, "~/GO_BP_superfamily_Offspring.csv")
10、返回最大值列名
参考
参考2
library(tidyverse)
library(purrrlyr)
# Return the name(s) of the column(s) holding the maximum of one row.
#
# row.data: a one-row data frame (or a named numeric vector).
# Returns a single string; tied maxima are joined with ",".
# NA entries are ignored when looking for the maximum — previously a single NA
# made max() return NA and the result came back empty.
row_handler <- function(row.data) {
  index <- which(row.data == max(row.data, na.rm = TRUE)) # position(s) of the maximum
  paste(names(row.data[index]), collapse = ",")           # positions -> names, joined
}
# Apply row_handler to nafld_new_df row by row; the result is stored under the
# name "major_pathway".
# NOTE(review): every column of nafld_new_df is passed to row_handler, so the
# table is presumably all-numeric at this point — confirm.
major_path<-nafld_new_df %>%
by_row(..f = row_handler, .collate = "rows", .to = "major_pathway")
11、批量设组
library(Hmisc)
# Derive the group from the sample name: keep the leading "fed"/"fasting" and
# drop whatever follows it. (The original line had an unbalanced extra ")".)
group <- factor(
  gsub("(fed|fasting).*", "\\1", as.character(rt$Sample)),
  levels = c("fed", "fasting")
)
treatment <- as.character(group)
# Store the capitalised group label as the Treatment column
data_pick$"Treatment" <- capitalize(treatment)
12、提取两字符间文字
如何在R中的两个字符之间提取文本
笨方法分割两次,比如说我要提取>和空格之间的数据,分两步切割,第一步提取空格之前的,第二步提取>之后的,写个函数就行
# Extract the text between ">" and the first space from every entry in the
# first column of rt.
#
# Two literal splits are used: first keep what precedes the first space, then
# take the second ">"-separated field of that. Entries without a ">" give NA.
get_list <- function(rt) {
  # k-th piece of each string after a literal split on sep
  nth_piece <- function(strings, sep, k) {
    unlist(lapply(strings, function(s) {
      strsplit(s, split = sep, fixed = TRUE)[[1]][k]
    }))
  }
  before_space <- nth_piece(as.character(rt[, 1]), " ", 1)
  nth_piece(as.character(before_space), ">", 2)
}
13、去除NA值
# Alternative to na.omit(): keep only the rows with no NA in columns 1 to 5
rna_deg<-rna_deg[complete.cases(rna_deg[,1:5]),]
14、统计方法
Mann-Whitney
15、overlap不使用intersect
intersect 做overlap会去掉重复值所以可以用:
# Return the elements of b that also occur in a. Unlike intersect(), repeated
# values in b are kept. The underlying expression is b[is.element(b, a)]
# (the original example line was missing its closing "]").
#
# a: vector of reference values.
# b: vector to filter.
get_list <- function(a, b) {
  b[is.element(b, a)]
}
16、返回ENTREZID
suppressMessages(library(clusterProfiler))
suppressMessages(library(tidyverse))
suppressMessages(library(org.Hs.eg.db))
suppressMessages(library(scales))
# Map gene symbols to Entrez IDs using org.Hs.eg.db.
#
# list: gene symbols (anything coercible to character); duplicates are dropped.
# Returns a character vector of Entrez IDs with NAs removed; the first mapping
# row is kept for each symbol.
get_gene_list <- function(list) {
  genes <- unique(as.character(list))
  # Call select() through its namespace: with dplyr attached, a bare select()
  # can dispatch to dplyr::select and fail with "no applicable method for
  # 'select' applied to an object of class ... 'OrgDb'".
  gene_list <- AnnotationDbi::select(
    org.Hs.eg.db,
    keys = genes, columns = c("SYMBOL", "ENTREZID"), keytype = "SYMBOL"
  )
  gene_list <- gene_list[!duplicated(gene_list$SYMBOL), ]
  na.omit(as.character(gene_list$ENTREZID))
}
Snakemake注意点
1.转义问题