个人学习R语言过程中的记录,仅供个人查阅。
参考链接:http://www.mobange.com/nav/other/85071.html
if(FALSE)
{
......
}
# 一个分支
if(条件) {
执行体
}
# 两个分支
if(条件) {
执行体1
} else {
执行体2
}
# 多分支例子
if(score >= 90) {
res = "优"
} else if(score >= 80) {
res = "良"
} else if(score >= 70) {
res = "中"
} else if(score >= 60) {
res = "及格"
} else {
res = "不及格"
}
next:结束本次循环;
break:跳出整个循环。
df$time <- as.Date(df$time) # 将df数据框中的 time 列转换成date格式
library(reshape2) # provides melt()
library(dplyr) # provides mutate(), rename() and the %>% pipe
cormtcars <- round(cor(mtcars), 3) # correlation matrix rounded to three decimals
# Reshape the correlation matrix into long format: one row per (x, y) pair.
data <- as.data.frame(cormtcars) %>% # convert the matrix to a data frame
  mutate(x = rownames(cormtcars)) %>% # add column x holding the matrix row names
  melt(id = 'x') %>% # reshape from wide to long format
  dplyr::rename('y' = 'variable', 'Corr' = 'value') # rename variable -> y and value -> Corr
year <- substr(id_number$身份证号,7,10)
num = formatC(1, flag = 0, width = 2)
newdf <- round(cor(df), 2)
大写函数 toupper()函数
小写函数tolower()函数
首字母大写要调用Hmisc包的capitalize()函数
或用stringr包里的str_to_title()函数
save(var, file = "c:/var.Rdata")
write.csv(var, file = "c:/var.csv")
https://blog.csdn.net/m0_54356409/article/details/124201952
读取csv格式数据 read.csv()
读取xlsx格式数据
A <- c(1:3)
B <- c(5:10)
C <- c(A, B)
# 将数据框中df的name和age分别拼接在一起,作为一个新变量
newdf <- mutate(df, newvar = str_c(name, age, sep = '_'))
# Example found online (https://bbs.pinggu.org/forum.php?mod=viewthread&tid=6676000&page=1#pid53793584):
# build every "B.A" combination of two character vectors.
# library() is used instead of require() so a missing package fails loudly.
library(tidyverse)
A <- c("S1", "S2", "S3")
B <- c("AFG", "ALB", "DZA")
# tibble() replaces the deprecated data_frame().
tibble(A, B) %>%
  expand(A, B) %>% # all combinations of A and B
  mutate(var = str_c(B, A, sep = '.')) %>% # join as "B.A"
  pull(var) # return the combined strings as a character vector
参考链接
字段合并_1.3 R语言数据合并与追加
https://blog.csdn.net/weixin_39596720/article/details/112696799
## Both data frames share the key column X. When the key has the same name in
## both frames, by = "X" may be omitted (merge() then joins on the column names
## the two frames have in common).
merge(df1, df2, by = "X")
## Keys with different names: inner join, keeping only rows whose key value
## appears in both data frames.
merge(df1, df2, by.x = "X", by.y = "Y")
## Left join: the inner-join rows plus the rows found only in df1.
merge(df1, df2, by.x = "X", by.y = "Y", all.x = TRUE)
## Right join: the inner-join rows plus the rows found only in df2.
merge(df1, df2, by.x = "X", by.y = "Y", all.y = TRUE)
# Full outer join: keeps the union of rows from both data frames.
merge(df1, df2, by.x = "X", by.y = "Y", all = TRUE)
默认是升序(decreasing = FALSE);需要降序时设置 decreasing = TRUE
sort(x, decreasing = FALSE, …)
library(dplyr)
arrange(data,ver1) # 依据ver1升序对data进行排序
arrange(data,ver1,ver2) # 依据ver1升序和ver2升序对data进行排序(针对ver1升序后重复的数据,依据ver2升序进行排序)
arrange(data, desc(ver1), ver2) #以及ver1降序,ver2升序对data排序
# 使用行列号删除行列
newdata <- data[-1, ] # 删除第一行
newdata <- data[ ,-1] # 删除第一列
newdata <- data[-c(1,2), ] # 删除1,2行
newdata <- data[, -c(1,2)] # 删除1,2列
df1 <- select(df,-3) # 删除第3列
df2<- select(df,-name) # 删除列名为name的列
df3<- select(df,-c(name,age)) #删除列名为name和age的列
library(dplyr)
newdata <- slice(data, -c(1, 2)) # drop rows 1 and 2
newdata <- subset(data, name != "wang") # drop rows where name equals "wang"
newdata <- filter(data, name != "wang") # same, using dplyr's filter()
# Conditions are combined with `&` (not `$`): keeps only rows where name is not
# "wang" and age is not 25, i.e. drops rows with name "wang" or age 25.
newdata <- filter(data, name != "wang" & age != 25)
# Drop rows whose uptake lies strictly between 5 and 10, or exceeds 30.
# The two conditions are alternatives, so they are combined with `|`.
# A logical mask is used instead of negative row numbers because
# CO2[-integer(0), ] returns zero rows when no row matches.
drop_row <- (CO2$uptake > 5 & CO2$uptake < 10) | CO2$uptake > 30
# Rows where the condition is NA are kept, as which() would have ignored them.
newdata <- CO2[!(drop_row & !is.na(drop_row)), ]
# Drop the rows of the data frame whose var column is NA.
newdata <- data[!is.na(data[ , "var"]), ]
# Drop the rows whose var is below 10. The condition itself is used as the row
# mask; rows with NA in var are kept unchanged rather than turned into NA rows.
newdata <- data[is.na(data$var) | data$var >= 10, ]
newdata <- na.omit(data)
var_new <- unique(var)
var_new <- var[!duplicated(var), ]
dim(df) # 维度
dim(df)[1] # 行数
dim(df)[2] # 列数
nrow(df) # 行数
ncol(df) # 列数
str(df) # 打印数据框结构信息
head(df,5) # 打印数据框前五行数据
tail(df,5) # 打印数据框后五行数据
df$gender <- as.factor(df$gender)
str(df$gender) # 查看
# Reconstructed from a garbled line: convert the Gender column to a factor.
# NOTE(review): the original mixes df (target) and df1 (source) - confirm which data frame is intended.
df$Gender <- as.factor(df1$Gender)
newdata <- rename(data, "newname" = oldname)
names(data)[1] <- "newname1"
names(data)[1:2] <- c("newname1","newname2")
colnames(data)[1] <- "newname1" # 只修改固定列的列名
colnames(data)[1:2] <- c("newname1","newname2")
colnames(data) <- c("newname1","newname2","newname3") # 默认只有三列
使用函数 rownames()
rownames(data)[1:3] = letters[1:3]
which(df$age == 27) # indices of the rows that satisfy the condition
# which() takes no column argument (its second parameter is arr.ind), so the
# matching rows are used to index the data frame instead.
df[which(df$age == 27), 'age'] # age values of the matching rows
df[which(df$age == 27 & df$height > 170), 'name'] # names where both conditions hold
df[which(df$age == 27 | df$height > 170), 'name'] # names where either condition holds
which.min(df$age) # index of the (first) smallest age
which.max(df$age) # index of the (first) largest age
# Column names are lowercase here to match the columns used in the condition.
subset(df, age == 27 & height > 170, c('name', 'age'))
# 注意下面方法要长度相同
df$age <- c(15,21,23,24,25)
df<- cbind(df, age = c(15,21,23,24,25))
df <- mutate(df, age = 0) # 添加一列0值,名称为'age',默认在最后一列
df <- select(df, age, everything()) # 将age列调整在第一列
data_col <- data[ , 1] # extract column 1
data_row <- data[1, ] # extract row 1
data_col <- data[ , 1:3] # extract columns 1 through 3
data_col <- data[ , c(1, 3)] # extract columns 1 and 3
data_var <- data["var"] # select the column named var (for a data frame this stays a one-column data frame)
data_var <- data$var # extract the column named var as a vector
data_var <- data[data$var > 10, ] # extract the rows where var is greater than 10
# Conditions can be combined with `&` (both must hold) or `|` (either may hold);
# `==` tests equality.
data_var <- data[data$var1 > 10 & data$var2 < 20, ] # rows satisfying both conditions
data_var <- data[data$var1 > 10 | data$var2 < 20, ] # rows satisfying either condition
data_var <- data[data$var1 == 1, ] # rows where var1 equals 1
subset(airquality, Temp > 80, select = c(Ozone, Temp))
subset(airquality, Day == 1, select = -Temp)
subset(airquality, select = Ozone:Wind)
with(airquality, subset(Ozone, Temp > 80))
library(dplyr)
select(df,1)
select(df,var)
select(df,var1:var3)
## example with character variables and NAs
testDF <- data.frame(v1 = c(1,3,5,7,8,3,5,NA,4,5,7,9),
v2 = c(11,33,55,77,88,33,55,NA,44,55,77,99) )
by1 <- c("red", "blue", 1, 2, NA, "big", 1, 2, "red", 1, NA, 12)
by2 <- c("wet", "dry", 99, 95, NA, "damp", 95, 99, "red", 99, NA, NA)
aggregate(x = testDF, by = list(by1, by2), FUN = "mean")
## Formulas, one ~ one, one ~ many, many ~ one, and many ~ many:
## ~符号左边是需要被平均的向量,右边是条件。
aggregate(weight ~ feed, data = chickwts, mean) # one ~ one
aggregate(breaks ~ wool + tension, data = warpbreaks, mean) # one ~ many
aggregate(cbind(Ozone, Temp) ~ Month, data = airquality, mean) # many ~ one
aggregate(cbind(ncases, ncontrols) ~ alcgp + tobgp, data = esoph, sum) # many ~ many
summarise 用于分组汇总
# Per group: mean, standard deviation, standard error and count of var1.
library(dplyr)
# NOTE(review): the original grouped an object called `date`; `data` is assumed
# to be the intended data frame - confirm.
A <- group_by(data, class1, class2)
# The standard error is computed from the same column as the mean and sd (var1).
B <- summarise(A, mean = mean(var1), sd = sd(var1), se = sd(var1) / sqrt(n()), n = n())
#' Summarise one measurement column per group.
#'
#' For every combination of `groupvars`, computes the count, mean, standard
#' deviation, standard error of the mean and the confidence-interval
#' half-width of `measurevar`.
#'
#' @param data A data frame.
#' @param measurevar Name (string) of the column to summarise.
#' @param groupvars Character vector with the names of the grouping columns.
#' @param na.rm If TRUE, NA values are ignored in the count, mean and sd.
#' @param conf.interval Confidence level used for the `ci` column.
#' @param .drop Passed on to `plyr::ddply()`.
#' @return A data frame with the grouping columns plus `N`, a column named
#'   after `measurevar` (the group mean), `sd`, `se` and `ci`.
summarySE <- function(data = NULL, measurevar, groupvars = NULL, na.rm = FALSE,
                      conf.interval = .95, .drop = TRUE) {
  # plyr is called through its namespace instead of library(plyr): attaching
  # it from inside a function changes the caller's search path, and an
  # unqualified rename() resolves to dplyr's version when dplyr is attached
  # after plyr.
  if (!requireNamespace("plyr", quietly = TRUE)) {
    stop("summarySE() requires the 'plyr' package", call. = FALSE)
  }

  # Variant of length() that can skip NA values: with na.rm = TRUE they are
  # not counted.
  length2 <- function(x, na.rm = FALSE) {
    if (na.rm) sum(!is.na(x)) else length(x)
  }

  # For each group's data frame, return a vector with N, mean and sd.
  datac <- plyr::ddply(data, groupvars, .drop = .drop,
    .fun = function(xx, col) {
      c(N    = length2(xx[[col]], na.rm = na.rm),
        mean = mean(xx[[col]], na.rm = na.rm),
        sd   = sd(xx[[col]], na.rm = na.rm))
    },
    measurevar
  )

  # Name the "mean" column after the measured variable.
  names(datac)[names(datac) == "mean"] <- measurevar

  # Standard error of the mean.
  datac$se <- datac$sd / sqrt(datac$N)

  # Confidence-interval multiplier from the t distribution: for a .95
  # interval use the .975 quantile with N - 1 degrees of freedom.
  ciMult <- qt(conf.interval / 2 + .5, datac$N - 1)
  datac$ci <- datac$se * ciMult

  datac
}
使用creditmodel包的min_max_norm()函数
install.packages("creditmodel") # the package name must be passed as a quoted string
library(creditmodel)
min_max_norm(x) # min_max_norm() is provided by the creditmodel package
i = 1
num = formatC(i, flag = 0, width = 2)
print(num)
data_new <- data[which(data$var), ]
library (showtext)
showtext_auto()
scale_x_continuous(limits = c(-5,15), # 设置最大和最小值
breaks = seq(0, 50, 1)) # 0-50每个1设置一个刻度
画横线:geom_hline(yintercept = 5 )
画竖线:geom_vline(xintercept = 20)
画斜线:geom_abline(intercept = 37, slope = -5)
path_s = "D:/Figure/"
ggsave(paste0(path_s, 'box', '.tiff'), Figure, width = 10, height = 7, units = c("cm"), dpi = 600)
ggplot2包中对配色系统主要通过以下几个函数实现:
参考链接:https://blog.csdn.net/qq_18055167/article/details/123512967
数值型
类别型
其中,colour表示轮廓色度量,fill表示填充色度量
————————————————————
调色板介绍
参考链接:https://blog.csdn.net/weshengxin/article/details/126006349
3 类调色板各有特色:
# 显示所有面板
display.brewer.all(n=NULL, type="all", select=NULL, exact.n=TRUE, colorblindFriendly=FALSE)
————————————————————
变量的类型
参考链接:https://blog.csdn.net/Allenmumu/article/details/119532449
————————————————————
(1)调整单一颜色
(2)分类变量颜色调整
(3)数值变量颜色调整
定量变量表示可测量的数量,因此是数字变量。定量数据可以进一步分类为连续(可能是浮点数)或离散(仅限整数)。
函数 scale_color_gradient() 是一个顺序梯度,
而 scale_color_gradient2() 是发散的。
连续变量默认配色方案,单色渐变,
scale_color_continuous()
手动设置顺序配色方案,
scale_color_gradient(low = "white", high = "black")
发散的配色方案,
scale_color_gradient2(low = "red", mid = "white", high = "blue")
使用R预设调色板,
scale_color_gradientn(colours =rainbow(10))
将ColorBrewer的颜色应用到连续变量上。
scale_color_distiller(palette = "Spectral")
ggplot2中的 Viridis 调色板(好看,比较实用)
scale_color_viridis_c()
scale_color_viridis_c(option = "inferno")
scale_color_viridis_c(option = "plasma")
scale_color_viridis_c(option = "cividis")
使用扩展包中的调色板
rcartocolor包
scale_color_carto_c(palette = "BurgYl")
scale_color_carto_c(palette = "Earth")
参考博文:
# 使用geom_histogram自定义
P <- ggplot(data, aes(var))
P + geom_histogram(fill = "darkgrey",
color="white") +
scale_x_continuous(breaks = seq(0, 50, 1))
# 让每个bin的两侧都显示标注
P + geom_bar() + scale_x_binned()
参考博文:
https://www.zhangshengrong.com/p/ArXGbnrENj/
构建了新函数summarySE,计算平均值和标准差,添加了误差棒
https://zhuanlan.zhihu.com/p/38412409
添加了文本标注
# Grouped bar chart of seed_setting_rate with +/- one standard error bars.
# Summarise seed_setting_rate per genotype x treatment with summarySE(); the
# result holds the group mean (in a column named seed_setting_rate) and se.
D.se <- summarySE(D, measurevar = 'seed_setting_rate', groupvars=c('genotype', 'treatment'), na.rm = TRUE)
# Layers are added in the order written: the error bars first, then the bars.
# NOTE(review): mytheme is not defined in the visible source - presumably a
# ggplot2 theme object defined elsewhere; confirm.
P <- ggplot(D.se , aes(x = genotype, y = seed_setting_rate, fill = treatment)) +
geom_errorbar(aes(ymax = seed_setting_rate + se, ymin = seed_setting_rate - se),
position = position_dodge(0.9), width = .8, size = .2) +
geom_bar(stat="identity", position="dodge") +
mytheme
# Display the plot.
P
# Save the plot as a 20 x 10 cm TIFF at 600 dpi under the directory in `path`.
path = "G:/F/work_RuGao_WYL_2022/Phenotype/Figure/maturity/"
ggsave(paste0(path,'bar_','seed_setting_rate','.tiff'), P, width = 20, height = 10,units = c("cm"), dpi = 600)
library(ggplot2)
library(ggpubr) # ggscatter() and stat_cor() are provided by ggpubr, not ggplot2
data(mtcars)
df <- mtcars
df$cyl <- as.factor(df$cyl)
# Scatter plot of mpg against wt with a regression line and its confidence band.
ggscatter(df, x = "wt", y = "mpg",
          add = "reg.line", conf.int = TRUE,
          add.params = list(fill = "lightgray"),
          ggtheme = theme_minimal()
) +
  # Annotate the plot with the Pearson correlation, drawn in red at (2, 30).
  stat_cor(method = "pearson",
           label.x = 2, label.y = 30, color = 'red')
参考链接:
参考链接:
R语言绘制相关性热图(相关性图)多种方法盘点,包你一学就会!( https://zhuanlan.zhihu.com/p/458889477 )
library(ggcorrplot)
library(ggplot2)
# Plot with the default settings.
ggcorrplot(cormtcars)
# Plot with custom settings.
cor <- round(cor(mtcars), 2) # correlation matrix from base cor(), rounded to two decimals
pmtcars <- cor_pmat(mtcars) # p-values computed with ggcorrplot's cor_pmat()
P <- ggcorrplot(cor,
  method = "circle", # draw each correlation as a circle (the default is a square)
  type = "upper", # show only the upper triangle
  lab = TRUE, lab_size = 3, # print the correlation values and set their font size
  # p.mat takes the p-value matrix computed above. insig = "blank" leaves
  # non-significant correlations empty (the default marks them with a cross).
  p.mat = pmtcars, insig = "blank"
  # , hc.order = TRUE # reorder the matrix by hierarchical clustering
)
P
# Save the figure.
path_s <- "D:/"
ggsave(paste0(path_s, 'cor', '.tiff'), P, width = 20, height = 20, units = c("cm"), dpi = 600)
还可以使用以下包:
# Replace outliers with NA using the boxplot rule (beyond 1.5 * IQR from the
# quartiles), separately for each genotype and each parameter column.
df <- D_PE_date
genotype_ids <- 1:110
# Collect the cleaned per-genotype pieces in a list and bind them once at the
# end instead of growing a data frame with rbind() inside the loop.
cleaned <- vector("list", length(genotype_ids))
for (k in seq_along(genotype_ids)) {
  i <- genotype_ids[k]
  print(i)
  D.i <- subset(df, genotype_num == i)
  for (j in parameter_abbr) {
    D.ij <- D.i[, j]
    qnt <- quantile(D.ij, probs = c(.25, .75), na.rm = TRUE, names = FALSE) # lower and upper quartiles
    H <- 1.5 * IQR(D.ij, na.rm = TRUE) # 1.5 times the interquartile range
    # which() drops NA comparisons; a logical index containing NA is rejected
    # by data-frame assignment, so columns with missing values would error.
    outlier_rows <- which(D.ij < qnt[1] - H | D.ij > qnt[2] + H)
    D.i[outlier_rows, j] <- NA
  }
  cleaned[[k]] <- D.i
}
# Start from an empty frame with df's columns so the result is well-formed
# even when every piece is empty.
D_PE_outlier <- do.call(rbind, c(list(df[0, ]), cleaned))