目录
一.dplyr包
1.新增变量和变量重新赋值
2.筛选行
3.筛选列
4.分组计算
5.管道操作符
6.连接数据框
二.tidyr
1.列的分裂
2.列的合并
3.宽数据转长数据
4.长数据转宽数据
> head(ToothGrowth)
len supp dose
1 4.2 VC 0.5
2 11.5 VC 0.5
3 7.3 VC 0.5
4 5.8 VC 0.5
5 6.4 VC 0.5
6 10.0 VC 0.5
> ToothGrowth2<-mutate(ToothGrowth,len=len^2,nv=1:nrow(ToothGrowth),
nv2=ifelse(nv>median(nv),"H","L"))
> head(ToothGrowth2)
len supp dose nv nv2
1 17.64 VC 0.5 1 L
2 132.25 VC 0.5 2 L
3 53.29 VC 0.5 3 L
4 33.64 VC 0.5 4 L
5 40.96 VC 0.5 5 L
6 100.00 VC 0.5 6 L
> ToothGrowth3<-filter(ToothGrowth2,nv %in% 1:50,nv2=="H")
> head(ToothGrowth3)
len supp dose nv nv2
1 231.04 OJ 0.5 31 H
2 462.25 OJ 0.5 32 H
3 309.76 OJ 0.5 33 H
4 94.09 OJ 0.5 34 H
5 210.25 OJ 0.5 35 H
6 100.00 OJ 0.5 36 H
#只显示第二列和第四列的数据
ToothGrowth4 <- select(ToothGrowth3,c(2,4))
> summarise(ToothGrowth,len_max=max(len))
len_max
1 33.9
#会按照supp进行分组
> summarise(group_by(ToothGrowth,supp),len_max=max(len))
# A tibble: 2 × 2
supp len_max
1 OJ 30.9
2 VC 33.9
> summarise(group_by(ToothGrowth,dose),len_max=max(len))
# A tibble: 3 × 2
dose len_max
1 0.5 21.5
2 1 27.3
3 2 33.9
> summarise(group_by(ToothGrowth,dose,supp),len_max=max(len))
`summarise()` has grouped output by 'dose'. You can override using the `.groups`
argument.
# A tibble: 6 × 3
# Groups: dose [3]
dose supp len_max
1 0.5 OJ 21.5
2 0.5 VC 11.5
3 1 OJ 27.3
4 1 VC 22.5
5 2 OJ 30.9
6 2 VC 33.9
> library(magrittr)
#管道操作符%>%:将左边表达式的结果作为右边函数的第一个参数传入
> ToothGrowth %>%
+ mutate(nv=1:nrow(ToothGrowth))%>%
+ filter(nv %in% 1:50)%>%
+ select(1:2) %>%
+ group_by(supp)%>%
+ summarise(len_max=max(len))%>%
+ as.data.frame()
> df1<-data.frame(c1=2:5,c2=LETTERS[2:5])
> df1
c1 c2
1 2 B
2 3 C
3 4 D
4 5 E
> df2<-data.frame(c3=LETTERS[c(2:3,20:23)],c4=sample(1:100,size=6))
> df2
c3 c4
1 B 62
2 C 20
3 T 29
4 U 42
5 V 60
6 W 65
#left_join(df1,df2,by=c('c2'='c3')):以df1为左表与df2做左连接,保留df1的所有行,df2中无匹配的值填充为NA
> df1 %>% left_join(df2,by=c('c2'='c3'))
c1 c2 c4
1 2 B 62
2 3 C 20
3 4 D NA
4 5 E NA
> df1 %>% right_join(df2,by=c('c2'='c3'))
c1 c2 c4
1 2 B 62
2 3 C 20
3 NA T 29
4 NA U 42
5 NA V 60
6 NA W 65
> df1 %>% full_join(df2,by=c('c2'='c3'))
c1 c2 c4
1 2 B 62
2 3 C 20
3 4 D NA
4 5 E NA
5 NA T 29
6 NA U 42
7 NA V 60
8 NA W 65
> df1 %>% inner_join(df2,by=c('c2'='c3'))
c1 c2 c4
1 2 B 62
2 3 C 20
> library(tidyr)
> df3<-data.frame(c5=paste(letters[1:3],1:3,sep="-"),
+ c6=paste(letters[1:3],1:3,sep="."),
+ c4=c("B","B","B"),
+ c3=c("H","M","L"))
> df3
c5 c6 c4 c3
1 a-1 a.1 B H
2 b-2 b.2 B M
3 c-3 c.3 B L
> df4<-df3%>%
+ separate(col=c5,sep="-",into=c("c7","c8"),remove=F)%>%
+ separate(col=c6,sep="\\.",into=c("c9","c10"),remove=T)
#sep参数是正则表达式,"."在正则中匹配任意字符,因此需要用\\.转义来匹配字面意义上的点号
> df4
c5 c7 c8 c9 c10 c4 c3
1 a-1 a 1 a 1 B H
2 b-2 b 2 b 2 B M
3 c-3 c 3 c 3 B L
> df4%>%
#remove=F表示保留原来被合并的列,remove=T表示合并后将原来的列删除
+ unite(col="c11",c("c7","c8"),sep="_",remove=F)%>%
+ unite(col="c12",c("c9","c10"),sep=".",remove=T)
c5 c11 c7 c8 c12 c4 c3
1 a-1 a_1 a 1 a.1 B H
2 b-2 b_2 b 2 b.2 B M
3 c-3 c_3 c 3 c.3 B L
个人以为以下两个函数特别重要,特别有用,特别是在绘制线性图的时候
宽数据:列数变多了,names_from表示新列名的来源,values_from表示新列值的来源
从长数据到宽数据使用pivot_wider(),参数以from结尾
长数据:行数变多了,names_to表示原列名存入的新列,values_to表示原列值存入的新列,从宽数据到长数据使用pivot_longer(),参数以to结尾
> set.seed(42)
> df5<-data.frame(time=rep(2011:2013,each=3),
+ area=rep(letters[1:3],times=3),
+ pop=sample(100:1000,9),
+ den=round(rnorm(9,mean=3,sd=0.1),2),
+ mj=sample(8:12,9,replace = T))
#replace=T表示有放回地抽样
> df5
time area pop den mj
1 2011 a 660 2.99 12
2 2011 b 420 3.15 12
3 2011 c 252 2.99 11
4 2012 a 173 3.20 9
5 2012 b 327 2.99 11
6 2012 c 245 3.13 10
7 2013 a 733 3.23 9
8 2013 b 148 2.86 8
9 2013 c 227 2.97 9
> df6<-df5%>%
#将除前两列以外的列(pop、den、mj)转换为长格式:原列名存入一列,原列值存入另一列
+ pivot_longer(cols=-c(1:2),
#列名称
+ names_to="varb",
#列值
+ values_to="value")
> df6
# A tibble: 27 × 4
time area varb value
1 2011 a pop 660
2 2011 a den 2.99
3 2011 a mj 12
4 2011 b pop 420
5 2011 b den 3.15
6 2011 b mj 12
7 2011 c pop 252
8 2011 c den 2.99
9 2011 c mj 11
10 2012 a pop 173
# ℹ 17 more rows
# ℹ Use `print(n = ...)` to see more rows
>
> df6%>%
+ pivot_wider(names_from=c(area,varb),values_from = value)
# A tibble: 3 × 10
time a_pop a_den a_mj b_pop b_den b_mj c_pop c_den c_mj
1 2011 660 2.99 12 420 3.15 12 252 2.99 11
2 2012 173 3.2 9 327 2.99 11 245 3.13 10
3 2013 733 3.23 9 148 2.86 8 227 2.97 9
>