目录
1、data.table包中特殊符号.SD(通过.SDcols选择的变量作处理)、.SDcols(变量列选择)、.I(返回位置)、.N(计数)、.BY、J、CJ、SJ、:=
2、分组计数(单一方法)比较:
3、多分组情况下比较 :
4、if_else和fifelse函数比较:
5、row_number、with_groups与order、by综合比较:
6、字段拆分函数tstrsplit与str_split_fixed比较:
7、fcase与case_when函数比较:
# dplyr: count rows per value of `var` with group_by() + tally().
# The call is wrapped in sys_time_print(); the "Finished in ..." line that
# follows in this document is the timing reported for it.
sys_time_print(data_test%>% group_by(var) %>% tally())
#运行耗时:
[1] "Finished in 0.870s elapsed (0.670s cpu)"
# data.table: the same per-`var` row count, using the special symbol .N with by=.
sys_time_print(data_test[,.N,by=var])
#运行耗时:
[1] "Finished in 0.420s elapsed (0.560s cpu)"
# data.table: add var1_tag (as_date of var1) with :=, then chain a second [ ]
# that sums var3 within each class1 / id / var1_tag / var2 group.
# The grouping columns are passed to by= as one comma-separated string.
sys_time_print(test <- data_test[,var1_tag:=as_date(var1)]
[,.(var3_sum=sum(var3)),by="class1,id,var1_tag,var2"])
#运行耗时:
[1] "Finished in 2.500s elapsed (4.730s cpu)"
# Same grouped sum, but routed through .SD restricted to var3.
# The column-selection argument is spelled out in full as `.SDcols`; the
# earlier `.SDcol` spelling only worked through partial argument matching.
sys_time_print(test <- data_test[,var1_tag:=as_date(var1)]
[,.(var3_sum=sum(.SD)),by="class1,id,var1_tag,var2",.SDcols="var3"])
#运行耗时:耗时较长
# dplyr counterpart: derive var1_tag, group by the same four columns, and
# sum var3 per group.
sys_time_print(test <- data_test%>%
mutate(var1_tag=as_date(var1))%>%
group_by(class1,id,var1_tag,var2) %>%
summarise(var3_sum=sum(var3)))
#运行耗时:
[1] "Finished in 10.0s elapsed (9.690s cpu)"
fifelse()函数:可与dplyr包的if_else函数、R内置的ifelse函数对比计算效率
# dplyr if_else version.
# Labels each row by testing several rank columns in turn (data set of 200k+ rows):
# the first matching test among s1_rank, s2_rank, s3_rank decides the tag;
# otherwise s4_rank == 0 gives "no_tag" and anything else gives "tag4".
data_test%>%
mutate(final_tag=if_else(s1_rank==1,"tag1",
if_else(s2_rank==1,"tag2",
if_else(s3_rank==1,"tag3",
if_else(s4_rank==0,"no_tag","tag4")))))
# data.table fifelse version of the same nested labelling.
# The function name is `fifelse`; it was previously misspelled `fiflese`,
# which is not a defined function and made the snippet fail.
data_test%>%
mutate(final_tag=fifelse(s1_rank==1,"tag1",
fifelse(s2_rank==1,"tag2",
fifelse(s3_rank==1,"tag3",
fifelse(s4_rank==0,"no_tag","tag4")))))
if_else运行效果 fifelse运行效果
dplyr包 row_number、with_groups与data.table中排序(order、frank)、分组函数比较:
# After sorting within groups, keep the first row of each group with all
# columns retained (close to 20 million rows).
# order() returns the sorting permutation, not each row's rank, so assigning
# its result to tag_rank and filtering tag_rank == 1 does not select the row
# with the smallest var_order. Instead: sort once with order(), number the
# rows within each class, and keep row number 1.
# NOTE(review): the timing printed after this snippet was measured on the
# earlier order()-as-rank form; re-measure.
data_test[tag1=="tag_attr",][order(var_order,na.last=TRUE)][,tag_rank:=seq_len(.N),by="class"][tag_rank==1,]
#运行耗时
[1] "Finished in 8.520s elapsed (8.390s cpu)"
# Variant: same tag_rank assignment, then .SD[1] takes the first row of each
# role_id group.
# NOTE(review): .SD[1] is the first row in existing row order and does not use
# tag_rank; also tag_rank is computed by "class" while the final grouping is by
# "role_id" — confirm the intended grouping column and selection rule.
data_test[tag1=="tag_attr",][,tag_rank:=order(var_order,na.last=TRUE),by="class"][,.SD[1],by="role_id"]
#运行耗时
[1] "Finished in 8.120s elapsed (8.140s cpu)"
data.table中的排序函数frank在上述计算规则下,运行相对较慢
# row_number / with_groups version (dplyr):
# filter to tag1 == "tag_attr", rank var_order within each class via
# row_number(), then keep the rows ranked 1.
data_test%>%
filter(tag1=="tag_attr") %>%
with_groups(class,mutate,tag_rank=row_number(var_order)) %>%
filter(tag_rank==1)
#运行耗时
[1] "Finished in 9.920s elapsed (9.300s cpu)"
数据量1000w+
# stringr: split split_var on a single space into 2 pieces and keep the
# first piece (column 1 of the matrix returned by str_split_fixed).
data_test%>%mutate(split_result=str_split_fixed(split_var," ",2)[,1])
#运行耗时
[1] "Finished in 27.3s elapsed (26.9s cpu)"
# data.table: keep the first space-delimited piece of split_var.
# tstrsplit() already returns a list of columns, so its result is assigned
# directly. Wrapping it in lapply(.SD, ...) nested that list one level deeper,
# which := stores as a list column rather than a character column.
# NOTE(review): the timing printed after this snippet predates this change;
# re-measure.
data_test[,split_result:=tstrsplit(split_var," ",
fixed=TRUE,keep=1L)]
#运行耗时
[1] "Finished in 23.4s elapsed (23.1s cpu)"
# dplyr case_when: bucket var1 into labelled unit-wide intervals
# (<=0, (0,1], (1,2], ... (6,7], >7). Conditions are listed in ascending order.
data_test%>%mutate(var_tag=case_when(var1<=0~"first",
var1>0&var1<=1~"second",
var1>1&var1<=2~"third",
var1>2&var1<=3~"fourth",
var1>3&var1<=4~"fifth",
var1>4&var1<=5~"sixth",
var1>5&var1<=6~"seventh",
var1>6&var1<=7~"eighth",
var1>7~"other"))
#运行耗时
[1] "Finished in 00:01:01 elapsed (00:01:00 cpu)"
# data.table fcase applied to .SD, which .SDcols restricts to the single
# column "var"; same interval labels as the case_when version.
# A stray "]" after the closing bracket made this snippet a syntax error and
# has been removed.
# NOTE(review): the case_when snippet in this document buckets `var1`, while
# this one selects "var" — confirm the column name.
data_test[,var_tag:=data.table::fcase(.SD<=0,"first",
.SD>0&.SD<=1,"second",
.SD>1&.SD<=2,"third",
.SD>2&.SD<=3,"fourth",
.SD>3&.SD<=4,"fifth",
.SD>4&.SD<=5,"sixth",
.SD>5&.SD<=6,"seventh",
.SD>6&.SD<=7,"eighth",
.SD>7,"other"),.SDcols="var"]
#运行耗时
[1] "Finished in 56.2s elapsed (55.4s cpu)"
# data.table fcase applied per column through lapply(.SD, ...), with .SDcols
# selecting "var". Unlike the explicit x > 7 branch used in the other variants,
# the last label is supplied through default=, so it is used whenever no
# earlier condition is TRUE.
# NOTE(review): that presumably also labels NA inputs "other" — confirm this
# difference is intended.
data_test[,var_tag:=lapply(.SD,function(x) data.table::fcase(x<=0,"first",
x>0&x<=1,"second",
x>1&x<=2,"third",
x>2&x<=3,"fourth",
x>3&x<=4,"fifth",
x>4&x<=5,"sixth",
x>5&x<=6,"seventh",
x>6&x<=7,"eighth",
default="other")),.SDcols="var"]
#运行耗时
[1] "Finished in 51.7s elapsed (50.7s cpu)"
多维度数据分组集成函数(可类比SQL中grouping sets用法)
# Add two categorical bucket columns to iris (demo data for the grouping
# functions): petal_sec from Petal.Length and sepal_sec from Sepal.Length.
# NOTE(review): iris Sepal.Length appears to exceed 4 for every row, which
# would make sepal_sec "s3" throughout and would not match the s1/s2/s3 split
# in the printed outputs — confirm the thresholds / source column.
iris_trans <- mutate(
  iris,
  petal_sec = fifelse(
    Petal.Length <= 2,
    "tag1",
    fifelse(Petal.Length > 2 & Petal.Length <= 3, "tag2", "tag3")
  ),
  sepal_sec = fifelse(
    Sepal.Length <= 2,
    "s1",
    fifelse(Sepal.Length > 2 & Sepal.Length <= 4, "s2", "s3")
  )
)
#
# rollup(): sum Sepal.Width for progressively shorter prefixes of `by`
# (all three columns, then the first two, then the first one, then the
# grand total), i.e. a stepwise-decreasing grouping.
rollup(
  as.data.table(iris_trans),
  j = .(sw_value = sum(Sepal.Width)),
  by = c("Species", "petal_sec", "sepal_sec")
)
#输出:
Species petal_sec sepal_sec sw_value
1: setosa tag1 s2 81.7
2: setosa tag1 s1 89.7
3: versicolor tag3 s2 129.3
4: versicolor tag3 s1 6.7
5: versicolor tag2 s2 2.5
6: virginica tag3 s2 108.7
7: virginica tag3 s3 37.5
8: virginica tag3 s1 2.5
9: setosa tag1 171.4
10: versicolor tag3 136.0
11: versicolor tag2 2.5
12: virginica tag3 148.7
13: setosa 171.4
14: versicolor 138.5
15: virginica 148.7
16: 458.6
#
# cube(): sum Sepal.Width for every combination of the `by` columns,
# including the grand total.
cube(
  as.data.table(iris_trans),
  j = .(sw_value = sum(Sepal.Width)),
  by = c("Species", "petal_sec", "sepal_sec")
)
Species petal_sec sepal_sec sw_value
1: setosa tag1 s2 81.7
2: setosa tag1 s1 89.7
3: versicolor tag3 s2 129.3
4: versicolor tag3 s1 6.7
5: versicolor tag2 s2 2.5
6: virginica tag3 s2 108.7
7: virginica tag3 s3 37.5
8: virginica tag3 s1 2.5
9: setosa tag1 171.4
10: versicolor tag3 136.0
11: versicolor tag2 2.5
12: virginica tag3 148.7
13: setosa s2 81.7
14: setosa s1 89.7
15: versicolor s2 131.8
16: versicolor s1 6.7
17: virginica s2 108.7
18: virginica s3 37.5
19: virginica s1 2.5
20: setosa 171.4
21: versicolor 138.5
22: virginica 148.7
23: tag1 s2 81.7
24: tag1 s1 89.7
25: tag3 s2 238.0
26: tag3 s1 9.2
27: tag2 s2 2.5
28: tag3 s3 37.5
29: tag1 171.4
30: tag3 284.7
31: tag2 2.5
32: s2 322.2
33: s1 98.9
34: s3 37.5
35: 458.6
#
# groupingsets(): sum Sepal.Width only for the explicitly listed grouping
# sets — here by Species alone, and by all three columns together.
groupingsets(
  as.data.table(iris_trans),
  j = .(sw_value = sum(Sepal.Width)),
  by = c("Species", "petal_sec", "sepal_sec"),
  sets = list(
    "Species",
    c("Species", "petal_sec", "sepal_sec")
  )
)
Species petal_sec sepal_sec sw_value
1: setosa 171.4
2: versicolor 138.5
3: virginica 148.7
4: setosa tag1 s2 81.7
5: setosa tag1 s1 89.7
6: versicolor tag3 s2 129.3
7: versicolor tag3 s1 6.7
8: versicolor tag2 s2 2.5
9: virginica tag3 s2 108.7
10: virginica tag3 s3 37.5
11: virginica tag3 s1 2.5
将继续更新该包函数(frollapply、rleid、shift、key等)效率比较。