R语言data.table包高效处理/提升大数据效率@灵魂走风的江湖

目录

1、data.table包中特殊符号.SD(通过.SDcols选择的变量作处理)、.SDcols(变量列选择)、.I(返回位置)、.N(计数)、.BY、J、CJ、SJ、:=

2、分组计数(单一方法)比较:

3、多分组情况下比较 :

4、if_else和fifelse函数比较:

 5、row_number、with_groups与order、by综合比较: 

 6、字段拆分函数tstrsplit与str_split_fixed比较:

 7、fcase与case_when函数比较:

 8、cube、rollup、groupingsets(相比group_by函数更加灵活):


1、data.table包中特殊符号.SD(通过.SDcols选择的变量作处理)、.SDcols(变量列选择)、.I(返回位置)、.N(计数)、.BY、J、CJ、SJ、:=

2、分组计数(单一方法)比较:

# dplyr: count the rows for each value of `var` (group_by + tally).
sys_time_print(
  data_test %>%
    group_by(var) %>%
    tally()
)
#运行耗时:
[1] "Finished in 0.870s elapsed (0.670s cpu)"

# data.table: count the rows for each value of `var` with the special symbol .N.
sys_time_print(
  data_test[, .N, by = var]
)
#运行耗时:
[1] "Finished in 0.420s elapsed (0.560s cpu)"

3、多分组情况下比较 :

# data.table: add a date column by reference, then sum var3 for every
# combination of the four grouping columns.
sys_time_print(
  test <- data_test[, var1_tag := as_date(var1)][
    , .(var3_sum = sum(var3)),
    by = c("class1", "id", "var1_tag", "var2")
  ]
)
#运行耗时:
[1] "Finished in 2.500s elapsed (4.730s cpu)"


# Same aggregation written with .SD / .SDcols instead of naming var3 directly.
# The argument is spelled out in full as `.SDcols`; the previous `.SDcol` only
# worked through R's partial argument matching.
# NOTE(review): sum(.SD) sums a one-column table per group rather than a plain
# vector, which is presumably why this variant runs much slower - confirm.
sys_time_print(
  test <- data_test[, var1_tag := as_date(var1)][
    , .(var3_sum = sum(.SD)),
    by = "class1,id,var1_tag,var2",
    .SDcols = "var3"
  ]
)
#运行耗时:耗时较长


# dplyr equivalent: add the date column, group on the four keys, sum var3.
sys_time_print(
  test <- data_test %>%
    mutate(var1_tag = as_date(var1)) %>%
    group_by(class1, id, var1_tag, var2) %>%
    summarise(var3_sum = sum(var3))
)
#运行耗时:
[1] "Finished in 10.0s elapsed (9.690s cpu)"

4、if_else和fifelse函数比较:

fifelse()函数,可对照dplyr包if_else、软件内置ifelse函数计算效率
# dplyr::if_else
# Label every row by testing several rank columns in turn (200k+ rows).
data_test %>%
  mutate(
    final_tag = if_else(
      s1_rank == 1, "tag1",
      if_else(
        s2_rank == 1, "tag2",
        if_else(
          s3_rank == 1, "tag3",
          if_else(s4_rank == 0, "no_tag", "tag4")
        )
      )
    )
  )
# data.table::fifelse
# Same labelling logic as the if_else version. The function name is `fifelse`;
# the earlier spelling `fiflese` does not exist and fails at run time.
data_test %>%
  mutate(
    final_tag = fifelse(
      s1_rank == 1, "tag1",
      fifelse(
        s2_rank == 1, "tag2",
        fifelse(
          s3_rank == 1, "tag3",
          fifelse(s4_rank == 0, "no_tag", "tag4")
        )
      )
    )
  )
if_else运行效果 fifelse运行效果

 5、row_number、with_groups与order、by综合比较: 

dplyr包 row_number、with_groups与data.table中排序(order、frank)、分组函数比较:
# Rank the rows inside each class by var_order and keep the top-ranked row of
# every group, retaining all columns (close to 20 million rows).
# frank(ties.method = "first") produces one rank per row, like dplyr's
# row_number(). order() returns a sorting permutation rather than ranks, so
# comparing its result with 1 does not select the smallest var_order.
# NOTE(review): the timing quoted below was measured with the order() call.
data_test[tag1 == "tag_attr"][
  , tag_rank := frank(var_order, ties.method = "first", na.last = TRUE),
  by = "class"
][tag_rank == 1]

#运行耗时
[1] "Finished in 8.520s elapsed (8.390s cpu)"

# Alternative: rank within each class, sort by that rank, then take the first
# row of every class through .SD[1L].
# frank() is used because order() yields a permutation, not ranks, and the rows
# must be sorted before .SD[1L] for "first row" to mean "rank 1".
# NOTE(review): this line previously grouped the final step by "role_id";
# assumed to be the same key as `class` used for the ranking - confirm.
# NOTE(review): the timing quoted below was measured with the previous form.
data_test[tag1 == "tag_attr"][
  , tag_rank := frank(var_order, ties.method = "first", na.last = TRUE),
  by = "class"
][order(tag_rank), .SD[1L], by = "class"]
#运行耗时
[1] "Finished in 8.120s elapsed (8.140s cpu)"

data.table中排序函数frank函数在上面的计算规则下,相对较慢
# dplyr: row_number() inside with_groups(), then keep the rank-1 row per class.
data_test %>%
  filter(tag1 == "tag_attr") %>%
  with_groups(
    class,
    mutate,
    tag_rank = row_number(var_order)
  ) %>%
  filter(tag_rank == 1)

#运行耗时
[1] "Finished in 9.920s elapsed (9.300s cpu)"

 6、字段拆分函数tstrsplit与str_split_fixed比较:

数据量1000w+
# stringr: split split_var into at most two pieces on a space and keep the
# first piece.
data_test %>%
  mutate(
    split_result = str_split_fixed(split_var, " ", 2)[, 1]
  )
#运行耗时
[1] "Finished in 27.3s elapsed (26.9s cpu)"

# data.table: split split_var on a literal space and keep only the first piece.
# tstrsplit() already returns a list with one element per kept piece, so its
# result is assigned directly. Wrapping it in lapply(.SD, ...) nests that list
# one level deeper, which makes split_result a list column instead of a
# character column.
# NOTE(review): the timing quoted below was measured with the lapply(.SD) form.
data_test[
  , split_result := tstrsplit(split_var, " ", fixed = TRUE, keep = 1L)
]
#运行耗时
[1] "Finished in 23.4s elapsed (23.1s cpu)"

 7、fcase与case_when函数比较:

# dplyr::case_when: bucket var1 into labelled unit-wide intervals.
data_test %>%
  mutate(
    var_tag = case_when(
      var1 <= 0 ~ "first",
      var1 > 0 & var1 <= 1 ~ "second",
      var1 > 1 & var1 <= 2 ~ "third",
      var1 > 2 & var1 <= 3 ~ "fourth",
      var1 > 3 & var1 <= 4 ~ "fifth",
      var1 > 4 & var1 <= 5 ~ "sixth",
      var1 > 5 & var1 <= 6 ~ "seventh",
      var1 > 6 & var1 <= 7 ~ "eighth",
      var1 > 7 ~ "other"
    )
  )
#运行耗时
[1] "Finished in 00:01:01 elapsed (00:01:00 cpu)"


# data.table::fcase evaluated on .SD, which .SDcols restricts to one column.
# The statement previously ended with an unmatched ")" after the closing "]",
# which is a syntax error; it is removed here.
data_test[
  , var_tag := data.table::fcase(
    .SD <= 0, "first",
    .SD > 0 & .SD <= 1, "second",
    .SD > 1 & .SD <= 2, "third",
    .SD > 2 & .SD <= 3, "fourth",
    .SD > 3 & .SD <= 4, "fifth",
    .SD > 4 & .SD <= 5, "sixth",
    .SD > 5 & .SD <= 6, "seventh",
    .SD > 6 & .SD <= 7, "eighth",
    .SD > 7, "other"
  ),
  .SDcols = "var"
]
#运行耗时
[1] "Finished in 56.2s elapsed (55.4s cpu)"


# data.table::fcase applied to the selected column through lapply(.SD, ...).
# Values matching none of the listed conditions receive the default label.
data_test[
  , var_tag := lapply(.SD, function(x) {
    data.table::fcase(
      x <= 0, "first",
      x > 0 & x <= 1, "second",
      x > 1 & x <= 2, "third",
      x > 2 & x <= 3, "fourth",
      x > 3 & x <= 4, "fifth",
      x > 4 & x <= 5, "sixth",
      x > 5 & x <= 6, "seventh",
      x > 6 & x <= 7, "eighth",
      default = "other"
    )
  }),
  .SDcols = "var"
]
#运行耗时
[1] "Finished in 51.7s elapsed (50.7s cpu)"

 8、cube、rollup、groupingsets(相比group_by函数更加灵活):

多维度数据分组集成函数(可类比SQL中grouping sets用法)

# Add two categorical helper columns to iris for the grouping experiments.
# sepal_sec uses the cut points 5 and 7. With the previous cut points 2 and 4
# every iris row (Sepal.Length is always above 4) fell into "s3", which
# contradicts the s1/s2/s3 breakdown shown in the printed results.
iris_trans <- iris %>%
  mutate(
    petal_sec = fifelse(
      Petal.Length <= 2, "tag1",
      fifelse(Petal.Length > 2 & Petal.Length <= 3, "tag2", "tag3")
    ),
    sepal_sec = fifelse(
      Sepal.Length <= 5, "s1",
      fifelse(Sepal.Length > 5 & Sepal.Length <= 7, "s2", "s3")
    )
  )
#
# rollup(): aggregate with progressively fewer grouping columns (the author
# describes it as a stepwise-decreasing grouping).
rollup(
  as.data.table(iris_trans),
  j = .(sw_value = sum(Sepal.Width)),
  by = c("Species", "petal_sec", "sepal_sec")
)

#输出:
       Species petal_sec sepal_sec sw_value
        <fctr>    <char>    <char>    <num>
 1:     setosa      tag1        s2     81.7
 2:     setosa      tag1        s1     89.7
 3: versicolor      tag3        s2    129.3
 4: versicolor      tag3        s1      6.7
 5: versicolor      tag2        s2      2.5
 6:  virginica      tag3        s2    108.7
 7:  virginica      tag3        s3     37.5
 8:  virginica      tag3        s1      2.5
 9:     setosa      tag1      <NA>    171.4
10: versicolor      tag3      <NA>    136.0
11: versicolor      tag2      <NA>      2.5
12:  virginica      tag3      <NA>    148.7
13:     setosa      <NA>      <NA>    171.4
14: versicolor      <NA>      <NA>    138.5
15:  virginica      <NA>      <NA>    148.7
16:       <NA>      <NA>      <NA>    458.6



#
# cube(): aggregate over the combinations of the grouping columns (the author
# describes it as a combinatorial grouping).
cube(
  as.data.table(iris_trans),
  j = .(sw_value = sum(Sepal.Width)),
  by = c("Species", "petal_sec", "sepal_sec")
)


       Species petal_sec sepal_sec sw_value
        <fctr>    <char>    <char>    <num>
 1:     setosa      tag1        s2     81.7
 2:     setosa      tag1        s1     89.7
 3: versicolor      tag3        s2    129.3
 4: versicolor      tag3        s1      6.7
 5: versicolor      tag2        s2      2.5
 6:  virginica      tag3        s2    108.7
 7:  virginica      tag3        s3     37.5
 8:  virginica      tag3        s1      2.5
 9:     setosa      tag1      <NA>    171.4
10: versicolor      tag3      <NA>    136.0
11: versicolor      tag2      <NA>      2.5
12:  virginica      tag3      <NA>    148.7
13:     setosa      <NA>        s2     81.7
14:     setosa      <NA>        s1     89.7
15: versicolor      <NA>        s2    131.8
16: versicolor      <NA>        s1      6.7
17:  virginica      <NA>        s2    108.7
18:  virginica      <NA>        s3     37.5
19:  virginica      <NA>        s1      2.5
20:     setosa      <NA>      <NA>    171.4
21: versicolor      <NA>      <NA>    138.5
22:  virginica      <NA>      <NA>    148.7
23:       <NA>      tag1        s2     81.7
24:       <NA>      tag1        s1     89.7
25:       <NA>      tag3        s2    238.0
26:       <NA>      tag3        s1      9.2
27:       <NA>      tag2        s2      2.5
28:       <NA>      tag3        s3     37.5
29:       <NA>      tag1      <NA>    171.4
30:       <NA>      tag3      <NA>    284.7
31:       <NA>      tag2      <NA>      2.5
32:       <NA>      <NA>        s2    322.2
33:       <NA>      <NA>        s1     98.9
34:       <NA>      <NA>        s3     37.5
35:       <NA>      <NA>      <NA>    458.6

#
# groupingsets(): aggregate only for the explicitly listed sets of grouping
# columns - here Species alone and the full three-column combination.
groupingsets(
  as.data.table(iris_trans),
  j = .(sw_value = sum(Sepal.Width)),
  by = c("Species", "petal_sec", "sepal_sec"),
  sets = list(
    "Species",
    c("Species", "petal_sec", "sepal_sec")
  )
)

       Species petal_sec sepal_sec sw_value
        <fctr>    <char>    <char>    <num>
 1:     setosa      <NA>      <NA>    171.4
 2: versicolor      <NA>      <NA>    138.5
 3:  virginica      <NA>      <NA>    148.7
 4:     setosa      tag1        s2     81.7
 5:     setosa      tag1        s1     89.7
 6: versicolor      tag3        s2    129.3
 7: versicolor      tag3        s1      6.7
 8: versicolor      tag2        s2      2.5
 9:  virginica      tag3        s2    108.7
10:  virginica      tag3        s3     37.5
11:  virginica      tag3        s1      2.5

将继续更新该包函数(frollapply、rleid、shift、key等)效率比较。

你可能感兴趣的:(R语言,实用随笔,R语言高效处理数据,大数据,R语言,data.table包)