数据的分类汇总是数据过滤与重组的重要内容:有时我们需要按某个属性对数据分组,并对每组的取值进行汇总计算。即使数据量非常大,R 也提供了快捷的计算方法。笔者在处理工业企业数据库的过程中遇到了一些问题,本文改用鸢尾花(iris)数据集作为示例,将实际处理数据时遇到的问题罗列下来,并给出相应的解决方案。主要问题如下:
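1. 按类别(Species)分组,计算合计值、最大值和平均值;
2. 按某一列去重,保留每个类别的第一条记录;
3. 按一列或多列对数据进行升序、降序排序;
4. 提取每个类别中某一变量取最大值的记录;
5. 将同一类别的取值合并到一行中。
解决方案如下: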
本文研究数据为iris数据集,如果想对数据集进行进一步了解,可参考:iris数据集介绍
#extract data
> da <- iris[c(1:10,51:60,101:110),]
> da
Sepal.Length Sepal.Width Petal.Length Petal.Width Species
1 5.1 3.5 1.4 0.2 setosa
2 4.9 3.0 1.4 0.2 setosa
3 4.7 3.2 1.3 0.2 setosa
4 4.6 3.1 1.5 0.2 setosa
5 5.0 3.6 1.4 0.2 setosa
6 5.4 3.9 1.7 0.4 setosa
7 4.6 3.4 1.4 0.3 setosa
8 5.0 3.4 1.5 0.2 setosa
9 4.4 2.9 1.4 0.2 setosa
10 4.9 3.1 1.5 0.1 setosa
51 7.0 3.2 4.7 1.4 versicolor
52 6.4 3.2 4.5 1.5 versicolor
53 6.9 3.1 4.9 1.5 versicolor
54 5.5 2.3 4.0 1.3 versicolor
55 6.5 2.8 4.6 1.5 versicolor
56 5.7 2.8 4.5 1.3 versicolor
57 6.3 3.3 4.7 1.6 versicolor
58 4.9 2.4 3.3 1.0 versicolor
59 6.6 2.9 4.6 1.3 versicolor
60 5.2 2.7 3.9 1.4 versicolor
101 6.3 3.3 6.0 2.5 virginica
102 5.8 2.7 5.1 1.9 virginica
103 7.1 3.0 5.9 2.1 virginica
104 6.3 2.9 5.6 1.8 virginica
105 6.5 3.0 5.8 2.2 virginica
106 7.6 3.0 6.6 2.1 virginica
107 4.9 2.5 4.5 1.7 virginica
108 7.3 2.9 6.3 1.8 virginica
109 6.7 2.5 5.8 1.8 virginica
110 7.2 3.6 6.1 2.5 virginica
#aggregate data
#Calculate the sum value
> data_sum <- aggregate(Sepal.Length~Species,data=da,sum)
> names(data_sum)[2] <- "Sepal.Length_sum"
> data_sum
Species Sepal.Length_sum
1 setosa 48.6
2 versicolor 61.0
3 virginica 65.7
#calculate the max value
> data_max <- aggregate(Sepal.Width~Species,data = da,max)
> names(data_max)[2] <- "Sepal.Width_max"
> data_max
Species Sepal.Width_max
1 setosa 3.9
2 versicolor 3.3
3 virginica 3.6
#Calculate the average value
> data_mean<- aggregate(Petal.Length~Species,data = da,mean)
> names(data_mean)[2] <- "Petal.Length_mean"
> data_mean
Species Petal.Length_mean
1 setosa 1.45
2 versicolor 4.37
3 virginica 5.77
> #preserve the first row for each unique value of "Species"
> Species_unique <- da[!duplicated(da$Species),]
> Species_unique
Sepal.Length Sepal.Width Petal.Length Petal.Width Species
1 5.1 3.5 1.4 0.2 setosa
51 7.0 3.2 4.7 1.4 versicolor
101 6.3 3.3 6.0 2.5 virginica
#order the rows by the value of Sepal.Length
> da[order(da$Sepal.Length,decreasing = FALSE),] #ascending
Sepal.Length Sepal.Width Petal.Length Petal.Width Species
9 4.4 2.9 1.4 0.2 setosa
4 4.6 3.1 1.5 0.2 setosa
7 4.6 3.4 1.4 0.3 setosa
3 4.7 3.2 1.3 0.2 setosa
2 4.9 3.0 1.4 0.2 setosa
10 4.9 3.1 1.5 0.1 setosa
58 4.9 2.4 3.3 1.0 versicolor
107 4.9 2.5 4.5 1.7 virginica
5 5.0 3.6 1.4 0.2 setosa
8 5.0 3.4 1.5 0.2 setosa
1 5.1 3.5 1.4 0.2 setosa
60 5.2 2.7 3.9 1.4 versicolor
6 5.4 3.9 1.7 0.4 setosa
54 5.5 2.3 4.0 1.3 versicolor
56 5.7 2.8 4.5 1.3 versicolor
102 5.8 2.7 5.1 1.9 virginica
57 6.3 3.3 4.7 1.6 versicolor
101 6.3 3.3 6.0 2.5 virginica
104 6.3 2.9 5.6 1.8 virginica
52 6.4 3.2 4.5 1.5 versicolor
55 6.5 2.8 4.6 1.5 versicolor
105 6.5 3.0 5.8 2.2 virginica
59 6.6 2.9 4.6 1.3 versicolor
109 6.7 2.5 5.8 1.8 virginica
53 6.9 3.1 4.9 1.5 versicolor
51 7.0 3.2 4.7 1.4 versicolor
103 7.1 3.0 5.9 2.1 virginica
110 7.2 3.6 6.1 2.5 virginica
108 7.3 2.9 6.3 1.8 virginica
106 7.6 3.0 6.6 2.1 virginica
> da[order(da$Sepal.Length,decreasing = TRUE),] #descending
Sepal.Length Sepal.Width Petal.Length Petal.Width Species
106 7.6 3.0 6.6 2.1 virginica
108 7.3 2.9 6.3 1.8 virginica
110 7.2 3.6 6.1 2.5 virginica
103 7.1 3.0 5.9 2.1 virginica
51 7.0 3.2 4.7 1.4 versicolor
53 6.9 3.1 4.9 1.5 versicolor
109 6.7 2.5 5.8 1.8 virginica
59 6.6 2.9 4.6 1.3 versicolor
55 6.5 2.8 4.6 1.5 versicolor
105 6.5 3.0 5.8 2.2 virginica
52 6.4 3.2 4.5 1.5 versicolor
57 6.3 3.3 4.7 1.6 versicolor
101 6.3 3.3 6.0 2.5 virginica
104 6.3 2.9 5.6 1.8 virginica
102 5.8 2.7 5.1 1.9 virginica
56 5.7 2.8 4.5 1.3 versicolor
54 5.5 2.3 4.0 1.3 versicolor
6 5.4 3.9 1.7 0.4 setosa
60 5.2 2.7 3.9 1.4 versicolor
1 5.1 3.5 1.4 0.2 setosa
5 5.0 3.6 1.4 0.2 setosa
8 5.0 3.4 1.5 0.2 setosa
2 4.9 3.0 1.4 0.2 setosa
10 4.9 3.1 1.5 0.1 setosa
58 4.9 2.4 3.3 1.0 versicolor
107 4.9 2.5 4.5 1.7 virginica
3 4.7 3.2 1.3 0.2 setosa
4 4.6 3.1 1.5 0.2 setosa
7 4.6 3.4 1.4 0.3 setosa
9 4.4 2.9 1.4 0.2 setosa
> da[order(-da$Sepal.Length,decreasing = FALSE),] #descending (ascending order of the negated values)
Sepal.Length Sepal.Width Petal.Length Petal.Width Species
106 7.6 3.0 6.6 2.1 virginica
108 7.3 2.9 6.3 1.8 virginica
110 7.2 3.6 6.1 2.5 virginica
103 7.1 3.0 5.9 2.1 virginica
51 7.0 3.2 4.7 1.4 versicolor
53 6.9 3.1 4.9 1.5 versicolor
109 6.7 2.5 5.8 1.8 virginica
59 6.6 2.9 4.6 1.3 versicolor
55 6.5 2.8 4.6 1.5 versicolor
105 6.5 3.0 5.8 2.2 virginica
52 6.4 3.2 4.5 1.5 versicolor
57 6.3 3.3 4.7 1.6 versicolor
101 6.3 3.3 6.0 2.5 virginica
104 6.3 2.9 5.6 1.8 virginica
102 5.8 2.7 5.1 1.9 virginica
56 5.7 2.8 4.5 1.3 versicolor
54 5.5 2.3 4.0 1.3 versicolor
6 5.4 3.9 1.7 0.4 setosa
60 5.2 2.7 3.9 1.4 versicolor
1 5.1 3.5 1.4 0.2 setosa
5 5.0 3.6 1.4 0.2 setosa
8 5.0 3.4 1.5 0.2 setosa
2 4.9 3.0 1.4 0.2 setosa
10 4.9 3.1 1.5 0.1 setosa
58 4.9 2.4 3.3 1.0 versicolor
107 4.9 2.5 4.5 1.7 virginica
3 4.7 3.2 1.3 0.2 setosa
4 4.6 3.1 1.5 0.2 setosa
7 4.6 3.4 1.4 0.3 setosa
9 4.4 2.9 1.4 0.2 setosa
#order by two columns (Species and Sepal.Length; Species is the primary sort key)
> da[order(da$Species,da$Sepal.Length,decreasing = FALSE),]
Sepal.Length Sepal.Width Petal.Length Petal.Width Species
9 4.4 2.9 1.4 0.2 setosa
4 4.6 3.1 1.5 0.2 setosa
7 4.6 3.4 1.4 0.3 setosa
3 4.7 3.2 1.3 0.2 setosa
2 4.9 3.0 1.4 0.2 setosa
10 4.9 3.1 1.5 0.1 setosa
5 5.0 3.6 1.4 0.2 setosa
8 5.0 3.4 1.5 0.2 setosa
1 5.1 3.5 1.4 0.2 setosa
6 5.4 3.9 1.7 0.4 setosa
58 4.9 2.4 3.3 1.0 versicolor
60 5.2 2.7 3.9 1.4 versicolor
54 5.5 2.3 4.0 1.3 versicolor
56 5.7 2.8 4.5 1.3 versicolor
57 6.3 3.3 4.7 1.6 versicolor
52 6.4 3.2 4.5 1.5 versicolor
55 6.5 2.8 4.6 1.5 versicolor
59 6.6 2.9 4.6 1.3 versicolor
53 6.9 3.1 4.9 1.5 versicolor
51 7.0 3.2 4.7 1.4 versicolor
107 4.9 2.5 4.5 1.7 virginica
102 5.8 2.7 5.1 1.9 virginica
101 6.3 3.3 6.0 2.5 virginica
104 6.3 2.9 5.6 1.8 virginica
105 6.5 3.0 5.8 2.2 virginica
109 6.7 2.5 5.8 1.8 virginica
103 7.1 3.0 5.9 2.1 virginica
110 7.2 3.6 6.1 2.5 virginica
108 7.3 2.9 6.3 1.8 virginica
106 7.6 3.0 6.6 2.1 virginica
#find the row with the max "Sepal.Length" within each Species
> da_dec <- da[order(da$Species,-da$Sepal.Length,decreasing = FALSE),] #First, order the value
> da_uni <- da_dec[!duplicated(da_dec$Species),] #Second, extract the first row for each unique value of "Species"
> da_uni #results
Sepal.Length Sepal.Width Petal.Length Petal.Width Species
6 5.4 3.9 1.7 0.4 setosa
51 7.0 3.2 4.7 1.4 versicolor
106 7.6 3.0 6.6 2.1 virginica
#concatenate the Sepal.Length values of each Species into a single row
> library(dplyr)
> Species_summary <- da %>% group_by(Species) %>% summarise(su = paste(Sepal.Length,collapse = ","))
> Species_summary
# A tibble: 3 x 2
Species su
<fct> <chr>
1 setosa 5.1,4.9,4.7,4.6,5,5.4,4.6,5,4.4,4.9
2 versicolor 7,6.4,6.9,5.5,6.5,5.7,6.3,4.9,6.6,5.2
3 virginica 6.3,5.8,7.1,6.3,6.5,7.6,4.9,7.3,6.7,7.2