使用自带的summary()函数
> myvars <- c("mpg","hp","wt")
> summary(mtcars[myvars])
mpg hp wt
Min. :10.40 Min. : 52.0 Min. :1.513
1st Qu.:15.43 1st Qu.: 96.5 1st Qu.:2.581
Median :19.20 Median :123.0 Median :3.325
Mean :20.09 Mean :146.7 Mean :3.217
3rd Qu.:22.80 3rd Qu.:180.0 3rd Qu.:3.610
Max. :33.90 Max. :335.0 Max. :5.424
使用apply()或sapply()函数计算所选择的任意描述性统计量,函数形式为:
sapply(x, FUN, options)
这里插入的典型函数有mean()、sd()、var()、min()、max()、median()、length()、range()和quantile()。函数fivenum()可返回图基五数。
基础安装并没有封装偏度和峰度的计算函数,这里手撸一个。
> mystats <- function(x, na.omit = FALSE){
+ if (na.omit)
+ x <- x[!is.na(x)]
+ m <- mean(x)
+ n <- length(x)
+ s <- sd(x)
+ skew <- sum((x-m)^3/s^3)/n
+ kurt <- sum((x-m)^4/s^4)/n - 3
+ return(c(n=n, mean=m, stdev=s, skew=skew,kurtosis=kurt))
+ }
>
> myvars <- c("mpg","hp","wt")
> sapply(mtcars[myvars], mystats)
mpg hp wt
n 32.000000 32.0000000 32.00000000
mean 20.090625 146.6875000 3.21725000
stdev 6.026948 68.5628685 0.97845744
skew 0.610655 0.7260237 0.42314646
kurtosis -0.372766 -0.1355511 -0.02271075
比较全面的有pastecs包,其中的describe()函数:
> library(psych)
>
> myvars <- c("mpg","hp","wt")
>
> describe(mtcars[myvars])
vars n mean sd median trimmed mad min max range skew kurtosis
mpg 1 32 20.09 6.03 19.20 19.70 5.41 10.40 33.90 23.50 0.61 -0.37
hp 2 32 146.69 68.56 123.00 141.19 77.10 52.00 335.00 283.00 0.73 -0.14
wt 3 32 3.22 0.98 3.33 3.15 0.77 1.51 5.42 3.91 0.42 -0.02
se
mpg 1.07
hp 12.12
wt 0.17
分组计算描述性统计量
可以使用aggregate()函数来分组获取描述性统计量。
> myvars <- c("mpg","hp","wt")
>
> aggregate(mtcars[myvars], by=list(am=mtcars$am), mean)
> #使用by=list(am=mtcars$am)可以使新列被标注为am而不是Group.1
am mpg hp wt
1 0 17.14737 160.2632 3.768895
2 1 24.39231 126.8462 2.411000
但是aggregate()函数无法一次返回若干个统计量。一种解决的办法是使用by()函数。
by(data, INDICES, FUN)
> myvars <- c("mpg","hp","wt")
>
> mystats <- function(x, na.omit = FALSE){
+ if (na.omit)
+ x <- x[!is.na(x)]
+ m <- mean(x)
+ n <- length(x)
+ s <- sd(x)
+ skew <- sum((x-m)^3/s^3)/n
+ kurt <- sum((x-m)^4/s^4)/n - 3
+ return(c(n=n, mean=m, stdev=s, skew=skew,kurtosis=kurt))
+ }
>
> dstats <- function(x)sapply(x, mystats)
>
> by(mtcars[myvars], mtcars$am, dstats)
mtcars$am: 0
mpg hp wt
n 19.00000000 19.00000000 19.0000000
mean 17.14736842 160.26315789 3.7688947
stdev 3.83396639 53.90819573 0.7774001
skew 0.01395038 -0.01422519 0.9759294
kurtosis -0.80317826 -1.20969733 0.1415676
---------------------------------------------------------------
mtcars$am: 1
mpg hp wt
n 13.00000000 13.0000000 13.0000000
mean 24.39230769 126.8461538 2.4110000
stdev 6.16650381 84.0623243 0.6169816
skew 0.05256118 1.3598859 0.2103128
kurtosis -1.45535200 0.5634635 -1.1737358
也可以利用doBy()包的summaryBy()函数分组计算概述统计量。
> library(doBy)
>
> myvars <- c("mpg","hp","wt")
>
> mystats <- function(x, na.omit = FALSE){
+ if (na.omit)
+ x <- x[!is.na(x)]
+ m <- mean(x)
+ n <- length(x)
+ s <- sd(x)
+ skew <- sum((x-m)^3/s^3)/n
+ kurt <- sum((x-m)^4/s^4)/n - 3
+ return(c(n=n, mean=m, stdev=s, skew=skew,kurtosis=kurt))
+ }
>
> summary(summaryBy(mpg+hp+wt~am, data=mtcars, FUN=mystats))
am mpg.n mpg.mean mpg.stdev mpg.skew mpg.kurtosis
Min. :0.00 Min. :13.0 Min. :17.15 Min. :3.834 Min. :0.01395 Min. :-1.4554
1st Qu.:0.25 1st Qu.:14.5 1st Qu.:18.96 1st Qu.:4.417 1st Qu.:0.02360 1st Qu.:-1.2923
Median :0.50 Median :16.0 Median :20.77 Median :5.000 Median :0.03326 Median :-1.1293
Mean :0.50 Mean :16.0 Mean :20.77 Mean :5.000 Mean :0.03326 Mean :-1.1293
3rd Qu.:0.75 3rd Qu.:17.5 3rd Qu.:22.58 3rd Qu.:5.583 3rd Qu.:0.04291 3rd Qu.:-0.9662
Max. :1.00 Max. :19.0 Max. :24.39 Max. :6.167 Max. :0.05256 Max. :-0.8032
hp.n hp.mean hp.stdev hp.skew hp.kurtosis
Min. :13.0 Min. :126.8 Min. :53.91 Min. :-0.01423 Min. :-1.2097
1st Qu.:14.5 1st Qu.:135.2 1st Qu.:61.45 1st Qu.: 0.32930 1st Qu.:-0.7664
Median :16.0 Median :143.6 Median :68.99 Median : 0.67283 Median :-0.3231
Mean :16.0 Mean :143.6 Mean :68.99 Mean : 0.67283 Mean :-0.3231
3rd Qu.:17.5 3rd Qu.:151.9 3rd Qu.:76.52 3rd Qu.: 1.01636 3rd Qu.: 0.1202
Max. :19.0 Max. :160.3 Max. :84.06 Max. : 1.35989 Max. : 0.5635
wt.n wt.mean wt.stdev wt.skew wt.kurtosis
Min. :13.0 Min. :2.411 Min. :0.6170 Min. :0.2103 Min. :-1.1737
1st Qu.:14.5 1st Qu.:2.750 1st Qu.:0.6571 1st Qu.:0.4017 1st Qu.:-0.8449
Median :16.0 Median :3.090 Median :0.6972 Median :0.5931 Median :-0.5161
Mean :16.0 Mean :3.090 Mean :0.6972 Mean :0.5931 Mean :-0.5161
3rd Qu.:17.5 3rd Qu.:3.429 3rd Qu.:0.7373 3rd Qu.:0.7845 3rd Qu.:-0.1873
Max. :19.0 Max. :3.769 Max. :0.7774 Max. :0.9759 Max. : 0.1416
library(vcd)
head(Arthritis)
#一维列联表
mytable <- with(Arthritis,table(Improved))
mytable
#二维列联表
mytable <- xtabs(~ Treatment+Improved, data = Arthritis)
mytable
#使用margin.table()和prop.table()函数分别生成边际频数和比例
margin.table(mytable,1) #下标1指代table()语句中的第一个变量
prop.table(mytable,1)
margin.table(mytable,2)
prop.table(mytable,2)
#多维列联表
mytable <- xtabs(~ Treatment+Sex+Improved, data = Arthritis)
mytable