我们来解决这样一个问题:将学生的各科考试成绩组合为单一的成绩衡量指标,基于相对名次给出从A到F的评分,并根据学生的姓氏和名字按字母顺序对花名册进行排序。
# 数据输入
> options(digits=2) # 设置输出数值时显示的有效数字位数(注意:是有效数字,而非严格的小数点后位数)
> Student <- c("John Davis", "Angela Williams", "Bullwinkle Moose", "David Jones", "Janice Markhammer", "Cheryl Cushing", "Reuven Ytzrhak", "Greg Knox", "Joel England", "Mary Rayburn")
> Math <- c(502, 600, 412, 358, 495, 512, 410, 625, 573, 532)
> Science <- c(95, 99, 80, 82, 75, 85, 80, 95, 89, 86)
> English <- c(25, 22, 18, 15, 20, 28, 15, 30, 27, 18)
> roster <- data.frame(Student, Math, Science, English, stringsAsFactors = FALSE)
> roster
Student Math Science English
1 John Davis 502 95 25
2 Angela Williams 600 99 22
3 Bullwinkle Moose 412 80 18
4 David Jones 358 82 15
5 Janice Markhammer 495 75 20
6 Cheryl Cushing 512 85 28
7 Reuven Ytzrhak 410 80 15
8 Greg Knox 625 95 30
9 Joel England 573 89 27
10 Mary Rayburn 532 86 18
# 将各科成绩标准化(用单位标准差表示),用scale()函数实现
> z <- scale(roster[, 2:4])
> z
Math Science English
[1,] 0.0011 1.078 0.587
[2,] 1.1276 1.591 0.037
[3,] -1.0333 -0.847 -0.697
[4,] -1.6540 -0.590 -1.247
[5,] -0.0793 -1.489 -0.330
[6,] 0.1161 -0.205 1.137
[7,] -1.0563 -0.847 -1.247
[8,] 1.4149 1.078 1.504
[9,] 0.8172 0.308 0.954
[10,] 0.3460 -0.077 -0.697
attr(,"scaled:center")
Math Science English
502 87 22
attr(,"scaled:scale")
Math Science English
87.0 7.8 5.5
# 求各行的均值以获得综合得分
> score <- apply(z, 1, mean)
> roster <- cbind(roster, score)
> roster
Student Math Science English score
1 John Davis 502 95 25 0.56
2 Angela Williams 600 99 22 0.92
3 Bullwinkle Moose 412 80 18 -0.86
4 David Jones 358 82 15 -1.16
5 Janice Markhammer 495 75 20 -0.63
6 Cheryl Cushing 512 85 28 0.35
7 Reuven Ytzrhak 410 80 15 -1.05
8 Greg Knox 625 95 30 1.33
9 Joel England 573 89 27 0.69
10 Mary Rayburn 532 86 18 -0.14
# 用quantile()函数计算学生综合得分的百分位数
> y <- quantile(roster$score, c(.8,.6,.4,.2))
> y
80% 60% 40% 20%
0.74 0.43 -0.34 -0.90
# 以上一步得到的百分位数为分界点,将综合得分重编码为A到F的等级
> roster$grade[score >= y[1]] <- 'A'
> roster$grade[score < y[1] & score >= y[2]] <- 'B'
> roster$grade[score < y[2] & score >= y[3]] <- 'C'
> roster$grade[score < y[3] & score >= y[4]] <- 'D'
> roster$grade[score < y[4] ]<- 'F'
> roster
Student Math Science English score grade
1 John Davis 502 95 25 0.56 B
2 Angela Williams 600 99 22 0.92 A
3 Bullwinkle Moose 412 80 18 -0.86 D
4 David Jones 358 82 15 -1.16 F
5 Janice Markhammer 495 75 20 -0.63 D
6 Cheryl Cushing 512 85 28 0.35 C
7 Reuven Ytzrhak 410 80 15 -1.05 F
8 Greg Knox 625 95 30 1.33 A
9 Joel England 573 89 27 0.69 B
10 Mary Rayburn 532 86 18 -0.14 C
# 以空格为界将姓名拆分
> name <- strsplit((roster$Student), " ")
> name
[[1]]
[1] "John" "Davis"
[[2]]
[1] "Angela" "Williams"
[[3]]
[1] "Bullwinkle" "Moose"
[[4]]
[1] "David" "Jones"
[[5]]
[1] "Janice" "Markhammer"
[[6]]
[1] "Cheryl" "Cushing"
[[7]]
[1] "Reuven" "Ytzrhak"
[[8]]
[1] "Greg" "Knox"
[[9]]
[1] "Joel" "England"
[[10]]
[1] "Mary" "Rayburn"
# 分别存储名(Firstname)和姓(Lastname),并去掉不再需要的Student列。"["是一个用来提取对象中某一部分的函数,这里用它取出每个拆分结果的第1个和第2个元素
> Firstname <- sapply(name, "[", 1)
> Lastname <- sapply(name, "[", 2)
> roster <- cbind(Firstname, Lastname, roster[, -1])
> roster
Firstname Lastname Math Science English score grade
1 John Davis 502 95 25 0.56 B
2 Angela Williams 600 99 22 0.92 A
3 Bullwinkle Moose 412 80 18 -0.86 D
4 David Jones 358 82 15 -1.16 F
5 Janice Markhammer 495 75 20 -0.63 D
6 Cheryl Cushing 512 85 28 0.35 C
7 Reuven Ytzrhak 410 80 15 -1.05 F
8 Greg Knox 625 95 30 1.33 A
9 Joel England 573 89 27 0.69 B
10 Mary Rayburn 532 86 18 -0.14 C
# 用order()函数进行排序
> roster[order(Lastname, Firstname), ]
Firstname Lastname Math Science English score grade
6 Cheryl Cushing 512 85 28 0.35 C
1 John Davis 502 95 25 0.56 B
9 Joel England 573 89 27 0.69 B
4 David Jones 358 82 15 -1.16 F
8 Greg Knox 625 95 30 1.33 A
5 Janice Markhammer 495 75 20 -0.63 D
3 Bullwinkle Moose 412 80 18 -0.86 D
10 Mary Rayburn 532 86 18 -0.14 C
2 Angela Williams 600 99 22 0.92 A
7 Reuven Ytzrhak 410 80 15 -1.05 F
# 将一个函数应用到矩阵的所有行或所有列,用apply()函数实现
> mydata <- matrix(rnorm(30), nrow=6)
> mydata
[,1] [,2] [,3] [,4] [,5]
[1,] -0.06125496 0.4988079 -0.2929864 -0.4886596 -1.0619015
[2,] 0.25143001 0.3055695 0.2592611 1.7845242 1.0919724
[3,] 0.69396311 0.5974815 -0.7969848 -0.0540765 -0.9713497
[4,] 0.12294368 1.0399471 2.2633374 0.3299851 0.3274629
[5,] 1.36734800 -0.4483960 -0.5536991 -0.7941322 0.2633292
[6,] 0.13667905 1.7121611 0.7215101 2.0211705 1.8452035
> apply(mydata, 1, mean) # 1 means row
[1] -0.28119891 0.73855144 -0.10619327 0.81673523 -0.03311001
[6] 1.28734486
> apply(mydata, 2, mean) # 2 means col
[1] 0.4185181 0.6175952 0.2667397 0.4664686 0.2491194