数据框可以理解为一个更一般化的矩阵(二维表),不同的列可以包含不同模式(数值型、字符型等)的数据
如以下数据集(包括数值型、字符型、日期型等多种数据类型)可用数据框来创建
创建上述数据框
> patientID <- c(1,2,3,4)
> age <- c(25, 34, 28, 52)
> diabetes <- c("Type1", "Type2", "Type3", "Type2")
> status <- c("poor", "Improved", "Excellent", "poor")
> myPatientsData <- data.frame(patientID, age, diabetes, status)
> myPatientsData
patientID age diabetes status
1 1 25 Type1 poor
2 2 34 Type2 Improved
3 3 28 Type3 Excellent
4 4 52 Type2 poor
获取数据框中的元素(可按列下标、列名或 $ 记号选取)
> myPatientsData[1:2]
patientID age
1 1 25
2 2 34
3 3 28
4 4 52
> myPatientsData[c("diabetes", "status")]
diabetes status
1 Type1 poor
2 Type2 Improved
3 Type3 Excellent
4 Type2 poor
> myPatientsData$age
[1] 25 34 28 52
> myPatientsData$score <- c(98,99,90,95)
> myPatientsData
patientID age diabetes status score
1 1 25 Type1 poor 98
2 2 34 Type2 Improved 99
3 3 28 Type3 Excellent 90
4 4 52 Type2 poor 95
> myPatientsData[which(myPatientsData$score == 99),]
patientID age diabetes status score
2 2 34 Type2 Improved 99
> myPatientsData[which(myPatientsData$status == "poor"),]
patientID age diabetes status score
1 1 25 Type1 poor 98
4 4 52 Type2 poor 95
lapply():对向量或列表的每个元素应用函数,返回一个列表
> lapply(myPatientsData$age,sum)
sapply():与 lapply() 类似,但会尽量把结果简化为向量或矩阵
> sapply(myPatientsData$score,sum)
因子:名义型变量(类别变量)和有序型变量在R中统称为因子
> a<-factor(c('A','B','C','C','A'))
> a
[1] A B C C A
Levels: A B C
> b<-factor(c('A','B','C','D','C','A'),levels = c('A','B','C','D'),labels = c('A罩杯','B罩杯','C罩杯','D罩杯'))
> b
[1] A罩杯 B罩杯 C罩杯 D罩杯 C罩杯 A罩杯
Levels: A罩杯 B罩杯 C罩杯 D罩杯
> colour<-c('G','G','R','R','Y','G','G','Y','G','R','G')
> col<- factor(colour)
> col
[1] G G R R Y G G Y G R G
Levels: G R Y
> col1<-factor(colour,levels = c('G','R','Y'),labels = c('Green','Red','Yellow'))
> col1
[1] Green Green Red Red Yellow Green Green Yellow Green Red Green
Levels: Green Red Yellow
> col2<-factor(colour,levels = c('G','R','Y'),labels = c('1','2','3'))
> col2
[1] 1 1 2 2 3 1 1 3 1 2 1
Levels: 1 2 3
> class(col2)
[1] "factor"
> typeof(col2)
[1] "integer"
> mode(col2)
[1] "numeric"
as.vector(col2)
#ordered创建一个有序的因子
> score<-c('A','B','A','C','B')
> score1<-ordered(score)
> score1
[1] A B A C B
Levels: A < B < C
> score1<-ordered(score,levels=c('C','B','A'))
> score1
[1] A B A C B
Levels: C < B < A
#cut():把连续型变量按区间切分成因子(默认得到无序因子;指定 ordered_result = TRUE 才得到有序因子)
> exam<-c(98, 97, 52, 88, 85, 75, 97, 92, 77, 74, 70, 63, 97, 71, 98, 65, 79, 74, 58, 59, 60, 63, 87, 82, 95, 75, 79, 96, 50, 88)
> exam1<-cut(exam,breaks = 3)#分成三组
> exam1
[1] (82,98] (82,98] (50,66] (82,98] (82,98] (66,82] (82,98] (82,98] (66,82]
[10] (66,82] (66,82] (50,66] (82,98] (66,82] (82,98] (50,66] (66,82] (66,82]
[19] (50,66] (50,66] (50,66] (50,66] (82,98] (66,82] (82,98] (66,82] (66,82]
[28] (82,98] (50,66] (82,98]
Levels: (50,66] (66,82] (82,98]
#区间步长
(max(exam)-min(exam))/3
#常用函数tapply()
gender<-c('f','m','m','m','f')
age<-c(12,35,32,34,25)
#tapply(vector, index, function)
tapply(age, gender, mean)
列表:列表是一些对象(成分)的有序集合,可以包含向量、矩阵、数据框,甚至其他列表。
> g <- "My first list"
> h <- c(12, 45, 43, 90)
> j <- matrix(1:10, nrow=2)
> k <- c("one", "two", "three")
> mylist <- list(g,h,j,k)
> mylist
[[1]]
[1] "My first list"
[[2]]
[1] 12 45 43 90
[[3]]
[,1] [,2] [,3] [,4] [,5]
[1,] 1 3 5 7 9
[2,] 2 4 6 8 10
[[4]]
[1] "one" "two" "three"
> mylist[[2]]
[1] 12 45 43 90
> g <- "My first list"
> h <- c(12, 45, 43, 90)
> j <- matrix(1:10, nrow=2)
> k <- c("one", "two", "three")
> mylist <- list(a=g, b=h, c=j, d=k)
> mylist$d
[1] "one" "two" "three"