R包nutshell中2006年的出生数据,数据集名为births2006.smpl
> #CHAPTER 2: PROCESSING THE INFORMATION AND GETTING TO KNOW YOUR DATA
> #Example 1: 2006 Birth Data
> ## Install packages from CRAN; use any USA mirror
> library(lattice)
> library(nutshell)
> data(births2006.smpl) #每个安装包都会包含一定的数据.data(),将数据导入到工作环境中
> births2006.smpl[1:5,]
DOB_MM DOB_WK MAGER TBO_REC WTGAIN SEX APGAR5 DMEDUC UPREVIS ESTGEST
591430 9 1 25 2 NA F NA NULL 10 99
1827276 2 6 28 2 26 M 9 2 years of college 10 37
1705673 2 2 18 2 25 F 9 NULL 14 38
3368269 10 5 21 2 6 M 9 NULL 22 38
2990253 7 7 25 1 36 M 10 2 years of high school 15 40
DMETH_REC DPLURAL DBWT
591430 Vaginal 1 Single 3800
1827276 Vaginal 1 Single 3625
1705673 Vaginal 1 Single 3650
3368269 Vaginal 1 Single 3045
2990253 Vaginal 1 Single 3827
> dim(births2006.smpl)
[1] 427323 13
> births.dow <- table(births2006.smpl$DOB_WK)
> births.dow
1 2 3 4 5 6 7
40274 62757 69775 70290 70164 68380 45683
> barchart(births.dow,ylab='Day of Week',col='black')
解说:
*休息日分娩的人数要远少于工作日分娩的人数,据了解,剖腹产(C-section)一般是在工作日进行
> unique(births2006.smpl$DMETH_REC)
[1] Vaginal C-section Unknown
Levels: C-section Unknown Vaginal
> dob.dm.tbl <- table(WK = births2006.smpl$DOB_WK,
+ MM = births2006.smpl$DMETH_REC)
> dob.dm.tbl
MM
WK C-section Unknown Vaginal
1 8836 90 31348
2 20454 272 42031
3 22921 247 46607
4 23103 252 46935
5 22825 258 47081
6 23233 289 44858
7 10696 109 34878
> dob.dm.tbl <- dob.dm.tbl[,-2]
> dob.dm.tbl
MM
WK C-section Vaginal
1 8836 31348
2 20454 42031
3 22921 46607
4 23103 46935
5 22825 47081
6 23233 44858
7 10696 34878
> trellis.device() #这句是打开一个画图界面,如果题头显示(active)表示激活状态,画图都会在该界面呈现,否则就在Rstudio的plots界面中呈现。
> barchart(dob.dm.tbl,ylab = 'Day of Week') #图一
> barchart(dob.dm.tbl,horizontal = FALSE, groups = FALSE,
+ xlab = 'Day of Week',col = 'black') #图二
解说:
> births.dow <- table(births2006.smpl$DOB_WK)
> births.dow
1 2 3 4 5 6 7
40274 62757 69775 70290 70164 68380 45683
> barchart(births.dow,ylab='Day of Week',col='black')
> dob.dm.tbl <- table(WK = births2006.smpl$DOB_WK,
+ MM = births2006.smpl$DMETH_REC)
> dob.dm.tbl
MM
WK C-section Unknown Vaginal
1 8836 90 31348
2 20454 272 42031
3 22921 247 46607
4 23103 252 46935
5 22825 258 47081
6 23233 289 44858
7 10696 109 34878
> dob.dm.tbl <- dob.dm.tbl[,-2]
> dob.dm.tbl
> barchart(dob.dm.tbl,ylab = 'Day of Week') #图一,堆砌在一起
> barchart(dob.dm.tbl,horizontal = FALSE, groups = FALSE,
+ xlab = 'Day of Week',col = 'black') #图二,分别画
> histogram(~DBWT|DPLURAL,data = births2006.smpl,layout = c(1,5),
+ col = 'black') #layout = c(1,5)表示1列5行
> densityplot(~DBWT|DPLURAL,data = births2006.smpl,layout = c(1,5),
+ plot.points = FALSE,col = 'black')
> densityplot(~DBWT,groups = DPLURAL,data = births2006.smpl,
+ plot.points = FALSE)
dotplot(~数值型变量| 分类型变量,data = ,layout =) #1
⇒ “|”表示条件:一旦出现,就表示对数据按照某个分类型变量进行分组
xyplot(数值型变量~分类型变量,data= ,col="black") #2
⇒ 相当于y~分类型变量,并没有进行分组
xyplot(数值型变量~分类型变量2|分类型变量1, data = ,layout =) #3
⇒ 在分类型变量1的前提下,进行y~分类型变量2
xyplot(数值型变量~数值型变量,data= ,col="black") #4
⇒ 相当于y~x散点图,并没有进行分组
xyplot(数值型变量~数值型变量|分类型变量1, data = ,layout =) #5
⇒ 在分类型变量1的前提下,进行y~x散点图
smoothScatter(x,y) #6
⇒ 平滑散点图
> dotplot(~DBWT|DPLURAL,data=births2006.smpl,layout=c(1,5),plot.points=FALSE,col="black") #1
> xyplot(DBWT~DOB_WK,data=births2006.smpl,col="black") #2
> xyplot(DBWT~DOB_WK|DPLURAL,data=births2006.smpl,layout=c(1,5),col="black") #3
> xyplot(DBWT~WTGAIN,data=births2006.smpl,col="black") #4
> xyplot(DBWT~WTGAIN|DPLURAL,data=births2006.smpl,layout=c(1,5),col="black") #5
> smoothScatter(births2006.smpl$WTGAIN,births2006.smpl$DBWT) #6
常规不用加载包,boxplot函数
boxplot(数值型变量~数值型变量, data= )
boxplot(数值型变量~分类型变量, data= )
⇒ 不同之处在于横轴是分类型还是数值型
在lattice包下,bwplot()函数
bwplot(数值型变量~factor(分类型变量)|factor(分类型变量),data = )
bwplot(数值型变量~factor(分类型变量), data = )
⇒ 要对分类型变量进行factor;可以进行条件前提设置 → "|"
> boxplot(DBWT~APGAR5,data=births2006.smpl,ylab="DBWT",xlab="AGPAR5")
> boxplot(DBWT~DOB_WK,data=births2006.smpl,ylab="DBWT",xlab="Day of Week")
> bwplot(DBWT~factor(APGAR5)|factor(SEX),data=births2006.smpl,xlab="AGPAR5")
> bwplot(DBWT~factor(DOB_WK),data=births2006.smpl,xlab="Day of Week")
tapply(vector, index, function),
⇒ vector 可以是数值向量,index 要是因子/分类型变量,function 是对 vector 进行的函数操作,可以是 mean、max、min 等
> fac=factor(births2006.smpl$DPLURAL)
> res=births2006.smpl$DBWT
> t4=tapply(res,fac,mean,na.rm=TRUE)
> t4
1 Single 2 Twin
3298.263 2327.478
3 Triplet 4 Quadruplet
1677.017 1196.105
5 Quintuplet or higher
1142.800
> t5=tapply(births2006.smpl$DBWT,INDEX=list(births2006.smpl$DPLURAL,births2006.smpl$SEX),FUN=mean,na.rm=TRUE)
> t5
F M
1 Single 3242.302 3351.637
2 Twin 2279.508 2373.819
3 Triplet 1697.822 1655.348
4 Quadruplet 1319.556 1085.000
5 Quintuplet or higher 1007.667 1345.500
> barplot(t4,ylab="DBWT")
> barplot(t5,beside=TRUE,ylab="DBWT")
> t5=table(births2006.smpl$ESTGEST)
> t5
12 15 17 18 19 20 21 22 23
1 2 18 43 69 116 162 209 288
24 25 26 27 28 29 30 31 32
401 445 461 566 670 703 1000 1243 1975
33 34 35 36 37 38 39 40 41
2652 4840 7954 15874 33310 76794 109046 84890 23794
42 43 44 45 46 47 48 51 99
1931 133 32 6 5 5 2 1 57682
> new=births2006.smpl[births2006.smpl$ESTGEST != 99,] #new去掉怀孕周数未知为99的数据集
> t51=table(new$ESTGEST)
> t51
12 15 17 18 19 20 21 22 23
1 2 18 43 69 116 162 209 288
24 25 26 27 28 29 30 31 32
401 445 461 566 670 703 1000 1243 1975
33 34 35 36 37 38 39 40 41
2652 4840 7954 15874 33310 76794 109046 84890 23794
42 43 44 45 46 47 48 51
1931 133 32 6 5 5 2 1
> t6=tapply(new$DBWT,INDEX=list(cut(new$WTGAIN,breaks=10),cut(new$ESTGEST,breaks=10)),FUN=mean,na.rm=TRUE)
> t6
(12,15.9] (15.9,19.8] (19.8,23.7] (23.7,27.6]
(-0.098,9.8] 227 321.3125 486.7534 799.5614
(9.8,19.6] 2649 592.8235 546.7738 813.4179
(19.6,29.4] NA 585.8889 590.1368 882.4800
(29.4,39.2] 2977 1891.0000 731.5957 866.0294
(39.2,49] NA 2485.2500 803.8667 955.7639
(49,58.8] NA NA 434.7500 950.8039
(58.8,68.6] NA NA 352.0000 1285.6250
(68.6,78.4] NA NA NA 805.5714
(78.4,88.2] NA NA NA 1110.0000
(88.2,98.1] NA NA NA 768.0000
(27.6,31.5] (31.5,35.4] (35.4,39.3] (39.3,43.2]
(-0.098,9.8] 1398.234 2275.316 3166.748 3443.652
(9.8,19.6] 1421.181 2289.950 3171.085 3434.708
(19.6,29.4] 1452.186 2307.429 3213.362 3475.328
(29.4,39.2] 1521.757 2323.002 3276.400 3535.965
(39.2,49] 1513.215 2368.520 3329.068 3605.645
(49,58.8] 1506.355 2358.658 3370.630 3650.549
(58.8,68.6] 1469.508 2367.365 3389.672 3681.233
(68.6,78.4] 1463.391 2368.205 3418.076 3694.160
(78.4,88.2] 1487.846 2447.250 3496.495 3708.868
(88.2,98.1] 1434.333 2481.105 3406.835 3688.067
(43.2,47.1] (47.1,51]
(-0.098,9.8] 3911.667 3310
(9.8,19.6] 3206.400 NA
(19.6,29.4] 3007.800 3969
(29.4,39.2] 3326.143 4042
(39.2,49] 3447.200 NA
(49,58.8] 3501.000 NA
(58.8,68.6] 3435.500 NA
(68.6,78.4] 3118.000 NA
(78.4,88.2] NA NA
(88.2,98.1] NA NA
> levelplot(t6,scales = list(x = list(rot = 90)),xlab = 'ESTGEST',ylab = 'WTGAIN')
> contourplot(t6,scales = list(x = list(rot = 90)),xlab = 'ESTGEST',ylab ='WTGAIN')
> table(don$Class.Year) #由于1957年年代久远,有些毕业生已去世,故统计的人数少
1957 1967 1977 1987 1997
127 222 243 277 361
> barchart(table(don$Class.Year),horizontal=FALSE,xlab="Class Year",col="black")
> don$TGiving=don$FY00Giving+don$FY01Giving+don$FY02Giving+don$FY03Giving+don$FY04Giving
> mean(don$TGiving)
[1] 980.0436
> sd(don$TGiving)
[1] 6670.773
> quantile(don$TGiving,probs=seq(0,1,0.05))
0% 5% 10% 15% 20% 25%
0.0 0.0 0.0 0.0 0.0 0.0
30% 35% 40% 45% 50% 55%
0.0 10.0 25.0 50.0 75.0 100.0
60% 65% 70% 75% 80% 85%
150.8 200.0 275.0 400.0 554.2 781.0
90% 95% 100%
1050.0 2277.5 171870.1
> quantile(don$TGiving,probs=seq(0.95,1,0.01))
95% 96% 97% 98% 99%
2277.50 3133.56 5000.00 7000.00 16442.14
100%
171870.06
> hist(don$TGiving)
> hist(don$TGiving[don$TGiving!=0][don$TGiving[don$TGiving!=0]<=1000])