#2021/6/7 R语言复习检索
#数组 package23
myarry<-array(vector,dimensions,dimnames)
#列如
z<-array(1:24,c(2,3,4),dimnames = list(dim1,dim2,dim30))#中间的dimension中的2表示行数,3表示列数,4表示数组中矩阵的个数
#数据框 package24
#table表示连列表
patientid<-c(1,2,3,4)
admdata<-c("10/15/2009","11/01/2009","10/2/2009","10/28/2009")
age<-c(25,34,28,52)
diabetes<-c("type1","type2","type1","type1")
status<-c("poor","improved","excellent","poor")
patientdata<-data.frame(patientid,admdata,age,diabetes,status)
patientdata
table(patientdata$diabetes,patientdata$status)
#with函数 package26
with(mtcars,{
print(summary(mpg))
plot(mpg,disp)
})
#with函数中所有赋值都不生效,除非用<<-
with(mtcars,{
wsq<-198
})
wsq
#用<<-
with(mtcars,{
wsq<<-"对象很多"
})
wsq
#实例标识符 packages26
patientdata<-data.frame(patientid,age,diabetes,status,row.names = patientid)
patientdata
#因子 packages27
diabetes<-c("type1","type2","type1","type1")
diabetes<-factor(diabetes)#会将向量储存为1,2,1,1
diabetes
status<-c("poor","improved","excellent","poor")
status<-factor(status,ordered=T)#会将向量编码为3,2,1,3
#赋值编码
status<-factor(status,ordered = T,levels = c("poor","improved","excellent"))#人为赋值,将poor赋值为1,impro赋值为2
sex<-c(1,2)
sex<-factor(sex,levels = c(1,2),labels = c("male","female"))
sex
##使用键盘输入数据 packages31
mydata<-data.frame(age=numeric(0),gender=character(0),weight=numeric(0))
mydata<-edit(mydata)
mydata
fix(mydata)
##带分隔符的文本文件导入数据
#read.table("studentsid.csv",header=T,row.names="studentsid",sep=",",colclasses=c("character","numeric")) studentsid变成了行名
#倒入excel数据,这里用潘老师的readx1包
library(readxl)
read_excel("filename.xls", sheet = NULL, range = NULL, col_names = TRUE)
##读取stata数据packages38
library(foreign)
mydataframe<-read.dta("mydata.dta")
#数据集的标注 packages40
patientdata
names(patientdata)[2]<-"Age at hospitallzation"
patientdata[2]
ls()#显示当前的变量名称
#rm(object,object,...)删除一个或多个对象,rm(list=ls())删除当前工作环境中的所有对象
rm(age)
ls()
#基本数据管理
manager<-c(1,2,3,4,5)
date<-c("10/24/08","10/1/08","10/1/08","10/12/08","5/1/09")
country<-c("US","US","UK","UK","UK")
gender<-c("M","F","F","M","F")
age<-c(32,45,25,39,99)
q1<-c(5,3,3,3,2)
q2<-c(4,5,5,3,2)
q3<-c(5,2,5,4,1)
q4<-c(5,5,5,NA,2)
q5<-c(5,5,2,NA,1)
leadership<-data.frame(manager,date,country,gender,age,q1,q2,q3,q4,q5,stringsAsFactors = F)#stringsASfactor=T默认转为因子型
leadership
#变量的重编码
leadership$age[leadership$age==99]<-NA
leadership
#用within函数,within函数与with函数相似,不过它允许修改数据框
manager<-c(1,2,3,4,5)
date<-c("10/24/08","10/1/08","10/1/08","10/12/08","5/1/09")
country<-c("US","US","UK","UK","UK")
gender<-c("M","F","F","M","F")
age<-c(32,45,25,39,99)
q1<-c(5,3,3,3,2)
q2<-c(4,5,5,3,2)
q3<-c(5,2,5,4,1)
q4<-c(5,5,5,NA,2)
q5<-c(5,5,2,NA,1)
leadership<-data.frame(manager,date,country,gender,age,q1,q2,q3,q4,q5,stringsAsFactors = F)#stringsASfactor=T默认转为因子型
leadership<-within(leadership,{
agecat<-NA
agecat[age>75]<-"elder"
agecat[age>=55&age<=75]<-"Middle Aged"
agecat[age<55]<-"Young"
})
leadership
##变量的重命名
fix(leadership)
names(leadership)
names(leadership)[2]<-"testdate"
leadership
#namse(leadership)[2:6]<-c(.......0)
#用plyr包
library(plyr)
leadership<-rename(leadership,c(manager="managerid",testdate="date"))
leadership
##缺失值
#is.na()识别缺失值
#重编码缺失值
leadership$age[leadership$age==99]<-NA
leadership
#在分析中排除缺失值
x<-c(1,2,3,NA)
y<-sum(x,na.rm = T)
y
#删除不完整的观测
na.omit(leadership)
#日期值
today<-"2012/12/12"
tday<-as.Date(today,"%Y/%m/%d")
tday
Sys.Date()#返回当天的日期值
date()#返回具体时间
today<-Sys.Date()
WSQ<-as.Date("1998-12-22")
difftime(today,WSQ,units="weeks")
#数据排序
manager<-c(1,2,3,4,5)
date<-c("10/24/08","10/1/08","10/1/08","10/12/08","5/1/09")
country<-c("US","US","UK","UK","UK")
gender<-c("M","F","F","M","F")
age<-c(32,45,25,39,99)
q1<-c(5,3,3,3,2)
q2<-c(4,5,5,3,2)
q3<-c(5,2,5,4,1)
q4<-c(5,5,5,NA,2)
q5<-c(5,5,2,NA,1)
leadership<-data.frame(manager,date,country,gender,age,q1,q2,q3,q4,q5,stringsAsFactors = F)#stringsASfactor=T默认转为因子型
newdata<-leadership[order(leadership$age),]
newdata2<-leadership[order(leadership$gender,-leadership$age)]
newdata2
##数据集的合并
total<-merge(dataframeA,dataframeB,by=c("Id","country"))
#向数据添加行
total<-cbind(dataframeA,dataframeB)
#向数据添加列
total<-rbind(dataframeA,dataframeB)
#取子集
manager<-c(1,2,3,4,5)
date<-c("10/24/08","10/1/08","10/1/08","10/12/08","5/1/09")
country<-c("US","US","UK","UK","UK")
gender<-c("M","F","F","M","F")
age<-c(32,45,25,39,99)
q1<-c(5,3,3,3,2)
q2<-c(4,5,5,3,2)
q3<-c(5,2,5,4,1)
q4<-c(5,5,5,NA,2)
q5<-c(5,5,2,NA,1)
leadership<-data.frame(manager,date,country,gender,age,q1,q2,q3,q4,q5,stringsAsFactors = F)#stringsASfactor=T默认转为因子型
myvars<-paste("q",1:5,sep="")
newdata<-leadership[myvars]
newdata
##剔除向量
#第一种方法
myvars<-names(leadership)%in%c("q3","q4")
newdata3<-leadership[!myvars]
newdata3
#第二种方法
newdata4<-leadership[c(-8,-9)]
#第三种方法
leadership[8]<-leadership[9]<-NULL
leadership
##选入观测
leadership$date<-as.Date(leadership$date,"%m/%d/%y")
startdate<-as.Date("2009-01-01")
enddate<-as.Date("2009-10-31")
newdate<-leadership[leadership$date>=startdate&leadership$date<=enddate,]
newdate
#subset函数
newdata<-subset(leadership,age>=35|age<24,select = c(q1,q2,q3,q4));newdata
newdata2<-subset(leadership,age>35&gender=="M",select = gender:q4);newdata2
##随机抽样
mysample<-leadership[sample(1:nrow(leadership),3,replace=F),]
nrow(leadership)
##图形初阶packages43
setwd("C:/Users/wusuqi/Desktop/R语言脚本文件")
pdf("mygraph.pdf")
attach(mtcars)
plot(mpg,wt)
abline(lm(mpg~wt))
detach()
dev.off()
getwd()
plot(x,y,type="b",lty=2,pch=17)#type="b"同时绘制点和线,lty=2虚线,pch=17三角形
#pch指定绘制点时使用的符号,cex,指定符号的大小,lty指定线条的类型,lwd指定线条宽度。
##高级数据管理
z<-mean(x,trim=0.05,na.rm=T)#截尾平均数,去除左尾右尾各5%的数据之后,取平均值
z
#数据的标准化
newdata<-scale(mydata)#对数据或数据框进行均值为0,方差为1的标准化
newdata<-scale(mydata)*SD+M#对数据进行均值为M,标准差为SD的标准化
newdata<-transform(mydata,myvar=scale(myvar)*10+50)
#生成多元正态数据
install.packages("MASS")
library(MASS)
mvnorm(n,m,sigma)
library(MASS)
options(digits=3)
set.seed(13)
mean<-c(230.7,146.7,3.6)
sigma<-matrix(c(15360.8,6721.2,-47.1,6721.2,4700.9,-16.5,-47.1,-16.5,0.3),nrow=3,ncol = 3)
mydata<-mvrnorm(500,mean,sigma)
mydata<-as.data.frame(mydata)
names(mydata)<-c("y1","y2","y3")
dim(mydata)
head(mydata,7)
pretty(x,n)#将x划分为n个区间
##将函数应用于矩阵和数据框
apply(x,margin,fun)#margin=1表示行,margin=2表示列
options(digits=3)
student<-c("john davis","angela williams","bullwinkle moose","david jones","janice markhammer","chery1 cushing","reuven ytzrhak","greg knox","joel england","mary rayburn")
math<-c(502,600,412,358,495,512,410,625,573,522)
science<-c(95,99,80,82,75,85,80,95,89,86)
english<-c(25,22,18,15,20,28,15,30,27,18)
roster<-data.frame(student,math,science,english)
z<-scale(roster[,2:4])
z
score<-apply(z,1,mean)
score
roster<-cbind(roster,score)
roster
y<-quantile(roster$score,c(0.8,0.6,0.4,0.2))
y
roster$grade[score>=y[1]]<-"A"
roster$grade[score<=y[1]&score>=y[2]]<-"B"
roster$grade[score<=y[2]&score>=y[3]]<-"C"
roster$grade[score<=y[3]&score>=y[4]]<-"D"
roster$grade[score<=y[4]]<-"F"
roster
name<-strsplit(roster$student," ")
name
firstname<-sapply(name,"[",1)
lastname<-sapply(name,"[",2)
roster<-cbind(firstname,lastname,roster[,-1])
roster
roster[order(lastname,firstname),]
##重复和循环
#for结构
for (i in 1:10) print("hello")
#while结构
i<-10
while (i>0){print("hello");i<-i-1}
#条件执行
grade<-roster[7,]
if(is.character(grade)) grade<-as.factor(grade)
if(!is.character(grade)) grade<-as.factor(grade) else print("Grade already is a factor ")
class(grade)
ifelse(score>0.5,print("passed"),print("failed"))
#switch结构
cars<-mtcars[1:5,1:4]
cars
t(cars)#转置
mtcars
##整合数据packages105
options(digits = 3)
attach(mtcars)
adddata<-aggregate(mtcars,by=list(cyl,gear),FUN = mean,na.rm=T)
adddata
#resharpe2包
install.packages("reshape2")
ID<-c(1,1,2,2)
Time<-c(1,2,1,2)
X1<-c(5,3,6,2)
X2<-c(6,5,1,4)
mydata<-data.frame(ID,Time,X1,X2)
mydata
library(reshape2)
md<-melt(mydata,id=c("ID","Time"))
md
#重铸
newdata<-dcast(md,ID~variable,mean)
##基本统计分析packages132
#Hmisc包
install.packages("Hmisc")
library(Hmisc)
myvars<-c("mpg","hp","wt")
describe(mtcars[myvars])
#pastecs包
install.packages("pastecs")
library(pastecs)
stat.desc(mtcars[myvars])
#psych包
install.packages("psych")
library(psych)
describe(mtcars[myvars])
##通过sapply计算描述性统计量
mystatas<-function(x,na.omit=F){
if (na.omit)
x<-x[!is.na(x)]
m<-mean(x)
s<-sd(x)
n<-length(x)
kurt<-sum((x-m)^4/s^4)/n-3
skew<-sum((x-m)^3/s^3)/n
return(c(n=n,mean=m,kurtosis=kurt,skew=skew,sdev=s))
}
##分组进行描述性统计量 packages134
#aggregate函数
aggregate(mtcars[myvars],by=list(am=mtcars$am),mean)
#by()函数
dstatas<-function(x)sapply(x, mystatas)
by(mtcars[myvars],mtcars$am,dstatas)
##分组进行描述性统计的扩展packages135
#用doBy包
install.packages("doBy")
library(doBy)
#用doBy的summaBy函数
summaryBy(var1+var2+...~groupvar1,data=dataframe,FUN=function)
summaryBy(mpg+hp+wt~am,data=mtcars,FUN=mystatas)
#用psych中的describeBy
library(psych)
describeBy(mtcars[myvars],list(am=mtcars$am))
##频数表和联列表
library(vcd)
head(Arthritis,n=16)
#一维联列表
mytable<-with(Arthritis,table(Improved))
mytable
options(digits = 3)#转化为比例值
prop.table(mytable)
prop.table(mytable)*100#转化为百分比
#二维列表
mytable<-table(A,B)#基本公式
#公式风格的联列表
mytable<-xtabs(~Treatment+Improved,data=Arthritis)
mytable
margin.table(mytable,1)
prop.table(mytable,1)
margin.table(mytable,2)
prop.table(mytable,2)
#addmargins为这些表格添加边际和
addmargins(mytable)
addmargins(prop.table(mytable))
addmargins(prop.table(mytable,1),2)
addmargins(prop.table(mytable,2),1)
#使用CrossTable生成二维联列表
install.packages("gmodels")
library(gmodels)
attach(Arthritis)
CrossTable(Treatment,Improved)
#三维联列表
mytable<-xtabs(~Treatment+Sex+Improved,data=Arthritis)
mytable
ftable(mytable)
Arthritis
margin.table(mytable,c(1,3))
ftable(prop.table(mytable,c(1,2)))
ftable(addmargins(prop.table(mytable,c(1,2)),3))
ftable(addmargins(prop.table(mytable,c(1,2)),3))*100#得到百分比
#卡方独立性检验
library(vcd)
mytable<-xtabs(~Treatment+Improved,data = Arthritis)
chisq.test(mytable)#如果p<0.01说明两个变量不是独立的
#fisher精确检验
fisher.test(mytable)
#Cochran-Mantel-Haensze检验
mytable<-xtabs(~Treatment+Improved+Sex,data=Arthritis)#p<0.01,患者接受治疗与得到改善在性别的每一水平上并不独立
##相关性度量
#二维联列表的相关性度量
mytable<-xtabs(~Treatment+Improved,data=Arthritis)
assocstats(mytable)#较大的值意味的较大的相关性
states<-state.x77[,1:6]
cov(states)
cor(states)
cor(states,method = "spearman")#默认是皮尔森相关系数,这里换成SPEARMAN相关系数
x<-states[,c("Population","Income","Illiteracy","HS Grad")]
y<-states[,c("Life Exp","Murder")]
cor(x,y)
#偏相关 表示控制一个或多个变量时,另外两个定量变量之间的相互关系
#函数基本调用格式为:pcor(u,s),其中u是一个数值向量,前两个是需要计算相关系数的变量下标,后面是条件变量的下标。s是协方差阵
install.packages("ggm")
library(ggm)
colnames(states)
pcor(c(1,5,2,3,6),cov(states))
##检验某种相关系数的显著性
cor.test(x,y,alternative = ,method = )#alternative用来指定进行双侧检验或单侧检验,method指定要计算的相关类型
cor.test(states[,3],states[,5])
#通过corr.test计算相关矩阵并进行显著性检验
library(psych)
corr.test(states,use="complete")
##t检验
#独立样本的t检验
library(MASS)
t.test(Prob~So,data=UScrime)
UScrime
#非独立样本的t检验
#基本调用格式:t.test(y1,y2,paried=T)
library(MASS)
sapply(UScrime[c("U1","U2")],function(x)(c(mean=mean(x),sd=sd(x))))
with(UScrime,t.test(U1,U2,paired = T))
state.x77
#检验回归分析中统计假设的方法
fit<-lm(weight~height,data=women)
par(mfrow=c(2,2))
plot(fit)
help("qqplot")
#qqplot检验回归分析中统计假设
#正态性
library(car)
states<-as.data.frame(state.x77[,c("Murder","Population","Illiteracy","Income","Frost")])
fit<-lm(Murder~Population+Illiteracy+Income+Frost,data=states)
qqPlot(fit,labels=row.names(states),id.method="identify",simulate = T,main="QQ Plot")
#检验误差的序列相关性
durbinWatsonTest((fit))
#线性
library(car)
crPlots(fit)
#同方差性,若显著则说明存在异方差
library(car)
ncvTest(fit)#结果显著,同方差
spreadLevelPlot((fit))
#线性模型假设的综合验证
install.packages("gvlma")
library(gvlma)
gvmodel<-gvlma(fit)
summary(gvmodel)
#多重共线性,用统计量vif进行检验,一般来说sqrt(vif)>2就说明存在多重共线性问题
library(car)
vif(fit)
##异常值观测
#离群点
library(car)
outlierTest(fit)
##改进措施
#变量变换
library(car)
summary(powerTransform(states$Murder))
#用muder^0.6代替murder使用,提高模型的正态性
library(car)
boxTidwell(Murder~Population+Illiteracy,data=states)
#变换Population^0.87和Illiteracy^1.36能够大大改善线性关系
##选择最佳的回归模型 packages189
#模型比较 用anova函数
states<-as.data.frame(state.x77[,c("Murder","Population","Illiteracy","Income","Frost")])
fit1<-lm(Murder~Population+Illiteracy+Income+Frost,data=states)
fit2<-lm(Murder~Population+Illiteracy,data=states)
anova(fit1,fit2)#P值不显著,不需要将这两个变量添加到线性模型中
#AIC赤池信息准则AIC较小的模型要优先选择,用AIC来比较模型
fit1<-lm(Murder~Population+Illiteracy+Income+Frost,data=states)
fit2<-lm(Murder~Population+Illiteracy,data=states)
AIC(fit1,fit2)
#变量选择 逐步回归
#向后回归
library(MASS)
states<-as.data.frame(state.x77[,c("Murder","Population","Illiteracy","Income","Frost")])
fit<-lm(Murder~Population+Illiteracy+Income+Frost,data=states)
stepAIC(fit,direction = "backward")#有缺陷
#所以有了更好的方法 全子集回归
library(leaps)
states<-as.data.frame(state.x77[,c("Murder","Population","Illiteracy","Income","Frost")])
leaps<-regsubsets(Murder~Population+Illiteracy+Income+Frost,data=states,nbest=4)
plot(leaps,scale = "adjr2")
#相对重要性
states<-as.data.frame(state.x77[,c("Murder","Population","Illiteracy","Income","Frost")])
zstates<-as.data.frame(scale(states))
zfit<-lm(Murder~Population+Illiteracy+Income+Frost,data=zstates)
coef(zfit)
#相对权重法
states<-as.data.frame(state.x77[,c("Murder","Population","Illiteracy","Income","Frost")])
fit<-lm(Murder~Population+Illiteracy+Income+Frost,data=states)
relweights(fit,col="blue")
##广义线性模型
#logistic回归
install.packages("AER")
library(AER)
data(Affairs)
summary(Affairs)
help(data)
Affairs
options(max.print =10000)
Affairs
#将affairs转化为二值因子型
Affairs$ynaffairs[Affairs$affairs>0]<-1
Affairs$ynaffairs[Affairs$affairs==0]<-0
Affairs$ynaffairs<-factor(Affairs$ynaffairs,levels = c(0,1),labels = c("NO","YES"))
table(Affairs$ynaffairs)
fit.full<-glm(ynaffairs~gender+age+yearsmarried+children+religiousness+education+occupation+rating,data=Affairs,family = binomial())
summary(fit.full)
fit.reduced<-glm(ynaffairs~age+yearsmarried+religiousness+rating,data=Affairs,family = binomial())
summary(fit.reduced)
#用anova函数进行检验
anova(fit.full,fit.reduced,test="Chisq")#结果不显著,说明添加变量不会显著提高方程的预测精度
#解释模型参数
coef(fit.reduced)
exp(coef(fit.reduced))#对数优势比
##生成时间序列对象
#基本调用格式:myseries<-ts(data,start=,end=,frequency=)#frequency=1对应年度数据,frequency=12对应月度数据
###处理缺失数据 packages382
install.packages(c("VIM","mice"))
#识别缺失值
data(sleep,package="VIM")
sleep[complete.cases(sleep),]#列出没有缺失值的行
sleep[!complete.cases(sleep),]#列出有缺失值的行
#逻辑值True和false分别等价于数值1和0,可用sum()和mean()函数来获取关于缺失数据的有用的信息
sum(is.na(sleep$Dream))
mean(is.na(sleep$Dream))
#表示dream变量有12个缺失值,19%的实例在此变量上有缺失值
#图形探究缺失值 用aggr这个函数
library("VIM")
aggr(sleep,prop=F,numbers=T)#选项numbers=F默认删除数值型标签
#行删除
mydata<-mydata[complete.cases(mydata),]
newdata<-na.omit(mydata)
setwd("C:/Users/wusuqi/Desktop/R语言脚本文件")
save.image("2021.6.11")
getwd()
#dplyr包
library(dplyr)
iris
head(iris)
filter(iris,Sepal.Length>7)#过滤
dplyr::distinct(rbind(iris[1:10,],iris[1:15,]))#去除重复
dplyr::slice(iris,10:15)#切片
dplyr::sample_n(iris,10)#随机取样
dplyr::sample_frac(iris,0.1)#按比例随机选取
dplyr::arrange(iris,Sepal.Length)#排序
dplyr::arrange(iris,desc(Sepal.Length))#相反的方向排序
#dplyr的统计函数
summarise(iris,avg=mean(iris$Sepal.Length))
#链式操作符%>%,实现将一个函数的输出传入给下一个函数作为下一个函数的输入,快捷键ctrl+shift+M
head(mtcars,20) %>% tail(10)
dplyr::group_by(iris,Species)
iris %>% group_by(Species)#和上面一条脚本文件一样
iris %>% group_by(Species) %>% summarise(avg=mean(Sepal.Length))
iris %>% group_by(Species) %>% summarise(avg=mean(Sepal.Length)) %>% arrange(avg)
dplyr::mutate(iris,newdata= Sepal.Length+Sepal.Width)#合并
iris <- iris %>%
group_by(Species) %>%
mutate(across(where(is.numeric),
~if_else(condition = is.na(.),
true = mean(., na.rm = T),
false = as.numeric(.))))