R笔记(简单数据处理)

R语言笔记

#设定R软件当前工作目录
setwd("E:/R work")
#显示R软件当前工作目录
getwd()
#R语言数据预处理常用包安装
#plyr,reshape2,lubridate, stringr
install.packages(c("plyr","reshape2","lubridate", "stringr","foreign"))
library(MASS)
library(foreign)
library(stringr)
library(plyr)
library(reshape2)
library(ggplot2)
#####1.R语言数据读取#####
#R包自带数据
data(diamonds)
diamonds
#查看前六行数据
head(diamonds)
#查看后六行数据
tail(diamonds)

#R添加包,可以从一些开放源直接下载金融数据,包括雅虎财经、谷歌财经、等
install.packages("quantmod")
library(quantmod) #加载包

#从雅虎财经下载苹果股票交易数据(从2015年1月1日至今)
getSymbols("AAPL",from="2015-01-01")  
#查看数组维数及元素个数
dim(AAPL) 
head(AAPL) 
tail(AAPL)  
#作图,K线图
chartSeries(AAPL,theme=chartTheme('black'))   

#从oanda获取外汇数据
install.packages("jsonlite")
library(jsonlite)
getFX("USD/CNY",from="2017-05-01")  
head(USDCNY)
tail(USDCNY)
chartSeries(USDCNY,theme = chartTheme('black'))

#read.table函数读取本地/网络数据(read.table, read.csv, read.csv2, read.delim, read.delim2, read.fwf)
help("read.table")
##read.table函数

getwd()
#原始数据有列名,第一列为记录序号,可以省略参数header(但此时应当为TRUE)
rt = read.table("houses.data");rt  
rt1 = read.table("houses.data",header = TRUE);rt1 

#原始数据有列名,无记录序号列,不可以省略参数header
rt2 = read.table("houses2.data",header = TRUE);rt2  
rt2 = read.table("houses2.data");rt2  # 省略参数header(此时为FALSE),变量名会被认为是一行数据

#原始数据无列名,无记录序号列,可以省略参数header(此时为FALSE)
rt3 = read.table("houses3.data");rt3  
rt3 = read.table("houses3.data",
                 col.names = c("Price","Floor","Area","Rooms","Age","Cent.heat"));rt3 

#read.csv函数
dat = read.csv('PM.csv')  #编码错误,读入乱码,行数也会错乱
dat1 = read.csv('PM.csv',fileEncoding = "utf-8")  #指定正确编码

#以下操作不读取表头,并重新制定列名
colname=c('id','city','index','y','x')
dat2 = read.csv('PM.csv',header=FALSE,col.name=colname,fileEncoding = "utf-8")

#当数据量较大时,全部将数据读取会比较耗时,这里可以通过nrows设定
dat3 = read.csv('PM.csv',fileEncoding = "utf-8",nrows=-1)  #nrows默认为-1
dat4 = read.csv('PM.csv',fileEncoding = "utf-8",nrows=5)  #nrows设置为5

#因子转换
dat5 = read.csv('PM.csv',stringsAsFactors=FALSE,fileEncoding = "utf-8") #读取为string格式
str(dat5)
dat6 = read.csv('PM.csv',fileEncoding = "utf-8") #读取为factor格式
str(dat6)

#文件编码
dat7 = read.csv('PM.csv',fileEncoding = "utf-8") #默认编码不是utf-8,需要设置
dat8 = read.csv('PM-gbk.csv') #这里默认编码是gbk,不需要设置

#最后一行没有回车符会有警告“最后一行不完整”
x=read.table("data1.txt",sep=",");x
person=read.csv("data1.txt", header=FALSE,col.names=c("age","height"))
person
##scan函数读取结构化数据
#15名学生的体重
w = scan("weight.data");w  #默认读为数值向量
w = scan("weight.data",what = 0);w  
w = scan("weight.data",what = c(""));w  #读为字符型向量
w = scan("weight.data",what = list(""));w  #读为list

#例100名学生的身高和体重被存在文件h_w.data中,其中1,3,5,7,9列为身高,2,4,6,8,10列为体重,
#试用scan函数读入,并转化为数据框
dat = scan("h_w.data",what = list(height=0,weight=0))
df = as.data.frame(dat)

#scan函数读入屏幕数据
names = scan(what = "")
zhangsan lisi wangwu maliu
names

##其他格式数据读入
install.packages("foreign")
library(foreign)
#读取SPSS文件,不加参数to.data.frame = T返回list
educ = read.spss("educ_scores.sav",to.data.frame = T)
educ = read.xport("educ_scores.xpt")  #读取SAS文件
educ = read.S("educ_scores")  #读取SPLUS文件
educ = read.dta("educ_scores.dta")  #读取stata文件

#读取excel表格数据
educ = read.delim("EDUC_SCORES.txt")  #转化为txt文件
educ = read.csv("educ_scores.csv")    #转化为csv文件
#利用xlsx包中的函数读取
install.packages("xlsx")
library(xlsx)

#解决无法载入‘rJava’问题方法
install.packages("rJava")
Sys.setenv(JAVA_HOME='C:/Program Files/Java/jre1.8.0_77')  #自己的JAVA64路径
library(rJava)
library(xlsx)
#这里默认header=T,sheetIndex = 1表示读取第一个工作簿的数据,或通过指定工作簿名称来读取
educ = read.xlsx("educ_scores.xls",sheetIndex = 1)  
educ = read.xlsx("educ_scores.xls",sheetName = "educ_scores")  

##文本数据读取
news = readLines('news.txt',encoding = "UTF-8")
news = readLines('news.txt',n=2,encoding = "UTF-8");news

#scan函数读取为列表
line = scan('news.txt',what=list(''),encoding = "UTF-8")
line = scan('news.txt',what=list(''),n=1,encoding = "UTF-8");line
#scan函数读取为向量
line = scan('news.txt',what=c(''),encoding = "UTF-8")
line = scan('news.txt',what=c(''),n=1,encoding = "UTF-8");line

##结构化数据写入
write.table(educ,file = "educ_w.txt",append = T)
write.csv(educ,file = "educ_w.csv")

##文本数据写入
writeLines(line,"news_w.txt")

sink("news_w1.txt")
cat(line)
sink()

y=read.table("http://www.jaredlander.com/data/Tomato%20First.csv",header=TRUE,sep=",")
#使用head(),str(),summary()函数来查看数据集
head(y)
str(y)
summary(y)

getwd()
#查看数据
data = read.table("salary.txt",header = T);data
mode(data)
class(data)
names(data)
colnames(data)
dim(data)

#####2.数据管理与变换######
##数据合并
a=c("Hongkong",1910,75.0,41.8)

data = read.table('salary.txt', header = T,stringsAsFactors = F)
data1=rbind(data,a)
data1[14:16,]

weight=c(150,135,210,140)  #数值型向量
height=c(65,61,70,65)
gender=c("F","F","M","F")  #字符型向量
stu=data.frame(weight,height,gender)
row.names(stu)=c("Alice","Bob","Cal","David")
stu[,"weight"]
stu["Cal",]  #获取行
stu[1:2,1:2]
stu$weight  # ”$”用于取列
stu[["weight"]]  #双括号+名称
stu[[1]]  #双括号+下标,用于数据框和列表数据的获取
stu[,1]
#列名一致
index=list("City"=data$City,"Index"=1:15)  #建立另一个数据集index
index
data.index=merge(data,index,by="City")
data.index

#列名不一致
index1=list("City1"=data$City,"Index"=1:15)
index1
data.index1=merge(data,index1,by.x = "City",by.y = "City1")
data.index1
index2 = 1:15
data.index2=cbind(data,index2)


##选取数据子集
data[data$Salary>65,]
data[c(2,4),]
#选取价格指数等于65.6的行,注意要用双等号==
data[data$Price==65.6,]

##数据排序
order.salary=order(data$Salary)  #返回的是该变量从小到大(默认)排序后的索引
order.salary
data[order.salary,]

sort.list(data$Salary)  #sort.list与order的作用一致
data[sort.list(data$Salary,decreasing = T),]

## 读取数据
# 在当前目录下搜索匹配文件名中有“Loan”的贷款申请成功数据
setwd("G:\\数据预处理")
thefilesL = dir(pattern = "^Loan");thefilesL
# 读入各数据并将其放在同一个列表(list),若不指定参数stringsAsFactors = F,字符型的元数据将自动转化为因子型
# lapply函数对列表进行统一操作(R语言基础PPT54)
# 第一行是描述性数据,需要跳过
LoanList0 = lapply(thefilesL, read.csv, stringsAsFactors = F, skip = 1)
# 按行合并不同的csv文件的贷款申请数据
Loan = do.call(rbind, LoanList0)
# 同样读入、合并Reject
thefilesR = dir(pattern = "^Reject")
RejectList0 = lapply(thefilesR, read.csv, stringsAsFactors = F, skip = 1)
Reject = do.call(rbind, RejectList0)
str(Loan)

library(dplyr)
Loan.df = tbl_df(Loan)
Loan.df
dim(Loan.df)
colnames(Loan.df)
## (1)添加新变量列
# 添加一列名为dti的新变量,它是将变量列Debt.To.Income.Ratio去百分号得到的
#这里sub函数用来将“%”替换为“”
Reject.temp = mutate(Reject, dti = as.numeric(sub("%", "", Debt.To.Income.Ratio)))
#等同于下面的操作
Reject.temp1 = Reject
Reject.temp1$dti = as.numeric(sub("%", "", Reject.temp1$Debt.To.Income.Ratio))
## (2)选择变量列
Reject.s = select(Reject.temp, Amount.Requested, dti, Risk_Score:State)
Reject.s1 = select(Reject.s, -Zip.Code, -Debt.To.Income.Ratio)
## (3)选择满足条件的观测行
MA_Reject = filter(Reject.s1, Risk_Score>500&State == "MA")
## (4)排序
arrange(Reject.s1, State, Risk_Score, dti, Amount.Requested)
## (5)数据分组汇总
summarise(group_by(Loan,grade),                       #使用分类变量grade分组
          ave.amnt = mean(funded_amnt, na.rm = T),    #计算均值
          sd = sd(funded_amnt, na.rm = T),            #计算标准差
          n = sum(!is.na(funded_amnt)),               #计算各组样本量(不计缺失值)
          se = sd/sqrt(n),                            #计算均值标准误
          conf_upper = ave.amnt + qt(0.975, n-1)*se,  #计算置信上下限(t分布)
          conf_lower = ave.amnt - qt(0.975, n-1)*se)
## 数据变换
# (1)最大值-最小值规范化
library(caret)
# 将Loan数据中的loan_amnt转化到[0,1]
help(preProcess)
#先指定处理方法
trans = preProcess(select(Loan, loan_amnt), method = c("range"))
trans
#然后使用predict函数完成处理
transformed = predict(trans, select(Loan,loan_amnt))
head(transformed)
range(transformed)

# (2)标准化
trans = preProcess(select(Loan, loan_amnt), method = c("center","scale"))
trans
transformed = predict(trans, select(Loan,loan_amnt))
head(transformed)
mean(transformed[[1]]);var(transformed[[1]])
# (3)十进制正规化
max(abs(Loan$loan_amnt))
# (4)Box-Cox变换
library(e1071)
#计算偏度,发现是右偏
skewness(Loan$annual_inc,na.rm = T)
#选择Loan数据集中的数值型变量
Loan.num = select(Loan, loan_amnt,funded_amnt,funded_amnt_inv,installment,annual_inc,dti,total_pymnt)
# 对每列数值型变量都计算其偏度系数
apply(Loan.num,2,skewness,na.rm = T)
# 为了直方图显示效果,剔除年收入超过40万美元的客户
Loan.anin = Loan$annual_inc[-which(Loan$annual_inc>400000)]
library(caret)
# 使用样本数据估计λ,估计值为-0.1,但修正后的λ估计值为0
BoxCoxTrans(Loan$annual_inc,na.rm = T)
par(mfrow=c(1,2))
hist(Loan.anin,xlab="natural units of annual_inc", main="Histogram: Original Data")
# 估计的λ为0,使用log变换
hist(log(Loan$annual_inc),
     xlab = "log units of annual_inc",
     main = "Histogram: after log transformation"
     )


#####3.从原始数据到技术正确的数据#####
##一个小案例(deltons)
#step(1): Reading data
txt=readLines("data2.txt")  #readLines: when the rows in a data files are not uniformly formatted
txt
#step(2):Selecting lines containing data
I=grepl("^%",txt)
I
dat=txt[!I]
dat
#step(3):Split lines into separate fields
help(strsplit)
(fieldList=strsplit(dat,split=","))
#step(4):Standardize rows
#先定义一个对列表中单个元素处理的
assignFields=function(x)  #函数声明
{
  out=character(3)
  #匹配list中的字符作为输出的第一列
  i=grepl("[[:alpha:]]",x)
  #print(i)
  out[1]=x[i]
  #将list中小于1890的作为出生年份
  i=which(as.numeric(x)<1890)
  #print(i)
  out[2]=ifelse(length(i)>0,x[i],NA)  #若长度不大于0,则赋值为NA
  #将list中大于1890的作为死亡年份
  i=which(as.numeric(x)>1890)
  #print(i)
  out[3]=ifelse(length(i)>0,x[i],NA)  #若长度不大于0,则赋值为NA
  return(out)
}
#演示
out=character(3)
out[1]
i=grepl("[[:alpha:]]",fieldList[[1]]);i
out[1] = fieldList[[1]][i];out

i=which(as.numeric(fieldList[[1]])<1890);i
out[2]=ifelse(length(i)>0,fieldList[[1]][i],NA);out

#lapply函数用来处理列表的每一个元素
standardFields=lapply(fieldList,assignFields) #apply a function over a list
standardFields
#step(5): transform a list to data.frame(将list转化为data.frame)
M=matrix(unlist(standardFields),nrow=length(standardFields),byrow=TRUE)  #copy into a matrix which is then coerced into a data.frame
#unlist() produce a vector which contains all the atomic components which occur in x
colnames(M)=c("name","birth","death")
M
deltons=as.data.frame(M,stringsAsFactors=FALSE)  #stringsAsFactors=FALSE 防止R把第一列默认成因子模式factor
deltons
#step(6):Normalize and coerce to correct types(强制转换类型)
str(deltons)
deltons$birth=as.numeric(deltons$birth)
deltons$death=as.numeric(deltons$death)
deltons
str(deltons)

##分类变量处理
#分类型变量在R中存储为factor格式
#(1)改变因子水平排序
f=factor(c("small","large","large","small","medium")); f
levels(f) #默认是字母表顺序
#手动输入改变
f1=factor(f,levels=c("small","medium","large")); f1
#rev函数逆转原来的排序
f2=factor(f1,levels=rev(levels(f1))); f2 
#relevel函数决定因子水平从哪一个开始
f3 = relevel(f2,ref="small"); f3

##根据数值型变量改变因子水平排序,函数:reorder 
iss=InsectSprays  #R包数据:昆虫喷雾剂
iss
#未重新排序前画箱线图,按照默认顺序排序
iss$spray
boxplot(count~spray,data=iss)   #箱线图
#重新排序后箱线图按照count的均值从小到大排序
iss$spray=reorder(iss$spray,iss$count,FUN=mean)
iss$spray
boxplot(count~spray,data=iss)   #箱线图
relevel(iss$spray,ref="D")

#(2)因子水平重编码
#Example: we read in a vector where 1 stands for male, 2 stands for female and 0 stands for unknown
gender=c(2,1,1,2,0,1,1)
gender=factor(gender,level=c(1,2),label=c("male","female"))
gender

library(ggplot2)
(pg=PlantGrowth)  #ggPlot2数据
pg$group  #原来的分类有3类
pg$treatment[pg$group=="ctrl"]="no"
pg$treatment[pg$group=="trt1"]="yes"
pg$treatment[pg$group=="trt2"]="yes"
pg
str(pg)
pg$treatment=factor(pg$treatment)
str(pg)

##字符处理
#(1).string normalization: transform a varity strings to a set of standard strings 
#We expect it to be more easily processed later

library(stringr)
str_trim(" Hello world ")  #忽略前后空格
str_trim(" Hello world ",side="left")  #忽略左边空格
str_trim("Hello world ",side="right")  #忽略右边空格
str_pad(112,width=10,side="left",pad=0)  #把字符串填充为指定的长度
toupper("Hello world") #小写字母转化为大写字母(to-upper)
tolower("Hello world") #大写字母转化为小写字母(to-lower)
#(2)模糊匹配
#模式匹配
gender=c("M","male","Female","fem.");gender
#grepl返回逻辑值,grep返回匹配到的位置索引
grepl("m",gender)    #大小写敏感,返回逻辑值
grep("m",gender)   #大小写敏感,返回数值索引
grepl("m",gender,ignore.case=TRUE) #参数ignore.case=TRUE,忽略大小写
grepl("m",tolower(gender)) 
#匹配以m或M开头的字符串
grepl("^m",gender,ignore.case=TRUE)

#查看“abc“变为”bac”需要的步数(不能换位,只能替换)
adist("abc","bac")  
codes=c("male","female")
disMatrix=adist(gender,codes)
disMatrix
colnames(disMatrix)=codes  #for readability
rownames(disMatrix)=gender
disMatrix
i=apply(disMatrix,1,which.min);i  #按行输出变换结果
data.frame(rawtext=gender,coded.gender=codes[i]) 
#stringdist()在计算字符串距离时比adist()更加方便,它允许字符的替换
install.packages("stringdist")
library(stringdist)
stringdist("abc","bac")
#amath() return an index to the closest match(codes) within a maximum distance
i=amatch(gender,codes,maxDist=4);i
data.frame(rawtext=gender,code=codes[i])

##日期转化
(current_time=Sys.time())
class(current_time)
as.numeric(current_time)
date1=as.Date(current_time)
date1
as.numeric(date1)
end_time=Sys.time()
end_time-current_time #Running time of some program
install.packages("lubridate")
library(lubridate)  
#contain functions facilitating conversion of text to POSIXct date
dates=c("15/02/2013","15022013","01-07-2011","It happened on 15 02 13")
dmy(dates) #dmy转换为标准格式

##分组操作
#(1)apply(),lapply(),sapply(),mapply()
(ma=matrix(1:100,nrow=20))
#按行求和,等同于rowSums()
apply(ma,1,sum)
#按列求和,等同于colSums()
apply(ma,2,sum)
#添加缺失值的情况
ma[2,3]=NA
apply(ma,1,sum)
apply(ma,2,sum)
apply(ma,1,sum,na.rm=TRUE)
apply(ma,2,sum,na.rm=TRUE)

Thelist=list(A=matrix(1:9,nrow=3),B=1:5,C=matrix(1:4,nrow=2),D=c(2));Thelist
lapply(Thelist,sum)
sapply(Thelist,sum)
help(apply)

#(2)aggregate()
library(ggplot2)
data(diamonds)
diamonds
head(diamonds)
aggregate(price~cut,diamonds,mean)
aggregate(price~cut+color,diamonds,mean)
aggregate((price+carat)~cut+color,diamonds,mean)

#(3)plyr Package
library(plyr)
xx <- array(1:24, c(3, 4, 2));xx
class(xx)

#matrix
a=matrix(1:21,nrow=3,ncol=7);a
aaply(.data=a,.margins=1,.fun=mean)  #计算矩阵a各行均值
aaply(a,1,mean)  #计算矩阵a各行均值
aaply(a,2,mean)  #计算矩阵a各列均值

#data.frame
names=c("John","Mary","Alice","Peter","Roger","Phyillis") 
age=c(13,15,14,13,14,13) 
sex=c("Male","Female","Female","Male","Male","Female") 
data=data.frame(names,age,sex);data

aver=function(data)c(average.age=mean(data$age))
dlply(data,"sex",aver)  #返回列表
ddply(data,"sex",aver)  #返回数据框
daply(data,"sex",aver)  #返回向量

##baseball简单案例
#Case study:  data(baseball)
#baseball数据集包括了15年及以上美国所有职业选手的击球记录
data(baseball)
head(baseball)
baseball[baseball$id=="yosted01",]  #输出id为“yosted01”的信息
#新增变量: OBP(On-Base Percentage,上垒率)
#OBP=(h+bb+hbp)/(ab+bb+hbp+sf)
baseball$sf[baseball$year<1954]  #查看year<1954的sf值
baseball$sf[baseball$year<1954]=0  #将year<1954的sf值赋值为0
baseball$hbp[is.na(baseball$hbp)]=0  #set missing values to 0
#检查是否存在缺失值
any(is.na(baseball$sf))
any(is.na(baseball$hbp))
#每年、每位选手的OBP值
#with()函数用来做批处理
baseball$OBP=with(baseball,(h+bb+hbp)/(ab+bb+hbp+sf))
tail(baseball)
#计算选手职业生涯中的OBP值
#OBP=sum(h+bb+hbp)/sum(ab+bb+hbp+sf)
obp=function(data) c(OBP=with(data,sum(h+bb+hbp)/sum(ab+bb+hbp+sf)))
obp(baseball[baseball$id=="aaronha01",])
careerOBP=ddply(baseball,"id",obp)
head(careerOBP)
arrange(careerOBP,OBP)  #排序

##整齐数据
#(1)列标题是值而不是变量名
#pew数据是教徒的收入数据,分隔符是"\t"

pew = read.delim(file = "pew.txt",header = TRUE,stringsAsFactors = FALSE,check.names = F)
pew
library(reshape2)
pew_tidy = melt(data = pew,id.vars = "religion",variable.name="income",value.name="frequency")
head(pew_tidy)

#(2)多个变量存储在一列
tb = read.csv(file = "tb.csv",header = TRUE, stringsAsFactors = FALSE)
head(tb)
names(tb)
tb$new_sp = NULL  #clean up column names
names(tb)
names(tb) = gsub("new_sp_", "", names(tb))

# na.rm = TRUE移除缺失值
tb_tidy = melt(data = tb,id = c("iso2", "year"),variable.name = "gender_age",
               value.name = "cases",na.rm = TRUE)
#gender_age这一列包含两个变量:性别和年龄段
head(tb_tidy)
# na.rm = TRUE可以保证按变量排序不受影响
tidy = arrange(tb_tidy, iso2, gender_age, year)
head(tidy)
library(stringr)
#str_sub()用来从一个特征向量提取子字符串(stringr)包
#str_sub(string=,start=,end=)
str_sub(tidy$gender_age, 1, 1) 
str_sub(tidy$gender_age, 2)
ageraw=str_sub(tidy$gender_age, 2)
agemap= c("04" = "0-4", "514" = "5-14",
          "014" = "0-14", "1524" = "15-24", "2534" = "25-34",
          "3544" = "35-44", "4554" = "45-54", "5564" = "55-64",
          "65"= "65+", "u" = NA)
#revalue()函数作用:对于一个因子型或者字符型变量,给定一个映射关系,用新值替换指定值
age=revalue(ageraw,agemap)

tidy$sex = str_sub(tidy$gender_age, 1, 1)
tidy$age = factor(age)
tidy = tidy[c("iso2", "year", "sex", "age", "cases")]
head(tidy)

#(3)行、列中均存在变量
#weather是天气气温的数据
weather = read.delim(file = "weather.txt",stringsAsFactors = FALSE)
head(weather)
raw1=melt(weather,id.vars=c("id","year","month","element"),
          na.rm = TRUE, variable.name="day",value.name = "temperature")
head(raw1)
#str_replace()函数将变量“day”中的“d”用“”代替,即去掉
raw1$day = as.integer(str_replace(raw1$day, "d", ""))
#tolower()函数将变量“element”中的值转化为小写
raw1$element = tolower(raw1$element)
names(raw1)
#交换两变量的顺序
raw1 = raw1[c("id", "year", "month", "day","element", "temperature")]
raw1 = arrange(raw1, year, month, day, element)
head(raw1)
dcast(raw1,id+year+month+day~element,value.var="temperature")

#####4.修改数据#####

data = read.table("salary.txt",header = T);data

mode(data)
names(data)
dim(data)
data$Price
attach(data)
Price
Salary
mean(Salary)  #求均值
length(Salary)  #数据长度(个数)
cumsum(Salary)  #累积工资
detach(data)
Salary

#修改数据标签
names(data)=c("CITY","WORK","PRICE","SALARY")
names(data)
#行列删除
data2=data[-1,-3]
data2
#判断缺失数据
attach(data)
is.na(SALARY)
#将data文件中工资指数大于65的值替换为缺失值
data$SALARY = replace(SALARY,SALARY>65,NA)
is.na(SALARY)
#查看缺失值数量
sum(is.na(SALARY))
#complete.cases()函数
complete.cases(data$SALARY)  #数据是否非缺失
sum(!complete.cases(data$SALARY))
#判断缺失模式
data$PRICE = replace(PRICE,PRICE>80,NA)
install.packages("mice")
library(mice)
md.pattern(data)

install.packages("VIM")
library(VIM)
aggr(data)


##(1)行删除法
data("airquality")
head(airquality)
tail(airquality)
sum(any(is.na(airquality)))
airquality[complete.cases(airquality),]
##(2)成对删除法
apply(airquality,2,mean,na.rm=TRUE)  #均值
cor(airquality,use="pair")  #相关系数矩阵
##(3)用统计量来填补缺失值
mean6 = apply(airquality,2,mean,na.rm = TRUE);mean6
#TRUE/FALSE"+1"是为了使得值为TRUE的变为2,值为FALSE的变为1,观察是否插补标识
airquality$col = c("Mean_imputation","notNA")[complete.cases(airquality[,1:2])+1]
#使用均值插补两个变量
airquality[is.na(airquality$Ozone),"Ozone"] = mean6["Ozone"]
airquality[is.na(airquality$Solar.R),"Solar.R"] = mean6["Solar.R"]
#检查插补后是否有缺失值
any(is.na(airquality))
#绘制插补后的Ozone直方图
library(ggplot2)
ggplot(airquality,aes(Ozone,fill=col)) + geom_histogram(alpha=0.5,position = "identity")
#绘制插补后的Solar.R和Ozone的散点图
ggplot(airquality,aes(x=Solar.R,y=Ozone,colour=col)) + geom_point(size=3)
#插补后的标准误
sd(airquality$Ozone)
#插补后Solar.R和Ozone的相关系数
cor(airquality$Ozone,airquality$Solar.R)
#重新加载airquality
data("airquality")
#插补前Ozone的标准误
sd(airquality$Ozone,na.rm = TRUE)
#插补前Solar.R和Ozone的相关系数
cor(airquality$Ozone,airquality$Solar.R,use = "complete.obs")

##(4)回归插补
library(mice)
data("airquality")
airquality$col = c("regression_imputation","notNA")[as.vector(!is.na(airquality["Ozone"]))+1]
fit = lm(Ozone~Solar.R,data = airquality)
#筛选Ozone缺失的行号
a = which(!complete.cases(airquality$Ozone))  
#插补
airquality$Ozone[a] = as.vector(predict(fit,newdata = airquality[a,]))
ggplot(airquality,aes(Ozone,fill=col)) + geom_histogram(alpha=0.5,position = "identity")
#绘制插补后的Solar.R和Ozone的散点图
ggplot(airquality,aes(x=Solar.R,y=Ozone,colour=col)) + geom_point(size=3)
#插补后的标准误
sd(airquality$Ozone,na.rm=TRUE)
#插补后Solar.R和Ozone的相关系数
cor(airquality$Ozone,airquality$Solar.R,use = "complete.obs")

##(5)随机回归插补
library(mice)
data("airquality")
imp = mice(airquality[,1:2],method = "norm.nob",m=1,maxit = 1,seed = 11)
air = complete(imp)
air$col = c("norm.nob_imputation","notNA")[complete.cases(airquality[,1:2])+1]
ggplot(air,aes(Ozone,fill=col)) + geom_histogram(alpha=0.5,position = "identity")
#绘制插补后的Solar.R和Ozone的散点图
ggplot(air,aes(x=Solar.R,y=Ozone,colour=col)) + geom_point(size=3)

##(6)多重插补
library(mice)
data("airquality")
imp = mice(airquality,seed = 1,print = FALSE)
#使用with()函数依次对每个完整数据集做回归
fit = with(imp,lm(Ozone~Wind+Temp+Solar.R))
pooled = pool(fit)
round(summary(pooled),3)[,c(1:3,5)]
#使用原数据集做回归
fit.r = lm(Ozone~Wind+Temp+Solar.R,data=airquality)
round(coef(summary(fit.r)),3)

#观察实际插补值
imp$imp
#显示实际插补值的得变量Ozone的值,5列表示5个值
imp$imp$Ozone
#complete()函数可以观察m个插补数据集中的任何一个
air = complete(imp,action = 1)
air$col = c("multiple_imputation","notNA")[complete.cases(airquality[,1:2])+1]
ggplot(air,aes(Ozone,fill=col)) + geom_histogram(alpha=0.5,position = "identity")
#绘制插补后的Solar.R和Ozone的散点图
ggplot(air,aes(x=Solar.R,y=Ozone,colour=col)) + geom_point(size=3)

##(7)K近邻法
install.packages("DMwR")
library(DMwR)
data("airquality")
air = knnImputation(airquality,k=10)
air$col = c("knn_imputation","notNA")[complete.cases(airquality[,1:2])+1]
ggplot(air,aes(Ozone,fill=col)) + geom_histogram(alpha=0.5,position = "identity")
#绘制插补后的Solar.R和Ozone的散点图
ggplot(air,aes(x=Solar.R,y=Ozone,colour=col)) + geom_point(size=3)


#####5.异常点的检测#####
##(1)单变量
set.seed(0402)
x = rnorm(100) #生成100个标准正态分布的随机数
boxplot.stats(x)$out #检测出来的异常点
boxplot(x) #绘制箱线图

##(2)两变量
set.seed(3148)
x = rnorm(100)
set.seed(3147)
y = rnorm(100)
df = data.frame(x,y)
attach(df)
#分别找出两变量异常点的索引
(a = which(x %in% boxplot.stats(x)$out))
(b = which(y %in% boxplot.stats(y)$out))
detach(df)
#交集
(outlier.list1 = intersect(a, b))
plot(df)
points(df[outlier.list1,], col="red", pch="+", cex=2.5)
#并集
(outlier.list2 = union(a, b))
plot(df)
points(df[outlier.list2,], col="blue", pch="+", cex=2.5)

##(3)3个及以上变量
##局部离群点因子(LOF)
library(DMwR)
iris2 = iris[,1:4] #删除列变量Species,它是一个分类型变量
outlier.scores = lofactor(iris2, k=5) #选择k=5作为近邻标准,用于计算LOF
dec_out = outlier.scores[order(outlier.scores,decreasing = T)];dec_out
#按LOF降序排列,将前5个点作为离群点
outliers = order(outlier.scores,decreasing = T)[1:5] 
#输出异常点编号
print(outliers)

n = nrow(iris2)
labels = 1:n
labels[-outliers] = "."
#结合前两个主成份的双标图呈现异常值
#prcomp()执行了一个主成分分析,并且biplot()使用前两个主成分画出了这些数据
biplot(prcomp(iris2), cex=.6, xlabs = labels)

#使用pairsPlot显示异常值
pch = rep(".", n)
pch[outliers] = "+"
col = rep("black", n)
col[outliers] = "red"
pairs(iris2,col=col,pch=pch)

##K-means算法检测离群点
iris2 = iris[,1:4] #删除列变量Species,它是一个分类型变量
kmeans.result = kmeans(iris2, centers = 3)
#聚类中心
kmeans.result$centers
#类别标签
kmeans.result$cluster
#分配每行数据的聚类中心
centers = kmeans.result$centers[kmeans.result$cluster,]
centers
#计算各点与聚类中心的距离
distances = sqrt(rowSums((iris2-centers)^2))
#按聚类降序排列,将前5个点作为离群点
outliers = order(distances,decreasing = T)[1:5] 
#输出异常点编号
print(outliers)
#以花萼长宽为坐标画出聚类情况
plot(iris2[,c("Sepal.Length","Sepal.Width")], pch="o",col=kmeans.result$cluster,cex=0.3)
#标记聚类中心
points(kmeans.result$centers[,c("Sepal.Length","Sepal.Width")], pch=8,col=1:3,cex=1.5)
#标记离群点
points(iris2[outliers,c("Sepal.Length","Sepal.Width")], pch="+",col=4,cex=1.5)

#####6.变量选择#####
#####过滤法#####
## 低方差变量处理
library(caret)
library(AppliedPredictiveModeling)
data(segmentationOriginal) #加载原始的细胞分割数据集
segData = subset(segmentationOriginal, Case == "Train") #提取其中标识为“Train”的训练样本
dim(segData) #训练样本有1009个观测,119个特征
#删除不需要的三列特征:细胞标识ID(Cell)、是否正确分割(Class)和细胞用于测试集还是训练集(Case)
segData = segData[,-(1:3)] 
#去除对本例无用的二元定性变量,它们的变量名都包含“status”
statusColNum = grep("Status", names(segData))
#删掉定性变量列,得到本例用的数据
segData = segData[,-statusColNum]
#返回该数据中低方差变量所在的列数
nearZeroVar(segData)

## 删除强相关变量
correlations = cor(segData)
dim(correlations)
correlations[1:4,1:4]  #查看前四个变量间的相关性
library(corrplot)
# 可视化展示相关系数矩阵,展示图已根据变量聚类后的结果对变量进行重排
corrplot(correlations, order = "hclust")
# 根据以上算法筛选出相关性最强的变量
highCorr = findCorrelation(correlations, cutoff = 0.75)
length(highCorr)  # 筛选出的变量个数是32个
highCorr
# 去除强相关变量
filteredSegData = segData[,-highCorr]

## 用变量聚类的方法过滤变量
library(Hmisc)
v = varclus(as.matrix(segData))
print(round(v$sim, 2))  # 显示变量的相关系数矩阵
plot(v)  # 显示层次树结构,可以看到很多变量之间有很强的相关性
#将变量聚成30个大类,而后在每个类中挑选一个变量
nvars = 30  
# 标记每类的类别编号(1-30)
tree = cutree(v$hclust,nvars)
# 统计每类的数量
tab = table(tree)
# 先建立长度为30的全0向量,后面用来填充每类中的一个变量
predictors.select = rep(0,30)
for (i in 1:nvars)
{
  # 若某类中只有一个变量,则选择该变量
  if (sum(tree == i) == 1)
    predictors.select[i] = names(tree[tree == i])
  # 若某类变量不止一个,随机取一个变量
  else
    predictors.select[i] = names(sample(tree[tree == i], 1))
}
predictors.select  # 显示随机选择的30个变量

#####变量重要性排序#####
#####(1)输入变量和输出变量都是数值型变量#####
library(AppliedPredictiveModeling)
data(solubility)
## 单变量与因变量的pearson相关系数
cor(solTrainXtrans$NumCarbon, solTrainY)

## 所有数值型变量与因变量的pearson相关系数
# 变量名中包含“FP”的变量是分类变量,将匹配出来并排除掉剩余的就是数值型变量
fpCols = grepl("FP", names(solTrainXtrans))
numericPreds = names(solTrainXtrans)[!fpCols]   #所有的数值型自变量
# 利用apply函数计算所有数值型变量与因变量solTrainY的pearson相关系数
corrValues = apply(solTrainXtrans[, numericPreds],
                   MARGIN = 2, #1表示按行计算,2表示按列计算
                   FUN = function(x, y) cor(x, y),
                   y = solTrainY)
head(corrValues)  #查看前六个
## 所有数值型变量与因变量的spearman相关系数
corrValues1 = apply(solTrainXtrans[, numericPreds],
                    MARGIN = 2,
                    FUN = function(x, y) cor(x, y,method = "spearman"),
                    y = solTrainY)
head(corrValues1) #查看前六个

## 局部加权回归LOESS的伪R2
smoother = loess(solTrainY ~ solTrainXtrans$NumCarbon)
smoother
#lattice包中的xyplot做LOESS图
library(lattice)
xyplot(solTrainY ~ solTrainXtrans$NumCarbon,
       type = c("p", "smooth"),
       xlab = "# Carbons",
       ylab = "Solubility")
#caret包中的filterVarImp
install.packages("caret")
library(caret)
loessResults = filterVarImp(x = solTrainXtrans[, numericPreds],
                            y = solTrainY,
                            nonpara = TRUE)
head(loessResults)
# 按照变量重要性排序,越重要序号越大
aaa = cbind(loessResults,rank(loessResults$Overall))

## 最大信息系数MIC
install.packages("minerva")
library(minerva)
micValues = mine(solTrainXtrans[, numericPreds], solTrainY)
# 计算出若干统计量,其中包括MIC
names(micValues)
head(micValues$MIC)
bbb = cbind(micValues$MIC, rank(micValues$MIC))
cbind(aaa,bbb)

#####(2)输入变量是分类变量输出变量是数值型变量#####
# 查看数据集分类变量的类别数
get_levels = function(x)
{
  out = levels(factor(x))
  out
}
FP_levels = apply(solTrainXtrans[, fpCols],
                  MARGIN = 2,
                  FUN = get_levels)
FP_levels = as.data.frame(t(FP_levels))
#按照FP044分两类,检验因变量均值是否相同
t.test(solTrainY ~ solTrainXtrans$FP044)
levels(factor(solTrainXtrans$FP002))
#分别按照FPxxx分两类,检验因变量均值是否相同,并输出t值和p值
getTstats = function(x, y)
{
  tTest = t.test(y~x)
  out = c(tStat = tTest$statistic, p = tTest$p.value)
  out
}
tVals = apply(solTrainXtrans[, fpCols],
              MARGIN = 2,
              FUN = getTstats,
              y = solTrainY)
## 转置以方便查看
tVals1 = as.data.frame(t(tVals))

head(tVals1)
# 筛选不能拒绝原假设的分类变量
uselessFP = tVals1[tVals1$p>0.05,]

你可能感兴趣的:(R,r语言)