一、相关数据处理R包
1、dplyr包
install.packages("dplyr")
install.packages("Lahman")
install.packages("hflights")
将过长过大的数据集转换为显示更友好的 tbl_df 类型:
hflights_df <- tbl_df(hflights)
dplyr包有五种处理数据的方式:
1)Select(选择)
用列名作参数来选择子数据集:
select(hflights_df, Year, Month, DayOfWeek)
还可以用 : 来连接列名:
select(hflights_df,Year:DayOfWeek)
用 -
来排除列名:
select(hflights_df, -(Year:DayOfWeek))
同样类似于R自带的 subset()
函数
subset(hflights_df,select=c(DepTime,ArrTime))
2)Filter(筛选)
按给定的逻辑判断筛选出符合要求的子数据集, 类似于 base::subset()
函数
filter(hflights_df, Month == 1, DayofMonth == 1)
除了代码简洁外, 还支持对同一对象的任意个条件组合, 如:
filter(hflights_df, Month == 1 | Month == 2)
3)Arrange(排列)
按给定的列名依次对行进行排序
arrange(hflights_df, DayofMonth, Month, Year)
对列名加 desc()
进行倒序:
arrange(hflights_df, desc(ArrDelay))
4)Mutate(变形)
对已有列进行数据运算并添加为新列:
mutate(hflights_df,gain = ArrDelay - DepDelay, speed = Distance / AirTime * 60)
作用与 plyr::mutate()
相同, 与 base::transform()
相似, 优势在于可以在同一语句中对刚增加的列进行操作:
mutate(hflights_df, gain = ArrDelay - DepDelay, gain_per_hour = gain / (AirTime / 60))
5)Summarise(汇总)
对数据框调用其它函数进行汇总操作, 返回一维的结果:
summarise(hflights_df,delay = mean(DepDelay, na.rm = TRUE))
分组动作group_by()
当对数据集通过 group_by()
添加了分组信息后,mutate()
, arrange()
和 summarise()
函数会自动对这些 tbl
类数据执行分组操作 (R语言泛型函数的优势).
例如: 对飞机航班数据按飞机编号 (TailNum) 进行分组, 计算该飞机航班的次数 (count = n()
), 平均飞行距离 (dist = mean(Distance, na.rm = TRUE)
) 和 延时 (delay = mean(ArrDelay, na.rm = TRUE)
)
# Group flights by aircraft tail number (TailNum), then compute per-plane
# summary statistics: number of flights, mean flight distance and mean
# arrival delay (missing values excluded via na.rm = TRUE). Finally keep
# only planes with more than 20 flights and a mean distance under 2000.
planes <- group_by(hflights_df, TailNum)
delay <- summarise(planes,count = n(),dist = mean(Distance, na.rm = TRUE), delay = mean(ArrDelay, na.rm = TRUE))
delay <- filter(delay, count > 20, dist < 2000)
连接符%>%
包里还新引进了一个操作符, 使用时把数据名作为开头, 然后依次对此数据进行多步操作.
Batting %>%group_by(playerID) %>%summarise(total = sum(G)) %>%arrange(desc(total)) %>%head(5)
2、data.table包
install.packages("data.table")
library(data.table)
这个包让你可以更快地完成数据集的数据处理工作。放弃选取行或列子集的传统方法,用这个包进行数据处理。用最少的代码,你可以做最多的事。相比使用data.frame,data.table可以帮助你减少运算时间。你一定会对这个包的简洁性感到震惊。
一个数据表格包含三部分,即DT[i, j, by]。你可以理解为我们告诉R用i来选出行的子集,并计算通过by来分组的j。大多数时候,by是用于类别变量的。
data("airquality")
mydata=airquality
mydata=data.table(mydata)
选择第2到4行数据
mydata[2:4,]
mydata[,Temp]
mydata[,.(Temp,Month)]
剔除某一列数据
mydata[,!c("Month"),with=FALSE]
按引用修改某一列的值(:= 是 data.table 的就地赋值操作符, 下例将 Month 列全部改为 6)
mydata[,Month:=6]
汇总计算
mydata[,sum(Ozone,na.rm=TRUE)]
mydata[,.(sum(Ozone,na.rm=TRUE),sd(Ozone,na.rm=TRUE))]
mydata[,v:=mean(Wind+Temp),by=Month]
3、tidyr包
library(tidyr)
library(dplyr)
head(mtcars)
mtcars$car=rownames(mtcars)
mtcars <- mtcars[, c(12, 1:11)]
1)gather函数:将宽数据变成长数据格式,类似于reshape2中的melt函数
gather(data, key, value, ..., na.rm = FALSE, convert = FALSE)
mtcarsNew <- mtcars %>% gather(attribute, value, -car)
2)spread函数:将长数据变成宽数据格式,类似于reshape2中的cast函数
spread(data, key, value, fill = NA, convert = FALSE, drop = TRUE)
mtcarsSpread <- mtcarsNew %>% spread(attribute, value)
head(mtcarsSpread)
3)Unite函数:将两列或更多列整合成一列
unite(data, col, ..., sep = "_", remove = TRUE)
先构造一些虚拟数据
# Build a small synthetic data set: 15 consecutive dates with random
# hour/minute/second components and a random letter as the event label.
set.seed(1)  # fix the RNG so the example is reproducible
date <- as.Date('2016-01-01') + 0:14
hour <- sample(1:24, 15)
min <- sample(1:60, 15)
second <- sample(1:60, 15)
event <- sample(letters, 15)
data <- data.frame(date, hour, min, second, event)
# unite() pastes several columns into one: first date + hour (separated by
# a space), then that intermediate column + min + second (separated by ':')
# into a single datetime column. remove = TRUE (the default) drops the
# source columns.
dataNew <- data %>%
unite(datehour, date, hour, sep = ' ') %>%
unite(datetime, datehour, min, second, sep = ':')
4)Separate函数:将一列分成两列或更多列
separate(data, col, into, sep = "[^[:alnum:]]+", remove = TRUE,convert = FALSE, extra = "warn", fill = "warn", ...)
# The inverse of unite(): split datetime into a date part and a time part
# on the space, then split the time part into hour/min/second on ':'.
data1 <- dataNew %>%
separate(datetime, c('date', 'time'), sep = ' ') %>%
separate(time, c('hour', 'min', 'second'), sep = ':')
4、 reshape2包
利用reshape2可以实现对宽数据及长数据之间的相关转换。
melt – convert data from wide format to long format
mtcars$car <- rownames(mtcars)
mtcarsMelt <- melt(mtcars)
head(mtcarsMelt)
mtcarsMelt <- melt(mtcars, id.vars = c('cyl', 'gear'), variable.name = 'carVariable', value.name = 'carValue')
head(mtcarsMelt)
tail(mtcarsMelt)
cast – convert data from long format to wide format
dcast – returns a dataframe as the output
acast – returns a vector/matrix/array as the output
mtcarsMelt <- melt(mtcars, id.vars = c('car','cyl', 'gear'), variable.name = 'carVariable', value.name = 'carValue')
mtcarsCast <- dcast(mtcarsMelt, car + cyl ~ carVariable)
head(mtcarsCast)
二、异常值数据处理
下面的程序利用Tukey's method来确认异常值(高于和低于1.5*IQR),对比了去除异常值前后的箱线图、直方图。对于异常值用NA进行替换。一般通过绘制盒形图来查看哪些点是离群点,而离群点的判断标准是以四分位数与四分位距为基础。即离群点超过上四分位数的1.5倍四分位距或低于下四分位数的1.5倍四分位距。
#表达式对象的求值 eval
#substitute 函数可以在表达式中使用变量,变量的值随运行进程而被替换。substitute函数中需要替换的变量用列表参数方式给出
outlierKD <- function(dt, var) {
  # Detect outliers in column `var` of data frame `dt` using Tukey's
  # method (boxplot.stats(): points beyond 1.5 * IQR outside the hinges),
  # draw before/after boxplots and histograms, print summary statistics,
  # and — after interactive confirmation — replace the outliers with NA
  # in the caller's copy of `dt` (via assign() into the global env).
  #
  # Args:
  #   dt:  a data frame, passed as an unquoted name.
  #   var: an unquoted column name within `dt`.
  #
  # Returns (invisibly): the modified data frame when the user confirms
  # removal, otherwise the column vector with outliers set to NA
  # (in that case `dt` itself is left unchanged).
  var_name <- eval(substitute(var), eval(dt))  # was eval_r(): not a real function
  tot <- sum(!is.na(var_name))
  na1 <- sum(is.na(var_name))
  m1 <- mean(var_name, na.rm = TRUE)
  # 2x2 panel: top row with outliers, bottom row without.
  par(mfrow = c(2, 2), oma = c(0, 0, 3, 0))
  boxplot(var_name, main = "With outliers")
  hist(var_name, main = "With outliers", xlab = NA, ylab = NA)
  outlier <- boxplot.stats(var_name)$out
  mo <- mean(outlier)
  # Blank out every value flagged as an outlier (%in% never yields NA).
  var_name <- ifelse(var_name %in% outlier, NA, var_name)
  boxplot(var_name, main = "Without outliers")
  hist(var_name, main = "Without outliers", xlab = NA, ylab = NA)
  title("Outlier Check", outer = TRUE)
  na2 <- sum(is.na(var_name))
  cat("Outliers identified:", na2 - na1, "\n")
  cat("Proportion (%) of outliers:", round((na2 - na1) / tot * 100, 1), "\n")
  cat("Mean of the outliers:", round(mo, 2), "\n")
  m2 <- mean(var_name, na.rm = TRUE)
  cat("Mean without removing outliers:", round(m1, 2), "\n")
  cat("Mean if we remove outliers:", round(m2, 2), "\n")
  response <- readline(prompt="Do you want to remove outliers and to replace with NA? [yes/no]: ")
  if (response %in% c("y", "yes")) {
    # Write the NA-substituted column back under its original name, then
    # replace the caller's data frame in the global environment.
    dt[as.character(substitute(var))] <- var_name
    assign(as.character(as.list(match.call())$dt), dt, envir = .GlobalEnv)
    cat("Outliers successfully removed", "\n")
    return(invisible(dt))
  } else {
    cat("Nothing changed", "\n")
    return(invisible(var_name))
  }
}
三、在R中利用数据库处理数据
dplyr 包支持sqlite, mysql
and postgresql
数据库,下面我们将看看R和sqlite数据库如何关联利用,我们将下载病人的人口统计信息、他们使用的药物、用过的测量仪器、反应、效果。我们将所有的数据集存入数据库,并且利用dplyr来处理数据库。
加载相关R包:
library(dplyr)
library(ggplot2)
library(data.table)
下载不良反应相关数据:
# Download FDA FAERS adverse-event data from nber.org for every quarter of
# every year in [year_start, year_last]. Each quarter has five files,
# identified by a file-name prefix:
#   demo = demography, drug = drug, reac = reaction,
#   outc = outcome,    indi = indication for use.
# Each zip is downloaded to data.zip and unpacked into the working
# directory, so the csv files accumulate for the later rbindlist() steps.
year_start <- 2013
year_last <- 2015
categories <- c("demo", "drug", "reac", "outc", "indi")
for (i in year_start:year_last) {
  for (m in 1:4) {  # four quarters per year
    for (category in categories) {
      url <- paste0("http://www.nber.org/fda/faers/", i, "/",
                    category, i, "q", m, ".csv.zip")
      # destfile spelled out (the original used dest=, which only worked
      # through partial argument matching).
      download.file(url, destfile = "data.zip")
      unzip("data.zip")
    }
  }
}  # NOTE: the original code was missing this closing brace for the outer loop
将季度的数据文件整合在一起,对每一种分类建立一个单一的数据集
Demography
# Demography: find every quarterly demo*.csv file in the working directory
# and stack them into a single data frame, reading only the columns needed
# later. fread(..., data.table=FALSE) returns plain data frames, which
# rbindlist() then binds row-wise.
filenames <- list.files(pattern="^demo.*.csv", full.names=TRUE)
demography = rbindlist(lapply(filenames, fread,
select=c("primaryid","caseid","age","age_cod","event_dt",
"sex","wt","wt_cod","occr_country"),data.table=FALSE))
str(demography)
Drug
# Drug: stack all quarterly drug*.csv files, keeping the report id,
# within-report drug sequence number, drug name and route.
filenames <- list.files(pattern="^drug.*.csv", full.names=TRUE)
drug = rbindlist(lapply(filenames, fread,
select=c("primaryid","drug_seq","drugname","route"),data.table=FALSE))
str(drug)
Diagnoses/Indications
# Diagnoses/Indications: stack all quarterly indi*.csv files; indi_drug_seq
# links each indication to a drug row within the same report.
filenames <- list.files(pattern="^indi.*.csv", full.names=TRUE)
indication = rbindlist(lapply(filenames, fread,
select=c("primaryid","indi_drug_seq","indi_pt"),data.table=FALSE))
str(indication)
Outcomes
# Outcomes: stack all quarterly outc*.csv files (report id + outcome code).
filenames <- list.files(pattern="^outc.*.csv", full.names=TRUE)
outcome = rbindlist(lapply(filenames, fread,
select=c("primaryid","outc_cod"),data.table=FALSE))
str(outcome)
Reaction (Adverse Event)
# Reaction (adverse event): stack all quarterly reac*.csv files
# (report id + preferred term for the reaction).
filenames <- list.files(pattern="^reac.*.csv", full.names=TRUE)
reaction = rbindlist(lapply(filenames, fread,
select=c("primaryid","pt"),data.table=FALSE))
str(reaction)
创建一个数据库
# Create a new SQLite database file named "adverse_events" in the working
# directory. NOTE(review): src_sqlite() is the legacy dplyr API, superseded
# by DBI::dbConnect() + dbplyr in recent versions — confirm the installed
# dplyr still exports it.
my_database<- src_sqlite("adverse_events", create = TRUE)
# create =TRUE creates a new database
将数据集导入到数据库
# Upload each combined data set into the SQLite database as a table;
# temporary = FALSE makes the tables persistent across connections.
copy_to(my_database,demography,temporary = FALSE) # uploading demography data
copy_to(my_database,drug,temporary = FALSE) # uploading drug data
copy_to(my_database,indication,temporary = FALSE) # uploading indication data
copy_to(my_database,reaction,temporary = FALSE) # uploading reaction data
copy_to(my_database,outcome,temporary = FALSE) #uploading outcome data
连接数据库
# Re-open a connection to the existing database (create = FALSE).
my_db <- src_sqlite("adverse_events", create = FALSE)
# create is false now because I am connecting to an existing database
列出数据库中存在的表格
src_tbls(my_db)
"demography" "drug" "indication" "outcome" "reaction" "sqlite_stat1"
数据库查询
# tbl() creates a lazy reference to the "demography" table: dplyr verbs on
# it are translated to SQL and executed inside the database, not in R.
demography = tbl(my_db,"demography" )
class(demography)
"tbl_sqlite" "tbl_sql" "tbl"
head(demography,3)
# filter() on a lazy tbl only builds the SQL WHERE clause; no rows are
# fetched until the result is printed or collect()ed. US$query shows the
# generated SQL.
US = filter(demography, occr_country=='US') # Filtering demography of patients from the US
US$query
SELECT "primaryid", "caseid", "age", "age_cod", "event_dt", "sex", "wt", "wt_cod", "occr_country"
FROM "demography"
WHERE "occr_country" = 'US'
连接数据库中的其他表格
# Lazy references to the remaining tables in the same database.
drug = tbl(my_db,"drug" )
indication = tbl(my_db,"indication" )
outcome = tbl(my_db,"outcome" )
reaction = tbl(my_db,"reaction" )
Find the top ten countries with the highest number of adverse events
# Count adverse-event reports per country, sort by count descending,
# drop rows with an empty country code, and keep the top ten.
demography%>%group_by(Country= occr_country)%>%
summarize(Total=n())%>%
arrange(desc(Total))%>%
filter(Country!='')%>% head(10)
We can also include ggplot in the chain:
# Same top-ten query, extended with a ggplot bar chart. The factor
# releveling orders the bars by Total so that, after coord_flip(), the
# country with the most reports appears at the top.
demography%>%group_by(Country= occr_country)%>% #grouped by country
summarize(Total=n())%>% # found the count for each country
arrange(desc(Total))%>% # sorted them in descending order
filter(Country!='')%>% # removed reports that does not have country information
head(10)%>% # took the top ten
mutate(Country = factor(Country,levels = Country[order(Total,decreasing =F)]))%>%
ggplot(aes(x=Country,y=Total))+geom_bar(stat='identity',color='skyblue',fill='#b35900')+
xlab("")+ggtitle('Top ten countries with highest number of adverse event reports')+
coord_flip()+ylab('Total number of reports')
Let’s join demography, outcome and reaction based on primary id:
# Join demography with outcome and reaction on primaryid. copy = TRUE lets
# dplyr copy data across sources when the tbls do not share a database.
inner_joined = demography%>%inner_join(outcome, by='primaryid',copy = TRUE)%>%
inner_join(reaction, by='primaryid',copy = TRUE)
We can also use primary key and secondary key in our joins. Let’s join drug and indication using two keys (primary and secondary keys).
# Composite-key join: rename indi_drug_seq to drug_seq so indication and
# drug share the (primaryid, drug_seq) key pair, then inner-join on both.
drug_indication= indication%>%rename(drug_seq=indi_drug_seq)%>%
inner_join(drug, by=c("primaryid","drug_seq"))