# Count missing values (统计缺失个数)
#' Count the missing entries in one or more columns.
#'
#' An entry is treated as missing when it is `NA` or equal to the empty
#' string `""`.
#'
#' @param data A data frame (or matrix) indexed as `data[, x]`.
#' @param x Column name(s) or index/indices to inspect.
#' @return The total number of missing entries across the selected column(s).
na_count <- function(data, x) {
  values <- data[, x]
  is_blank <- values == ""
  is_absent <- is.na(values)
  # `NA | TRUE` is TRUE, so NA entries never turn the sum itself into NA.
  sum(is_blank | is_absent)
}
# Example:
# Counts the NA or empty-string entries in the "hypertension_before" column of
# Data_Tmp (Data_Tmp is not defined in the visible part of this file).
na_count(Data_Tmp,"hypertension_before")
# Imputation (插补)
#' Impute missing entries by random draws from the observed values.
#'
#' For every column in `xx`, entries that are `NA` or the empty string `""`
#' are replaced with values sampled at random from the non-missing entries of
#' the same column. Despite the name, no mean/median/mode is computed.
#'
#' Donor rows are restricted to observed entries, so an imputed cell can never
#' receive another missing value. Sampling is without replacement when there
#' are at least as many observed as missing entries, and with replacement
#' otherwise. A column with no observed entries is left untouched and a
#' warning is raised.
#'
#' @param data A data frame (or matrix) indexed as `data[, i]`.
#' @param xx Column names or indices to impute.
#' @return `data` with the missing entries of the selected columns filled in.
centralImputation <- function(data, xx) {
  for (i in c(xx)) {
    column <- data[, i]
    is_missing <- is.na(column) | column == ""
    idx <- which(is_missing)
    if (length(idx) == 0L) {
      next
    }

    donors <- which(!is_missing)
    if (length(donors) == 0L) {
      warning(
        "Column ", i, " has no observed values; left unchanged.",
        call. = FALSE
      )
      next
    }

    # Draw positions within `donors` rather than calling sample(donors, ...):
    # sample() treats a single number n as the range 1:n.
    picks <- sample.int(
      length(donors),
      size = length(idx),
      replace = length(idx) > length(donors)
    )
    data[idx, i] <- data[donors[picks], i]
  }
  data
}
# One-hot encoding procedure
# Convert every character column of Data_Tmp to a factor so that the later
# steps can separate categorical columns (factors) from all other columns.
# is.character() and [[i]] are used instead of class(Data_Tmp[, i]) ==
# "character": the latter yields a length > 1 condition (an error in `if`
# since R 4.2) when the column, or a tibble slice, carries several classes.
# seq_len() keeps the loop from running when there are no columns.
for (i in seq_len(ncol(Data_Tmp))) {
  if (is.character(Data_Tmp[[i]])) {
    Data_Tmp[[i]] <- as.factor(Data_Tmp[[i]])
  }
}
# Collect every non-factor column of Data_Tmp into the numeric matrix
# Matrix_Nu (one matrix column per data column, original names kept).
#
# Columns are classified with is.factor() both here and in the fill loop.
# Comparing sapply(class) with "factor" miscounts columns whose class vector
# has more than one element (e.g. ordered factors, POSIXct), which made the
# matrix width disagree with the number of columns actually written.
is_num_col <- !vapply(Data_Tmp, is.factor, logical(1))
NN <- sum(is_num_col)          # number of non-factor columns
NRT <- nrow(Data_Tmp)          # number of rows
NR_Tmp <- ncol(Data_Tmp)       # total number of columns
Matrix_Nu <- matrix(0, nrow = NRT, ncol = NN)
colnames(Matrix_Nu) <- names(Data_Tmp)[is_num_col]
# J tracks the next free column of Matrix_Nu.
J <- 1
for (i in seq_len(NR_Tmp)) {
  if (!is.factor(Data_Tmp[[i]])) {
    Matrix_Nu[, J] <- Data_Tmp[[i]]
    J <- J + 1
  }
}
# Collect every factor column of Data_Tmp into the character matrix
# Matrix_Ca, with missing values recorded as the explicit label "NA".
# Relies on NRT (row count) and NR_Tmp (column count) computed earlier in
# the script.
#
# Columns are classified with is.factor() (sapply(class) == "factor"
# miscounts multi-class columns such as ordered factors). Values are stored
# via as.character(): assigning a factor directly into a character matrix
# writes its integer codes ("1", "2", ...), which discarded the level labels,
# including the "NA" label.
is_cat_col <- vapply(Data_Tmp, is.factor, logical(1))
NC <- sum(is_cat_col)          # number of factor columns
Matrix_Ca <- matrix(NA_character_, nrow = NRT, ncol = NC)
colnames(Matrix_Ca) <- names(Data_Tmp)[is_cat_col]
# J tracks the next free column of Matrix_Ca.
J <- 1
for (i in seq_len(NR_Tmp)) {
  if (is.factor(Data_Tmp[[i]])) {
    QQbq <- as.character(Data_Tmp[[i]])
    QQbq[is.na(QQbq)] <- "NA"
    Matrix_Ca[, J] <- QQbq
    J <- J + 1
  }
}
# Turn the categorical matrix back into a data frame of factors.
DF_Ca <- as.data.frame(Matrix_Ca)
DF_Ca[] <- lapply(DF_Ca, as.factor)

# Keep only the columns whose number of levels differs from one: a
# single-level factor carries no information and model.matrix() cannot build
# contrasts for it. drop = FALSE keeps DF_Ca a data frame even when a single
# column survives (otherwise names(DF_Ca) would be NULL and the formula
# below could not be built).
DF_Ca <- DF_Ca[, vapply(DF_Ca, nlevels, integer(1)) != 1, drop = FALSE]
# Inspection: number of levels per remaining categorical column.
(vapply(DF_Ca, nlevels, integer(1)))

# Build the design-matrix formula from the remaining column names.
# Names are back-quoted so non-syntactic column names still parse; when no
# categorical column is left, an intercept-only formula is used instead of
# an empty (unparseable) right-hand side.
xnam <- names(DF_Ca)
fmla <- if (length(xnam) > 0) {
  as.formula(paste("~", paste0("`", xnam, "`", collapse = " + ")))
} else {
  ~ 1
}
# Dummy-coded categorical block (includes the "(Intercept)" column).
Ca_M <- model.matrix(fmla, DF_Ca)

# Unscaled training matrix: numeric block followed by the dummy block.
Training_Final <- cbind(Matrix_Nu, Ca_M)
# Inspection: levels of each categorical column. lapply() is used because
# apply() first coerces the data frame to a character matrix, for which
# levels() is always NULL.
lapply(DF_Ca, levels)

# Scaled training matrix: numeric columns are centred and scaled by scale().
# NOTE(review): a constant numeric column has zero standard deviation and
# scale() will return NaN for it - confirm such columns cannot occur.
Training_Finals <- cbind(scale(Matrix_Nu), Ca_M)
Training_Finals %>% glimpse
# Inspection: level counts again, and the first rows of the numeric block.
vapply(DF_Ca, nlevels, integer(1))
head(Matrix_Nu)
Training_Final1s <- as.data.frame(Training_Finals)