R语言练习-利用决策树模型分析泰坦尼克生还率(3)

R语言练习-利用决策树模型分析泰坦尼克生还率

三、利用ticket识别家庭后再进行分析

1.添加家庭号

# Estimate a baseline survival probability for every passenger, then group
# passengers into families by shared ticket and average that probability.
# NOTE(review): `f` (a train/validation split) and `ctree` (presumably
# party::ctree) are defined outside this excerpt -- confirm.

# Tag each row with its partition so the two sets can be separated again
# after the family features have been computed on the combined data.
f$train$type <- "T"
f$validation$type <- "V"
all <- rbind(f$train, f$validation)

# The baseline tree is fitted on the training partition only.
ctree_model <- ctree(
  survived ~ pclass + sex + age + sibsp + parch + fare + embarked,
  data = f$train
)

# Keep the first element of each per-row probability vector.
# vapply() fails loudly if an element is not a single number, whereas
# sapply() would silently change the shape of the result.
all$prob <- vapply(
  predict(ctree_model, newdata = all, type = "prob"),
  function(result) result[1],
  numeric(1)
)

# Add a family ID: passengers sharing a ticket are treated as one family.
library(plyr)

# Lookup table with one row per distinct non-missing ticket. Building it from
# the sorted unique tickets makes the IDs deterministic without mutating an
# outer counter from inside a callback. sort() drops NA; sprintf() returns a
# zero-length result when there are no tickets at all.
ticket_key <- as.character(all$ticket)
ticket_levels <- sort(unique(ticket_key))
ticket_based_family_id <- data.frame(
  ticket = ticket_levels,
  family_id = sprintf("TICKET_%d", seq_along(ticket_levels)),
  stringsAsFactors = FALSE
)

# One vectorized lookup instead of a subset() per row. match() yields NA for
# a missing ticket, so those rows keep an NA family_id.
all$family_id <- ticket_based_family_id$family_id[
  match(ticket_key, ticket_based_family_id$ticket)
]

# Attach each family's mean baseline probability to every one of its rows.
all <- ddply(all, .(family_id), function(rows) {
  rows$avg_prob <- mean(rows$prob)
  rows
})

2.使用age判断是子女还是父母

# Use age to guess whether each passenger is a parent or a child within the
# family. Adds two logical columns, `maybe_parent` and `maybe_child`.
all <- ddply(all, .(family_id), function(rows) {
  rows$maybe_parent <- FALSE
  rows$maybe_child <- FALSE
  # Leave both flags FALSE when the group has a single member, when nobody in
  # it reports a parent/child relation, or when every age is missing.
  if (NROW(rows) == 1 ||
      sum(rows$parch) == 0 ||
      NROW(rows) == sum(is.na(rows$age))) {
    return(rows)
  }
  max_age <- max(rows$age, na.rm = TRUE)
  min_age <- min(rows$age, na.rm = TRUE)
  # Only rows with both age and sex recorded are classified; the comparison
  # is done on whole columns instead of one adply() call per passenger.
  known <- !is.na(rows$age) & !is.na(rows$sex)
  # Within 10 years of the oldest member -> possible parent;
  # within 10 years of the youngest member -> possible child.
  rows$maybe_parent[known] <- (max_age - rows$age[known]) < 10
  rows$maybe_child[known] <- (rows$age[known] - min_age) < 10
  rows
})

3.分别计算父母的平均生还率与子女的平均生还率

# Compute, per family, the mean baseline survival probability of the likely
# parents and of the likely children. Both columns default to the family-wide
# mean (`avg_prob`) and are overridden only when such members exist.
all <- ddply(all, .(family_id), function(rows) {
  rows$avg_parent_prob <- rows$avg_prob
  rows$avg_child_prob <- rows$avg_prob
  # Single-member groups and groups without any parent/child relation keep
  # the family-wide default.
  if (NROW(rows) == 1 ||
      sum(rows$parch) == 0) {
    return(rows)
  }
  # which() drops NA flags, matching the behaviour of subset().
  parent_prob <- rows$prob[which(rows$maybe_parent == TRUE)]
  if (length(parent_prob) > 0) {
    rows$avg_parent_prob <- mean(parent_prob)
  }
  child_prob <- rows$prob[which(rows$maybe_child == TRUE)]
  if (length(child_prob) > 0) {
    # Assign to `avg_child_prob`, the column the model uses. The misspelled
    # name `avg_childt_prob` left this feature equal to `avg_prob`.
    rows$avg_child_prob <- mean(child_prob)
  }
  rows
})

4.再次使用ctree模型预测生还率

# Refit the conditional inference tree with the family-derived features and
# collect actual vs. predicted survival on the validation partition.
# NOTE(review): the `return()` and the closing brace below belong to an
# enclosing construct whose opening line is not visible in this excerpt;
# `evaluation` and `family_result` are also defined elsewhere -- confirm
# against the full script.
  f$train <- subset(all, type == "T")
  f$validation <- subset(all, type == "V")
  # Each predictor is listed once; repeating `age` and `sex` in the formula
  # adds nothing because duplicate terms are collapsed.
  m <- ctree(
    survived ~ pclass + sex + age + sibsp + parch + fare + embarked +
      maybe_parent + maybe_child + avg_parent_prob + avg_child_prob,
    data = f$train
  )
  print(m)
  predicted <- predict(m, newdata = f$validation)
  return(list(actual = f$validation$survived, predicted = predicted))
}
family_accuracy <- evaluation(family_result)

你可能感兴趣的:(r语言与数据分析)