library("dplyr")

# Read the training and test sets from the working directory.
train <- read.csv("train.csv")
test <- read.csv("test.csv")

# Stack train and test into one data frame so that feature engineering is
# applied to both at once (train rows first, test rows after).
binddata <- bind_rows(train, test)

# Quick look at the structure and a per-column summary.
str(binddata)
summary(binddata)

# Treat the outcome as a categorical variable.
binddata[["Survived"]] <- factor(binddata[["Survived"]])
library(ggplot2)
library(ggthemes)

# Dodged bar chart of passenger counts per cabin class, restricted to the
# training rows and filled by survival outcome; each bar is labelled with its
# count.
ggplot(
  data = binddata[1:nrow(train), ],
  mapping = aes(x = Pclass, y = ..count.., fill = Survived)
) +
  geom_bar(stat = "count", position = "dodge") +
  xlab("客舱等级") +
  ylab("乘客数量") +
  ggtitle("不同客舱等级对存活率的影响") +
  scale_fill_manual(values = c("red", "green")) +
  theme_economist(base_size = 16) +
  geom_text(
    mapping = aes(label = ..count..),
    stat = "count",
    position = position_dodge(width = 1),
    vjust = -0.5
  )
library(InformationValue)

# WOE table of Pclass (as a factor) against Survived, training rows only.
WOETable(
  X = factor(binddata[["Pclass"]][1:nrow(train)]),
  Y = binddata[["Survived"]][1:nrow(train)]
)

# Information value for the same pair of variables.
IV(
  X = factor(binddata[["Pclass"]][1:nrow(train)]),
  Y = binddata[["Survived"]][1:nrow(train)]
)
# Extract the title from Name by deleting everything up to and including the
# last ", " and everything from the first "." onwards.
binddata$Title <- gsub('(.*, )|(\\..*)', '', binddata$Name)
table(binddata$Title)

# Infrequent titles that are merged into one level.
# (The original vector was missing the opening quote of 'Dr' and had the 'Rev'
# literal split across a line break, which made the script unparseable.)
rare_title <- c('Dona', 'Lady', 'the Countess', 'Capt', 'Col', 'Don', 'Dr',
                'Major', 'Rev', 'Sir', 'Jonkheer')

# Fold variant titles into the common ones: Mlle and Ms become Miss,
# Mme becomes Mrs.
binddata$Title[binddata$Title %in% c('Mlle', 'Ms')] <- 'Miss'
binddata$Title[binddata$Title == 'Mme'] <- 'Mrs'

# Everything listed in rare_title becomes the single level 'Rare Title'.
binddata$Title[binddata$Title %in% rare_title] <- 'Rare Title'
# New variable Fsize: family size = siblings/spouses + parents/children + the
# passenger themselves.
binddata$Fsize <- binddata$SibSp + binddata$Parch + 1

# Surname is the part of Name before the first "," or ".".
# vapply guarantees a plain character vector (sapply would also attach the
# full names as element names).
binddata$Surname <- vapply(
  strsplit(as.character(binddata$Name), split = '[,.]'),
  function(parts) parts[1],
  character(1)
)

# FamilyID = family size followed by surname.
# Fix: the original read the non-existent column `FSize`, so the size was
# silently dropped from the ID.
binddata$FamilyID <- paste0(binddata$Fsize, binddata$Surname)

# Families of one or two people are all labelled 'Small'.
binddata$FamilyID[binddata$Fsize <= 2] <- 'Small'

# Clean up inconsistent IDs: any FamilyID that occurs at most twice is also
# relabelled 'Small'.
famIDs <- data.frame(table(binddata$FamilyID))
famIDs <- famIDs[famIDs$Freq <= 2, ]
# Fix: the original assigned into an undefined object `FamilyID` instead of the
# column of binddata.
binddata$FamilyID[binddata$FamilyID %in% famIDs$Var1] <- 'Small'
# 4.3 Create the TicketCount variable and assign its values

# Count the rows per ticket number and keep the result as a separate data
# frame, binddatat (columns: Ticket, n).
binddatat <- binddata %>%
  count(Ticket)
table(binddatat$n)
# Original author's note on the table above: of the 1309 people, 713 ticket
# numbers are not repeated, 132 ticket numbers appear twice (264 people), and
# so on.
binddatat <- as.data.frame(binddatat)

# Look up each passenger's ticket frequency with a vectorized match (instead of
# apply() over the data frame, which coerces every row to character and scans
# binddatat once per row). A count of 1 is labelled 'unique', any other count
# 'share'.
binddata$TicketCount <- ifelse(
  binddatat$n[match(binddata$Ticket, binddatat$Ticket)] == 1,
  'unique',
  'share'
)
# Number of NA values per column.
colSums(is.na(binddata))

# Number of empty strings per column. na.rm = TRUE is required: without it any
# column that contains an NA reports NA instead of a count.
vapply(binddata, function(x) sum(x == "", na.rm = TRUE), integer(1))

# Row index of the missing Fare.
which(is.na(binddata$Fare))
# Original author's note: Age has 263 missing values, and the NA in Fare is at
# row 1044.

# Inspecting binddata[1044, ] gives that passenger's Pclass and Embarked; the
# fill value was chosen by the author from a summary of that group.
binddata$Fare[1044] <- 8.05

# Embarked is handled the same way: look at the rows, find the matching group,
# summarise it and assign.
binddata$Embarked[c(62, 830)] <- 'S'
library("mice")
library("lattice")

# Rebuild the raw combined data and run mice on it with column 2 dropped.
newdata <- bind_rows(train, test)
imp <- mice(
  newdata[, -2],
  m = 5,
  method = "rf",
  maxit = 500,
  seed = 5514
)

# Take a completed data set from the imputation and copy its Age column into
# binddata.
miceout <- complete(imp)
binddata[["Age"]] <- miceout[["Age"]]
# 5. Build the model and predict
library(randomForest)
library('party')
library('zoo')

# Convert the model's categorical predictors to factors in one pass.
# Sex is included so that it is a factor even when read.csv leaves strings as
# character (the default since R 4.0); factor() on an existing factor is
# harmless.
# NOTE(review): Age is presumably numeric after imputation; converting it to a
# factor (as the original script does) treats every distinct age as its own
# category. Confirm this is intended.
factor_cols <- c("Age", "Sex", "Embarked", "Title", "Fsize", "FamilyID",
                 "TicketCount")
binddata[factor_cols] <- lapply(binddata[factor_cols], factor)
# First split the combined data back apart. The sizes come from the original
# train/test frames rather than the hard-coded 891/1309, so the split stays
# correct if the input files change.
train1 <- head(binddata, nrow(train))
test1 <- tail(binddata, nrow(test))

# Then build the random forest model (conditional inference forest from the
# party package).
model <- cforest(
  Survived ~ Pclass + Sex + Age + Fare + Embarked + Title + Fsize +
    FamilyID + TicketCount,
  data = train1,
  controls = cforest_unbiased(ntree = 2000, mtry = 3)
)

# Predict the response for the test rows and write the submission file.
Prediction <- predict(model, test1, OOB = TRUE, type = "response")
submit <- data.frame(PassengerId = test$PassengerId, Survived = Prediction)
write.csv(submit, file = "G:/forest.csv", row.names = FALSE)