Titanic: Machine Learning from Disaster

https://www.kaggle.com/c/titanic

2015.9.1
随机森林测试:
特征:
pclass+age+sibsp+parch+fare+sexnum+sex.name

树为50棵的时候,训练集正确率为90%
树为15000棵的时候,测试集正确率为77.97%,和给的例子结果一样

在上面的基础上如果再增加其他原始特征,测试集的结果反而变差了。

特征的问题:

性别特征

年龄特征

家庭成员特征

组队特征

fare中有80%的数据都小于40

2015.9.9
方法:
对决策树、随机森林、性别模型三个预测结果进行投票:至少有两个预测为生存的,认为最终生存
结果:
准确率:0.8086124 预测对了338个 比之前的提高了7个
0 1
275 143

把两个年龄小于两岁的乘客的survive设为1之后,准确率反而降低了,这两个都预测错了,它们所在的行号是:355和285

上面的方法只是让得票多的类别获胜(多数投票)。

# Switch to the directory that holds the Kaggle Titanic CSV files, then load
# the training set and the test set. Relative paths used later in the script
# are resolved against this working directory.
setwd("E:/kaggle/titanic/data")
train <- read.csv(file = "train.csv")
test <- read.csv(file = "test.csv")

# Stack train and test so every engineered feature is computed on both sets
# at once. The test rows get survived = 0 purely as a placeholder so the two
# data frames have the same columns for rbind().
test$survived <- 0
comb <- rbind(train, test)
comb$name <- as.character(comb$name)

# Split each name on "," and "." and keep the first piece as the surname and
# the second piece as the title.
# NOTE(review): assumes names are formatted "Surname, Title. Given names".
name_parts <- strsplit(comb$name, split = "[,.]")
comb$surname <- vapply(name_parts, function(parts) parts[1], character(1))
comb$title <- vapply(name_parts, function(parts) parts[2], character(1))

# The title piece still carries the space that followed the comma.
comb$title <- sub("^ +", "", comb$title)

# Merge rare titles into a few broader groups.
comb$title[comb$title %in% c("Mme", "Mlle")] <- "Mlle"
comb$title[comb$title %in% c("Capt", "Don", "Major", "Sir")] <- "Sir"
# After the split the countess's title reads "the Countess" (article
# included), so that spelling must be listed too or she is never regrouped.
lady_titles <- c("Dona", "Lady", "Countess", "the Countess", "Jonkheer")
comb$title[comb$title %in% lady_titles] <- "Lady"

comb$title <- as.factor(comb$title)

# Fill blank embarkation ports with "S", then make the categorical
# predictors factors.
comb$embarked[comb$embarked %in% ""] <- "S"
comb$embarked <- as.factor(comb$embarked)
# `sex` is used as a predictor next to `embarked` and `title`; factor it
# explicitly as well so it does not depend on read.csv()'s string handling.
comb$sex <- as.factor(comb$sex)

# Impute missing fares with the median fare. A missing numeric value is NA,
# not "", and comparing NA with == never yields TRUE, so is.na() is required
# for the imputation to take effect.
comb$fare[is.na(comb$fare)] <- median(comb$fare, na.rm = TRUE)

# Cap fares at 40 (the notes at the top of the file observe that about 80%
# of fares are below 40).
comb$fare <- pmin(comb$fare, 40)

# Family size counts the passenger plus siblings/spouses and parents/children.
comb$family_size <- comb$sibsp + comb$parch + 1
# Family identifier: family size immediately followed by the surname.
comb$family_id <- paste0(comb$family_size, comb$surname)

# isfamily is 1 for passengers travelling alone and 2 otherwise.
comb$isfamily <- 2
comb$isfamily[which(comb$family_size == 1)] <- 1
library("rpart")
library("rpart.plot")

# Impute missing ages: fit a regression tree (method = "anova") on the rows
# whose age is known, then predict an age for the rows where it is missing.
age_missing <- is.na(comb$age)
get_age_model <- rpart(
  age ~ pclass + sex + sibsp + parch + fare + embarked + title + family_size +
    isfamily,
  data = comb[!age_missing, ],
  method = "anova"
)
comb$age[age_missing] <- predict(get_age_model, comb[age_missing, ])

# Undo the earlier rbind(train, test): the first nrow(train) rows of `comb`
# are the training passengers and the remaining rows are the test passengers.
# The boundaries are derived from the data instead of being hard-coded, so
# the split stays correct if the input files change size.
n_train <- nrow(train)
train_new <- comb[seq_len(n_train), ]
test_new <- comb[n_train + seq_len(nrow(comb) - n_train), ]

# Local validation: train on a random 60% of the training set and measure accuracy on the remaining 40%
# val = sample(1:nrow(train_new),0.6*nrow(train_new))
# sub_train_new <- train_new[val,]
# sub_test_new <- train_new[-val,]


# tree_model <- rpart(survived~pclass+sex+sibsp+parch+fare+embarked+title+family_size+isfamily
# 
# ,data = sub_train_new,method = "class",control=rpart.control(cp=0.0001))
# 
# prp(tree_model,type=4,extra = 100)
# 
# sub_test_new$predict <- predict(tree_model,sub_test_new,type = "class")
# 
# acc = length(which(sub_test_new$survived == sub_test_new$predict))/nrow(sub_test_new)
# 
# acc

# Decision tree
# Fit a classification tree on the engineered features, using a small
# complexity parameter (cp = 0.0001).
tree_model <- rpart(
  survived ~ pclass + sex + sibsp + parch + fare + embarked + title +
    family_size,
  data = train_new,
  method = "class",
  control = rpart.control(cp = 0.0001)
)

# Predicted class label for every test passenger.
rpartsurvived <- predict(tree_model, test_new, type = "class")

# result <- data.frame(PassengerId = test_new$passengerid, Survived = test_new$survived)
# 
# write.csv(result,file="Submissions//rpart_add_family.csv",row.names=F)


# Random forest
library(randomForest)

# The response is converted to a factor before fitting the forest.
train_new <- as.data.frame(train_new)
train_new$survived <- as.factor(train_new$survived)

model <- randomForest(
  survived ~ pclass + sex + sibsp + parch + fare + embarked + title +
    family_size,
  data = train_new,
  ntree = 50,
  importance = TRUE
)
randomForestsurvived <- predict(model, test_new)

# Third set of predictions, read from a previously saved submission file;
# its second column is used as the "gender" vote.
gendersurvived <- read.csv("Submissions//genderclassmodel.csv")

# Put the three prediction vectors side by side, one column per model, and
# save them so the votes can be tallied afterwards.
rpartsurvived <- as.data.frame(rpartsurvived)
randomForestsurvived <- as.data.frame(randomForestsurvived)
rpart_RF_gender <- setNames(
  cbind(rpartsurvived, randomForestsurvived, gendersurvived[, 2]),
  c("rpart", "RF", "gender")
)
write.csv(
  rpart_RF_gender,
  file = "Submissions//all_result_rpart_RF_gender.csv",
  row.names = FALSE
)

# Tally the votes per passenger.
# Reading the saved CSV back turns the prediction columns into plain 0/1
# values that can be counted.
rpart_RF_gender <- read.csv("Submissions//all_result_rpart_RF_gender.csv")

# The votes must line up one-to-one with the test passengers.
stopifnot(nrow(rpart_RF_gender) == nrow(test_new))

# Count the "survived" votes in each row. na.rm = TRUE makes a model that
# produced no prediction for a passenger abstain, instead of turning the
# whole row total into NA (which would silently be scored as "died").
sum_class <- rowSums(rpart_RF_gender == 1, na.rm = TRUE)

# A passenger is finally predicted to survive when at least two of the
# three models say so.
val <- which(sum_class >= 2)

summary_rpart_rf_gender <- data.frame(PassengerId = test_new$passengerid)
summary_rpart_rf_gender$Survived <- 0
summary_rpart_rf_gender$Survived[val] <- 1

write.csv(
  summary_rpart_rf_gender,
  file = "Submissions//summary_rpart_RF_gender.csv",
  row.names = FALSE
)

# Result: accuracy 0.8086124 (338 correct predictions, 7 more than the previous submission)

你可能感兴趣的:(机器学习)