Kaggle | Titanic: Machine Learning from Disaster

Titanic题目链接


下面解释一下训练集之中的数据:
PassengerId    旅客ID
Survived       是否活下来了,1:yes  0:no
Pclass         船票等级(1/2/3 分别代表一、二、三等舱)
Name           名字
Sex            性别
Age            年龄
SibSp          有多少兄弟姐妹/配偶同船 Number of Siblings/Spouses Aboard
Parch          有多少父母/子女同船 Number of Parents/Children Aboard
Ticket         船票号码
Fare           船票价格
Cabin          船舱号
Embarked       登船城市 Port of Embarkation C Q S 分别代表不同的城市

R语言代码

有几点要注意的:

1)为了提高预测准确率,创建了一些新的feature

2)train test最好并在一起处理,这样就不会有train和test factor level不一致导致random forest不work的问题

3)cforest感觉比randomForest好使,变量中有NA也不会报错,而且支持的factor level更多


library(ggplot2)
library(party)
library(rpart)

# Build the model feature table from raw Titanic passenger records.
#
# Meant to be called on the combined train + test rows so that every factor
# column ends up with the same levels in both subsets. Requires `rpart` to be
# attached (it is used to impute missing Age and Fare values).
#
# Args:
#   data: data frame with the columns Pclass, Age, SibSp, Parch, Fare, Cat,
#         Name, Sex, Embarked and Cabin.
#
# Returns:
#   A data frame with one row per row of `data`, holding Pclass, Age, SibSp,
#   Parch, Fare and Cat (missing Age and missing/zero Fare imputed) plus the
#   engineered columns Title, Gender, Mother, Child, Port_C, Port_Q, Port_S,
#   Surname, FamilySize, Family_id, Family_id2, Deck and CabinPos.
#
# Side effects:
#   Prints the title counts, the class of the raw cabin cluster ids and the
#   resulting column names. Draws from the RNG (kmeans), so call set.seed()
#   first for reproducible CabinPos values.
extractFeature <- function(data) {

  features <- c("Pclass", "Age", "SibSp", "Parch", "Fare", "Cat")
  fea <- data[, features]

  # "Braund, Mr. Owen Harris" splits into c("Braund", " Mr", " Owen Harris").
  name_parts <- strsplit(as.character(data$Name), "[.,]")

  # Title: second name part with blanks removed, rare titles merged.
  title <- vapply(name_parts, function(p) p[2], character(1))
  title <- gsub(" ", "", title)
  title[title %in% c("Capt", "Don", "Major", "Sir")] <- "Sir"
  # "the Countess" has already lost its blank at this point.
  title[title %in% c("Dona", "Lady", "theCountess", "Jonkheer")] <- "Lady"
  title[title %in% c("Mme", "Ms")] <- "Mrs"
  title[title %in% c("Mlle")] <- "Miss"
  fea$Title <- as.factor(title)
  print(summary(fea$Title))

  # Impute missing ages from the title with a regression tree.
  age_missing <- is.na(fea$Age)
  if (any(age_missing) && !all(age_missing)) {
    age_model <- rpart(Age ~ Title, data = fea[!age_missing, ],
                       method = "anova")
    fea$Age[age_missing] <- predict(age_model, newdata = fea[age_missing, ])
  }

  # Impute missing or zero fares from the passenger class.
  fare_missing <- is.na(fea$Fare) | fea$Fare == 0
  if (any(fare_missing) && !all(fare_missing)) {
    fare_model <- rpart(Fare ~ Pclass, data = fea[!fare_missing, ],
                        method = "anova")
    fea$Fare[fare_missing] <- predict(fare_model,
                                      newdata = fea[fare_missing, ])
  }

  # 1 for male, 0 for female.
  fea$Gender <- as.numeric(as.character(data$Sex) == "male")

  # Adult woman travelling with parents/children whose title is not "Miss".
  is_mother <- fea$Gender == 0 & fea$Parch > 0 & fea$Age > 18 &
    fea$Title != "Miss"
  fea$Mother <- as.numeric(!is.na(is_mother) & is_mother)

  # Passenger aged 18 or younger travelling with parents/children.
  # (The age test must use Age; testing Parch <= 18 is true for everybody.)
  is_child <- fea$Parch > 0 & fea$Age <= 18
  fea$Child <- as.numeric(!is.na(is_child) & is_child)

  # Replace missing/unknown ports by the most frequent valid port, then
  # one-hot encode. names() is needed because which.max() returns the
  # position in the table, not the port code.
  embarked <- as.character(data$Embarked)
  port_known <- embarked %in% c("C", "Q", "S")
  if (any(port_known) && !all(port_known)) {
    embarked[!port_known] <- names(which.max(table(embarked[port_known])))
  }
  fea$Port_C <- as.numeric(embarked == "C")
  fea$Port_Q <- as.numeric(embarked == "Q")
  fea$Port_S <- as.numeric(embarked == "S")

  # Family identifier = family size + surname; families seen at most twice
  # are pooled into a single "small" level to keep the factor manageable.
  fea$Surname <- vapply(name_parts, function(p) p[1], character(1))
  fea$FamilySize <- fea$SibSp + fea$Parch + 1
  family_id <- paste0(fea$FamilySize, fea$Surname)
  fea$Family_id <- as.factor(family_id)
  family_counts <- table(family_id)
  small_families <- names(family_counts)[family_counts <= 2]
  family_id[family_id %in% small_families] <- "small"
  fea$Family_id2 <- as.factor(family_id)

  # Deck = first character of the cabin string, "EMPTY" when there is none.
  cabin <- as.character(data$Cabin)
  cabin[is.na(cabin)] <- ""
  deck <- substr(cabin, 1, 1)
  deck[deck == ""] <- "EMPTY"
  fea$Deck <- as.factor(deck)

  # Cabin number = first run of digits in the cabin string ("C123" -> 123).
  num_match <- regexpr("[0-9]+", cabin)
  has_num <- num_match > 0
  cabin_num <- rep(NA_real_, length(cabin))
  cabin_num[has_num] <- as.numeric(regmatches(cabin, num_match))

  # Cluster the known cabin numbers into three groups. kmeans numbers its
  # clusters arbitrarily, so relabel them by ascending cluster centre to make
  # "Front" the lowest numbers and "End" the highest.
  cabin_pos <- rep(NA_integer_, nrow(fea))
  known_num <- cabin_num[has_num]
  if (length(unique(known_num)) >= 3) {
    clusters <- kmeans(known_num, 3)
    center_rank <- unname(rank(clusters$centers[, 1], ties.method = "first"))
    cabin_pos[has_num] <- center_rank[clusters$cluster]
  }
  print(class(cabin_pos))
  fea$CabinPos <- factor(cabin_pos, levels = 1:3,
                         labels = c("Front", "Middle", "End"))

  print(names(fea))
  fea
}

# ---- Train a conditional random forest and write the Kaggle submission ----

# Fix the RNG so kmeans (inside extractFeature) and the forest are repeatable.
set.seed(1)
train <- read.csv("input/train.csv", header = TRUE)
test <- read.csv("input/test.csv", header = TRUE)

# Tag each row with its origin so the two sets can be split again after the
# shared feature engineering.
train$Cat <- "train"
test$Cat <- "test"

# Give test the same columns as train so the two can be stacked.
test$Survived <- NA
full <- rbind(train, test)

# Engineer features on the combined data so factor levels agree between the
# training and test subsets.
full <- extractFeature(full)
feaTrain <- full[full$Cat == "train", ]
feaTest <- full[full$Cat == "test", ]

#rf <- randomForest(feaTrain, as.factor(train$Survived), ntree = 100, importance = TRUE)

feaTrain <- data.frame(Survived = as.factor(train$Survived), feaTrain)

# Refer to the response by column name so the formula is resolved entirely
# inside `data` instead of reaching for the global feaTrain object.
rf <- cforest(Survived ~ Pclass + Age + SibSp + Parch + Fare + Gender + Title +
                Mother + Child + Port_S + Port_Q + Port_C + Family_id2 +
                CabinPos + Deck,
              data = feaTrain,
              controls = cforest_unbiased(ntree = 200, mtry = 3))

# Predict the test set once; out-of-bag prediction only applies to the
# training rows, so it is not requested together with newdata.
pre <- predict(rf, newdata = feaTest)

out <- data.frame(test$PassengerId, pre)
names(out) <- c("PassengerId", "Survived")

write.csv(out, file = "Survived.csv", row.names = FALSE)

# In-sample accuracy (optimistic: measured on the rows the forest was fit on).
pre_train <- predict(rf, newdata = feaTrain)

out_train <- data.frame(train$Survived, pre_train)
names(out_train) <- c("Survived", "pre")
print(mean(out_train$Survived == out_train$pre))



你可能感兴趣的:(kaggle,data,mining,and,analysis)