import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
import warnings
warnings.filterwarnings('ignore')
from collections import Counter
from sklearn.ensemble import (RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier,
                              ExtraTreesClassifier, VotingClassifier)
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV, cross_val_score, StratifiedKFold, learning_curve
train = pd.read_csv("C:/Code/Kaggle/Titanic/train.csv")
test = pd.read_csv("C:/Code/Kaggle/Titanic/test.csv")
IDtest = test["PassengerId"]
def detect_outliers(df, n, features):
    """Return indices of rows that are Tukey (IQR) outliers in more than n of the given features."""
    outlier_indices = []
    for col in features:
        Q1 = np.percentile(df[col], 25)
        Q3 = np.percentile(df[col], 75)
        IQR = Q3 - Q1
        outlier_step = 1.5 * IQR
        # rows whose value falls outside [Q1 - 1.5*IQR, Q3 + 1.5*IQR] for this column
        outlier_list_col = df[(df[col] < Q1 - outlier_step) | (df[col] > Q3 + outlier_step)].index
        outlier_indices.extend(outlier_list_col)
    outlier_indices = Counter(outlier_indices)
    multiple_outliers = [k for k, v in outlier_indices.items() if v > n]
    return multiple_outliers
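# Tiny illustration (a hedged sketch, not part of the original kernel): with the values 1..10 plus 100,
# the Tukey rule above flags only the 100 as an outlier for that column.
_demo = pd.DataFrame({"x": list(range(1, 11)) + [100]})
print(detect_outliers(_demo, 0, ["x"]))  # expected: [10] (the row index of the value 100)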
Outliers_to_drop = detect_outliers(train,2,["Age","SibSp","Parch","Fare"])
train.loc[Outliers_to_drop]
train = train.drop(Outliers_to_drop,axis=0).reset_index(drop=True)
train_len = len(train)
dataset = pd.concat([train,test], axis=0).reset_index(drop=True)
dataset.tail()
#dataset = dataset.fillna(np.nan)
dataset.isnull().sum()
train.info()
train.isnull().sum()
train.describe()
g = sns.heatmap(train[["Survived","SibSp","Parch","Age","Fare"]].corr(), annot=True, fmt=".2f", cmap = "coolwarm")
g = sns.factorplot(x="SibSp",y="Survived",data=train,kind="bar")
g = g.set_ylabels("survival probability")
g = sns.factorplot(x="Parch",y="Survived",data=train,kind="bar")
g = g.set_ylabels("survival probability")
g = sns.kdeplot(train["Age"][(train["Survived"] == 0) & (train["Age"].notnull())], color="Red", shade = True)
g = sns.kdeplot(train["Age"][(train["Survived"] == 1) & (train["Age"].notnull())], ax =g, color="Blue", shade= True)
g.set_xlabel("Age")
g.set_ylabel("Frequency")
g = g.legend(["Not Survived","Survived"])
# Explore Age vs Sex, Parch , Pclass and SibSP
g = sns.factorplot(y="Age",x="Sex",data=dataset,kind="box")
g = sns.factorplot(y="Age",x="Sex",hue="Pclass", data=dataset,kind="box")
g = sns.factorplot(y="Age",x="Parch", data=dataset,kind="box")
g = sns.factorplot(y="Age",x="SibSp", data=dataset,kind="box")
# convert Sex into categorical value 0 for male and 1 for female
dataset["Sex"] = dataset["Sex"].map({"male": 0, "female":1})
g = sns.heatmap(dataset[["Age","Sex","SibSp","Parch","Pclass"]].corr(),cmap="coolwarm",annot=True)
# Filling missing value of Age
## Fill Age with the median age of similar rows according to Pclass, Parch and SibSp
# Index of NaN age rows
index_NaN_age = list(dataset["Age"][dataset["Age"].isnull()].index)
for i in index_NaN_age:
    age_med = dataset["Age"].median()
    # median age of passengers with the same SibSp, Parch and Pclass
    age_pred = dataset["Age"][(dataset['SibSp'] == dataset.iloc[i]["SibSp"]) &
                              (dataset['Parch'] == dataset.iloc[i]["Parch"]) &
                              (dataset['Pclass'] == dataset.iloc[i]["Pclass"])].median()
    if not np.isnan(age_pred):
        dataset.loc[i, 'Age'] = age_pred  # .loc avoids the chained-indexing assignment of the original
    else:
        dataset.loc[i, 'Age'] = age_med
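# Quick sanity check (not in the original): every Age should now be imputed.
assert dataset["Age"].isnull().sum() == 0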
dataset.tail()
g = sns.factorplot(x="Survived", y = "Age",data = train, kind="box")
g = sns.factorplot(x="Survived", y = "Age",data = train, kind="violin")
#Fill Fare missing values with the median value
dataset["Fare"] = dataset["Fare"].fillna(dataset["Fare"].median())
g = sns.distplot(dataset["Fare"], color="m")
g = g.legend(loc="best")
# Apply log to Fare to reduce skewness distribution
dataset["Fare"] = dataset["Fare"].map(lambda i: np.log(i) if i > 0 else 0)
g = sns.distplot(dataset["Fare"], color="b")
g = g.legend(loc="best")
g = sns.factorplot(x="Sex",y="Survived",data=train,kind="bar")
g = g.set_ylabels("Survival Probability")
train[["Sex","Survived"]].groupby('Sex').mean()
g = sns.factorplot(x="Pclass",y="Survived",data=train,kind="bar", size = 6 ,
palette = "muted")
g = g.set_ylabels("survival probability")
g = sns.factorplot(x="Pclass", y="Survived", hue="Sex", data=train,
size=6, kind="bar", palette="muted")
g = g.set_ylabels("survival probability")
#Fill Embarked nan values of dataset set with 'S' most frequent value
dataset["Embarked"] = dataset["Embarked"].fillna("S")
g = sns.factorplot(x="Embarked", y="Survived", data=train,
size=6, kind="bar", palette="muted")
g = g.set_ylabels("survival probability")
# Explore Pclass vs Embarked
g = sns.factorplot("Pclass", col="Embarked", data=train,
size=6, kind="count", palette="muted")
g = g.set_ylabels("Count")
dataset["Name"].head()
# Get Title from Name
dataset_title = [i.split(",")[1].split(".")[0].strip() for i in dataset["Name"]]
dataset["Title"] = pd.Series(dataset_title)
dataset["Title"].head()
g = sns.countplot(x="Title",data=dataset)
g = plt.setp(g.get_xticklabels(), rotation=45)
# Convert to categorical values Title
dataset["Title"] = dataset["Title"].replace(['Lady', 'the Countess','Countess','Capt', 'Col','Don', 'Dr', 'Major', 'Rev', 'Sir', 'Jonkheer', 'Dona'], 'Rare')
dataset["Title"] = dataset["Title"].map({"Master":0, "Miss":1, "Ms" : 1 , "Mme":1, "Mlle":1, "Mrs":1, "Mr":2, "Rare":3})
dataset["Title"] = dataset["Title"].astype(int)
g = sns.countplot(dataset["Title"])
g = g.set_xticklabels(["Master","Miss/Ms/Mme/Mlle/Mrs","Mr","Rare"])
g = sns.factorplot(x="Title",y="Survived",data=dataset[:train_len],kind="bar")
g = g.set_xticklabels(["Master","Miss-Mrs","Mr","Rare"])
g = g.set_ylabels("survival probability")
# Drop Name variable
dataset.drop(labels = ["Name"], axis = 1, inplace = True) #inplace为True时返回None,为默认False时返回dataset
# convert to indicator values Title and Embarked
Title_dummies = pd.get_dummies(dataset['Title'],prefix='Title')
dataset = dataset.join(Title_dummies).drop(['Title'],axis=1)
#dataset = pd.get_dummies(dataset, columns = ["Title"])
dataset.drop(['Title_3'], axis=1, inplace=True)  # drop the dummy column with the lowest survival rate (redundant: the remaining dummies encode it)
# Create a family size descriptor from SibSp and Parch
dataset["Fsize"] = dataset["SibSp"] + dataset["Parch"] + 1
g = sns.factorplot(x="Fsize",y="Survived",data = dataset)
g = g.set_ylabels("Survival Probability")
# Create new feature of family size
dataset['Single'] = dataset['Fsize'].map(lambda s: 1 if s == 1 else 0)
dataset['SmallF'] = dataset['Fsize'].map(lambda s: 1 if s == 2 else 0)
dataset['MedF'] = dataset['Fsize'].map(lambda s: 1 if 3 <= s <= 4 else 0)
dataset['LargeF'] = dataset['Fsize'].map(lambda s: 1 if s >= 5 else 0)
dataset.drop(['Fsize','SibSp','Parch'],axis=1,inplace=True)
dataset.columns
dataset[:train_len][['Embarked', 'Survived']].groupby(['Embarked'], as_index=False).mean().sort_values(by='Embarked', ascending=True)
#dataset = pd.get_dummies(dataset, columns = ["Embarked"], prefix="Em")
Embarked_dummies = pd.get_dummies(dataset['Embarked'], prefix='Em')
dataset = dataset.join(Embarked_dummies).drop(['Embarked'],axis=1)
dataset.drop(['Em_S'],axis=1,inplace=True)
dataset.columns
dataset["Cabin"].head()
dataset["Cabin"].describe()
dataset["Cabin"].isnull().sum()
dataset["Cabin"][dataset["Cabin"].notnull()].head()
# Replace the Cabin number by the type of cabin 'X' if not
dataset["Cabin"] = pd.Series([i[0] if not pd.isnull(i) else 'X' for i in dataset['Cabin'] ])
g = sns.countplot(dataset["Cabin"],order=['A','B','C','D','E','F','G','T','X'])
g = sns.factorplot(y="Survived",x="Cabin",data=dataset[:train_len],kind="bar",order=['A','B','C','D','E','F','G','T','X'])
g = g.set_ylabels("Survival Probability")
dataset = pd.get_dummies(dataset, columns = ["Cabin"],prefix="Cabin")
dataset.drop(['Cabin_T'],axis=1,inplace=True)
dataset["Ticket"].head()
## Treat Ticket by extracting the ticket prefix. When there is no prefix it returns X.
Ticket = []
for i in list(dataset.Ticket):
    if not i.isdigit():
        Ticket.append(i.replace(".", "").replace("/", "").strip().split(' ')[0])  # take the prefix
    else:
        Ticket.append("X")
dataset["Ticket"] = Ticket
dataset["Ticket"].head()
dataset[:train_len][['Ticket', 'Survived']].groupby(['Ticket'], as_index=False).mean().sort_values(by='Ticket', ascending=True)
dataset = pd.get_dummies(dataset, columns = ["Ticket"], prefix="T")
dataset.drop(['T_A4'],axis=1,inplace=True)
dataset[:train_len][['Pclass', 'Survived']].groupby(['Pclass'], as_index=False).mean().sort_values(by='Pclass', ascending=True)
# Create categorical values for Pclass
dataset["Pclass"] = dataset["Pclass"].astype("category")
dataset = pd.get_dummies(dataset, columns = ["Pclass"],prefix="Pc")
dataset.drop(['Pc_3'],axis=1,inplace=True)
dataset['Age']=dataset['Age'].astype(int)
dataset['AgeBand'] = pd.cut(dataset['Age'], 5)
dataset[:train_len][['AgeBand', 'Survived']].groupby(['AgeBand'], as_index=False).mean().sort_values(by='AgeBand', ascending=True)
dataset.tail()
dataset.loc[ dataset['Age'] <= 16, 'Age'] = 0
dataset.loc[(dataset['Age'] > 16) & (dataset['Age'] <= 32), 'Age'] = 1
dataset.loc[(dataset['Age'] > 32) & (dataset['Age'] <= 48), 'Age'] = 2
dataset.loc[(dataset['Age'] > 48) & (dataset['Age'] <= 64), 'Age'] = 3
dataset.loc[ dataset['Age'] > 64, 'Age'] = 4
Age_dummies = pd.get_dummies(dataset['Age'], prefix='Age')
dataset=dataset.join(Age_dummies).drop(['Age','AgeBand'],axis=1)
dataset.drop(['Age_4'],axis=1,inplace=True)
dataset['FareBand'] = pd.cut(dataset['Fare'], 4)
dataset[:train_len][['FareBand','Survived']].groupby(['FareBand'],as_index=False).mean().sort_values(by='FareBand', ascending=True)
dataset.columns
dataset.loc[ dataset['Fare'] <= 1.56, 'Fare'] = 0
dataset.loc[(dataset['Fare'] > 1.56) & (dataset['Fare'] <= 3.119), 'Fare'] = 1
dataset.loc[(dataset['Fare'] > 3.119) & (dataset['Fare'] <= 4.679), 'Fare'] = 2
dataset.loc[ dataset['Fare'] > 4.679, 'Fare'] = 3
Fare_dummies = pd.get_dummies(dataset['Fare'], prefix='Fare')
dataset = dataset.join(Fare_dummies).drop(['Fare','FareBand'],axis=1)
dataset.columns
dataset.drop(['Fare_0.0'],axis=1,inplace=True)
# Drop useless variables
dataset.drop(labels = ["PassengerId"], axis = 1, inplace = True)
dataset.head()
dataset.columns
## Separate train dataset and test dataset
train = dataset[:train_len].copy()
test = dataset[train_len:].copy()
test.drop(labels=["Survived"], axis=1, inplace=True)  # .copy() above avoids SettingWithCopyWarning here and below
## Separate train features and label
train["Survived"] = train["Survived"].astype(int)
Y_train = train["Survived"]
X_train = train.drop(labels = ["Survived"],axis = 1)
# Cross validate model with Kfold stratified cross val
kfold = StratifiedKFold(n_splits=10)
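# Optional baseline sweep (a rough sketch, not part of the original tuning): cross-validated
# accuracy of a few untuned classifiers, to see roughly where grid search is worth the effort.
for clf in [LogisticRegression(max_iter=1000), KNeighborsClassifier(), GaussianNB(),
            RandomForestClassifier(), AdaBoostClassifier(), SVC()]:
    score = cross_val_score(clf, X_train, Y_train, cv=kfold, scoring="accuracy").mean()
    print(clf.__class__.__name__, round(score, 4))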
k_range = [16, 18]
knn_param_grid = {'n_neighbors': k_range}
gridKNN = GridSearchCV(KNeighborsClassifier(),param_grid = knn_param_grid, cv=kfold, scoring="accuracy", n_jobs= -1, verbose = 1)
gridKNN.fit(X_train,Y_train)
print(gridKNN.best_estimator_)
print(gridKNN.best_score_)
LR_param_grid = {'penalty': ['l1', 'l2'], 'C': [0.001, 0.01, 0.1, 1, 10, 100]}
# liblinear supports both l1 and l2 penalties (the default lbfgs solver rejects l1)
gridLR = GridSearchCV(LogisticRegression(solver='liblinear'), param_grid=LR_param_grid, cv=kfold, scoring="accuracy", n_jobs=-1, verbose=1)
gridLR.fit(X_train,Y_train)
print(gridLR.best_estimator_)
print(gridLR.best_score_)
NB = GaussianNB()  # already imported above; avoid shadowing the class name as in the original
NB.fit(X_train, Y_train)
NB_score = cross_val_score(NB, X_train, Y_train, cv=kfold, scoring="accuracy").mean()
print(NB_score)
# Not sure why, but this score looks off
C=[0.05,0.1,0.2,0.3,0.25,0.4,0.5,0.6,0.7,0.8,0.9,1]
gamma=[0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9,1.0]
kernel=['rbf','linear']
SVC_param_grid={'kernel':kernel,'C':C,'gamma':gamma}
gridSVC = GridSearchCV(SVC(),param_grid = SVC_param_grid, cv=kfold, scoring="accuracy", n_jobs= -1, verbose = 1)
gridSVC.fit(X_train,Y_train)
print(gridSVC.best_estimator_)
print(gridSVC.best_score_)
test_Survived = pd.Series(gridSVC.best_estimator_.predict(test), name="Survived")
results_SVC = pd.concat([IDtest,test_Survived],axis=1)
results_SVC.to_csv("SVC_predict.csv",index=False)
# RFC Parameters tunning
RFC = RandomForestClassifier()
## Search grid for optimal parameters
rf_param_grid = {"n_estimators" :[300, 500],
"max_depth": [8, 15],
"min_samples_split": [2, 5, 10],
"min_samples_leaf": [1, 2, 5],
"max_features": ['log2', 'sqrt']}
gsRFC = GridSearchCV(RFC,param_grid = rf_param_grid, cv=5, scoring="accuracy", n_jobs= -1, verbose = 1)
gsRFC.fit(X_train,Y_train)
RFC_best = gsRFC.best_estimator_
# Best score
gsRFC.best_score_ ,RFC_best
test_Survived = pd.Series(RFC_best.predict(test), name="Survived")
results_RFC_best = pd.concat([IDtest,test_Survived],axis=1)
results_RFC_best.to_csv("RFC_best.csv",index=False)
#ExtraTrees
ExtC = ExtraTreesClassifier()
## Search grid for optimal parameters
ex_param_grid = {"max_depth": [8, 15],
"max_features": ['log2', 'sqrt'],
"min_samples_split": [2,5, 10],
"min_samples_leaf": [1, 2, 5],
"n_estimators" :[300, 500]}
gsExtC = GridSearchCV(ExtC,param_grid = ex_param_grid, cv=5, scoring="accuracy", n_jobs= -1, verbose = 1)
gsExtC.fit(X_train,Y_train)
ExtC_best = gsExtC.best_estimator_
# Best score
gsExtC.best_score_, ExtC_best
# Gradient boosting tunning
GBC = GradientBoostingClassifier()
gb_param_grid = {
'learning_rate': [0.1, 0.05, 0.01],
'max_depth': [3, 5, 10],
'min_samples_leaf': [50,100,150],
'max_features' :['sqrt','log2']
}
gsGBC = GridSearchCV(GBC,param_grid = gb_param_grid,cv=5, scoring="accuracy", n_jobs= 4, verbose = 1)
gsGBC.fit(X_train,Y_train)
GBC_best = gsGBC.best_estimator_
# Best score
gsGBC.best_score_,GBC_best
import xgboost as xgb
from xgboost.sklearn import XGBClassifier
## Search grid for optimal parameters
xgb_param_grid = {"learning_rate": [0.01,0.5,1.0],
"n_estimators" : [300,500],
"gamma": [0.1, 0.5,1.0],
"max_depth": [3, 5, 10],
"min_child_weight": [1, 3],
"subsample" : [0.8,1.0],
"colsample_bytree" : [0.8,1.0]}
gridxgb = GridSearchCV(XGBClassifier(),param_grid = xgb_param_grid, cv=5, scoring="accuracy", n_jobs= -1, verbose = 1)
gridxgb.fit(X_train,Y_train)
gridxgb_best = gridxgb.best_estimator_
# Best score
gridxgb.best_score_
print(gridxgb_best)
test_Survived = pd.Series(gridxgb_best.predict(test), name="Survived")
results_gridxgb = pd.concat([IDtest,test_Survived],axis=1)
results_gridxgb.to_csv("gridxgb.csv",index=False)
def plot_learning_curve(estimator, title, X, y, ylim=None, cv=None,
                        n_jobs=-1, train_sizes=np.linspace(.1, 1.0, 5)):
    """Generate a simple plot of the test and training learning curve"""
    plt.figure()
    plt.title(title)
    if ylim is not None:
        plt.ylim(*ylim)
    plt.xlabel("Training examples")
    plt.ylabel("Score")
    train_sizes, train_scores, test_scores = learning_curve(
        estimator, X, y, cv=cv, n_jobs=n_jobs, train_sizes=train_sizes)
    train_scores_mean = np.mean(train_scores, axis=1)
    train_scores_std = np.std(train_scores, axis=1)
    test_scores_mean = np.mean(test_scores, axis=1)
    test_scores_std = np.std(test_scores, axis=1)
    plt.grid()
    plt.fill_between(train_sizes, train_scores_mean - train_scores_std,
                     train_scores_mean + train_scores_std, alpha=0.1,
                     color="r")
    plt.fill_between(train_sizes, test_scores_mean - test_scores_std,
                     test_scores_mean + test_scores_std, alpha=0.1, color="g")
    plt.plot(train_sizes, train_scores_mean, 'o-', color="r",
             label="Training score")
    plt.plot(train_sizes, test_scores_mean, 'o-', color="g",
             label="Cross-validation score")
    plt.legend(loc="best")
    return plt
g = plot_learning_curve(gsRFC.best_estimator_,"RF learning curves",X_train,Y_train,cv=kfold)
g = plot_learning_curve(gsExtC.best_estimator_,"ExtraTrees learning curves",X_train,Y_train,cv=kfold)
g = plot_learning_curve(gsGBC.best_estimator_,"GradientBoosting learning curves",X_train,Y_train,cv=kfold)
g = plot_learning_curve(gridxgb.best_estimator_,"XGBoost learning curves",X_train,Y_train,cv=kfold)
nrows = ncols = 2
fig, axes = plt.subplots(nrows = nrows, ncols = ncols, sharex="all", figsize=(15,15))
names_classifiers = [("ExtraTrees",ExtC_best),("RandomForest",RFC_best),("GradientBoosting",GBC_best),("XGBoost", gridxgb_best)]
nclassifier = 0
for row in range(nrows):
    for col in range(ncols):
        name = names_classifiers[nclassifier][0]
        classifier = names_classifiers[nclassifier][1]
        indices = np.argsort(classifier.feature_importances_)[::-1][:40]
        g = sns.barplot(y=X_train.columns[indices][:40], x=classifier.feature_importances_[indices][:40], orient='h', ax=axes[row][col])
        g.set_xlabel("Relative importance", fontsize=12)
        g.set_ylabel("Features", fontsize=12)
        g.tick_params(labelsize=9)
        g.set_title(name + " feature importance")
        nclassifier += 1
test_Survived_RFC = pd.Series(RFC_best.predict(test), name="RFC")
test_Survived_ExtC = pd.Series(ExtC_best.predict(test), name="ExtC")
test_Survived_GBC = pd.Series(GBC_best.predict(test), name="GBC")
test_Survived_xgb = pd.Series(gridxgb_best.predict(test), name="xgb")
# Concatenate all classifier results
ensemble_results = pd.concat([test_Survived_RFC,test_Survived_ExtC,test_Survived_GBC, test_Survived_xgb],axis=1)
g= sns.heatmap(ensemble_results.corr(),annot=True)
votingC = VotingClassifier(estimators=[('rfc', RFC_best), ('extc', ExtC_best),
('gbc',GBC_best), ('xgb', gridxgb_best)], voting='soft', n_jobs=-1)
votingC = votingC.fit(X_train, Y_train)
test_Survived = pd.Series(votingC.predict(test), name="Survived")
results_votingC = pd.concat([IDtest,test_Survived],axis=1)
results_votingC.to_csv("ensemble_python_voting.csv",index=False)
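# Optional check (a small sketch, not in the original): cross-validated accuracy of the voting
# ensemble itself, for comparison with the individual grid-search scores above.
voting_cv = cross_val_score(votingC, X_train, Y_train, cv=kfold, scoring="accuracy").mean()
print("VotingClassifier CV accuracy:", round(voting_cv, 4))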
# First layer: base models produce out-of-fold predictions that feed the second-layer model
class Ensemble_stacking1(object):
    def __init__(self, n_folds, base_models):
        self.n_folds = n_folds
        self.base_models = base_models

    def get_data_to2(self, X, y, T):
        """Return S_train (out-of-fold predictions on X) and S_test (fold-averaged predictions on T)."""
        X = np.array(X)
        y = np.array(y)
        T = np.array(T)
        # materialize the folds so they can be reused for every base model
        # (the original generator would be exhausted after the first model)
        folds = list(StratifiedKFold(n_splits=self.n_folds, shuffle=True, random_state=2016).split(X, y))
        S_train = np.zeros((X.shape[0], len(self.base_models)))
        S_test = np.zeros((T.shape[0], len(self.base_models)))
        for i, clf in enumerate(self.base_models):
            S_test_i = np.zeros((T.shape[0], self.n_folds))
            for j, (train_idx, test_idx) in enumerate(folds):
                X_train = X[train_idx]
                y_train = y[train_idx]
                X_holdout = X[test_idx]
                # y_holdout = y[test_idx]
                clf.fit(X_train, y_train)
                y_pred = clf.predict(X_holdout)[:]
                S_train[test_idx, i] = y_pred
                S_test_i[:, j] = clf.predict(T)[:]
            S_test[:, i] = S_test_i.mean(1)
        return S_train, S_test
# Second layer: tune an XGBoost meta-model on the out-of-fold predictions
xgb2_param_grid = {"learning_rate": [0.01,0.5],
"n_estimators" : [300,500],
"gamma": [0.1, 0.5,1.0],
"max_depth": [3, 5, 10],
"min_child_weight": [1, 3 , 5, 7],
"subsample" : [0.8,1.0],
"colsample_bytree" : [0.6,0.8]}
gridxgb2 = GridSearchCV(XGBClassifier(),param_grid = xgb2_param_grid, cv=5, scoring="accuracy", n_jobs= -1, verbose = 1)
S_train, S_test = Ensemble_stacking1(5, [RFC_best, ExtC_best, GBC_best, gridxgb_best]).get_data_to2(X_train, Y_train, test)  # use the tuned estimator rather than the whole grid search
gridxgb2.fit(S_train,Y_train)
gridxgb2_best = gridxgb2.best_estimator_
print(gridxgb2.best_score_)
test_Survived = pd.Series(gridxgb2_best.predict(S_test), name="Survived")
results_stacking = pd.concat([IDtest,test_Survived],axis=1)
results_stacking.to_csv("ensemble_python_stacking.csv",index=False)
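# Optional comparison (a small sketch, not in the original): how often the voting and stacking
# submissions agree; the file names are the ones written above.
voting_preds = pd.read_csv("ensemble_python_voting.csv")["Survived"]
stacking_preds = pd.read_csv("ensemble_python_stacking.csv")["Survived"]
print("Voting vs stacking agreement:", (voting_preds == stacking_preds).mean())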