import pandas as pd
import numpy as np
import re
import sklearn
import xgboost as xgb
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
import plotly.offline as py
py.init_notebook_mode(connected=True)
import plotly.graph_objs as go
import plotly.tools as tls
from collections import Counter
import gc
import warnings
warnings.filterwarnings('ignore')
from sklearn.ensemble import (RandomForestClassifier,AdaBoostClassifier,GradientBoostingClassifier,ExtraTreesClassifier)
from sklearn.svm import SVC
#from sklearn.cross_validation import KFold
# The cross_validation package was removed long ago; its contents were moved into model_selection.
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
# Load the training and test sets from the current working directory.
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')
print('the train shape is:',train.shape,',the test shape is:',test.shape)
train.head()

# Keep the test-set passenger ids in their own variable.
Passengerid = test['PassengerId']

# Both frames are kept in one list so feature steps can loop over them.
full_data = [train, test]
Counter(train.dtypes.values)
train.dtypes

for dataset in full_data:
    # Length of the raw name string.
    dataset['Name_length'] = dataset['Name'].apply(len)
    # 1 when a cabin value is present, 0 when it is missing. notna() is used
    # instead of an exact `type(x) == float` test, which would treat None or
    # NumPy float scalars as "has a cabin".
    dataset['Has_Cabin'] = dataset['Cabin'].notna().astype(int)
train.Embarked.unique()
# Flag passengers who travel alone.
for dataset in full_data:
    # 'FamilySize' is read below but no visible code creates it, so build it
    # here when it is absent: siblings/spouses + parents/children + the
    # passenger themself.
    # NOTE(review): assumes the data has a 'Parch' column (standard Kaggle
    # Titanic schema) — confirm against the CSV files.
    if 'FamilySize' not in dataset.columns:
        dataset['FamilySize'] = dataset['SibSp'] + dataset['Parch'] + 1
    # 1 when the family consists of the passenger only, otherwise 0.
    dataset['IsAlone'] = (dataset['FamilySize'] == 1).astype(int)
# Missing-value replacement: Embarked gets the constant 'S' (the mode),
# Fare gets the median fare of the training set.
for dataset in full_data:
    embarked_filled = dataset['Embarked'].fillna('S')
    dataset['Embarked'] = embarked_filled
    train_fare_median = train['Fare'].median()
    dataset['Fare'] = dataset['Fare'].fillna(train_fare_median)

# Bin the training fares into quartiles, so every bin holds the same number of rows.
fare_quartiles = pd.qcut(train['Fare'], q=4)
train['CategoricalFare'] = fare_quartiles
train['CategoricalFare'].unique()
# Impute missing ages with random integers drawn between (mean - std) and
# (mean + std) of the known ages of the same frame, then store Age as int.
# NOTE: the random generator is not seeded here, so imputed values differ
# from run to run.
for dataset in full_data:
    age_avg = dataset['Age'].mean()
    age_std = dataset['Age'].std()
    age_is_missing = dataset['Age'].isnull()
    age_null_count = age_is_missing.sum()
    age_null_random_list = np.random.randint(age_avg - age_std, age_avg + age_std, size=age_null_count)
    # A single .loc assignment writes into the frame itself. The previous
    # chained form `dataset['Age'][mask] = ...` may write to a temporary copy.
    dataset.loc[age_is_missing, 'Age'] = age_null_random_list
    dataset['Age'] = dataset['Age'].astype(int)

# Bin the training ages into 5 equal-width intervals.
train['CageoricalAge'] = pd.cut(train['Age'], 5)
train['CageoricalAge'].unique()
# Name processing: extract the title (honorific) from a passenger name.
# Raw string so that `\.` is a regex escape rather than an invalid string
# escape; compiled once because the function is applied to every row.
_TITLE_PATTERN = re.compile(r' ([A-Za-z]+)\.')


def get_title(name):
    """Return the title found in *name*, or "" when there is none.

    The title is the first run of ASCII letters that is preceded by a space
    and immediately followed by a period, e.g. "Mr" in
    "Braund, Mr. Owen Harris".

    Args:
        name: The passenger name. Non-string values (such as NaN for a
            missing name) are treated as having no title.

    Returns:
        The matched title without the trailing period, or "".
    """
    if not isinstance(name, str):
        return ""
    title_search = _TITLE_PATTERN.search(name)
    if title_search:
        return title_search.group(1)
    return ""
# Derive a Title column from every name, then merge uncommon titles into
# 'Rare' and map the spelling variants onto 'Miss' / 'Mrs'.
rare_titles = ['Lady', 'Countess','Capt', 'Col','Don', 'Dr', 'Major', 'Rev', 'Sir', 'Jonkheer', 'Dona']
title_aliases = {'Mlle': 'Miss', 'Ms': 'Miss', 'Mme': 'Mrs'}
for dataset in full_data:
    extracted = dataset['Name'].apply(get_title)
    grouped = extracted.replace(rare_titles, 'Rare')
    dataset['Title'] = grouped.replace(title_aliases)
# Encode the categorical and binned features as small integers.
sex_codes = {'female': 0, 'male': 1}
title_mapping = {"Mr": 1, "Miss": 2, "Mrs": 3, "Master": 4, "Rare": 5}
embarked_codes = {'S': 0, 'C': 1, 'Q': 2}
# Right-closed bin edges: a value equal to an edge falls into the lower bin.
fare_edges = [-np.inf, 7.91, 14.454, 31, np.inf]
age_edges = [-np.inf, 16, 32, 48, 64, np.inf]

for dataset in full_data:
    dataset['Sex'] = dataset['Sex'].map(sex_codes).astype(int)

    # Titles missing from the mapping become 0.
    dataset['Title'] = dataset['Title'].map(title_mapping).fillna(0)

    # Embarked encoding
    dataset['Embarked'] = dataset['Embarked'].map(embarked_codes).astype(int)

    # Fare encoding: bucket index 0..3
    fare_bucket = pd.cut(dataset['Fare'], bins=fare_edges, labels=False)
    dataset['Fare'] = fare_bucket.astype(int)

    # Age encoding: bucket index 0..4, stored with the column's existing dtype
    age_dtype = dataset['Age'].dtype
    age_bucket = pd.cut(dataset['Age'], bins=age_edges, labels=False)
    dataset['Age'] = age_bucket.astype(age_dtype)
# Feature selection: remove the listed columns from both frames; the two
# helper bin columns exist only on the training frame.
drop_elements = ['PassengerId', 'Name', 'Ticket', 'Cabin', 'SibSp']
train = train.drop(columns=drop_elements + ['CageoricalAge', 'CategoricalFare'])
test = test.drop(columns=drop_elements)

# Correlation between the features, drawn as an annotated heatmap.
colormap = plt.cm.RdBu
plt.figure(figsize=(14, 12))
plt.title('特征的相关性', y=1.05, size=15)
correlations = train.astype(float).corr()
sns.heatmap(
    correlations,
    linewidths=0.1,
    vmax=1.0,
    square=True,
    cmap=colormap,
    linecolor='white',
    annot=True,
)
# Split the data into a training part and a hold-out part, then evaluate
# two random-forest configurations.
from sklearn import metrics
# cross_val_score is used below but was never imported.
from sklearn.model_selection import cross_val_score

# Separate the label from the features. Previously `y` was read from an
# undefined name and `X` still contained the label column, which leaks the
# answer into the model.
# NOTE(review): assumes the label column is 'Survived' (standard Kaggle
# Titanic train.csv) — confirm against the data.
y = train['Survived']
X = train.drop(columns=['Survived'])
seed = 100
Xtrain, Xtest, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=seed)

# Baseline forest: 5-fold cross-validated accuracy on the full training data.
clf = RandomForestClassifier(n_estimators=10, max_depth=None, min_samples_split=2, random_state=0)
scores = cross_val_score(clf, X, y, cv=5)
scores.mean()

# Fit on the training split only, so the hold-out metrics below are computed
# on rows the model has not seen.
clf.fit(Xtrain, y_train)
y_pred = clf.predict(Xtest)
# AUC is computed from the predicted probability of the positive class
# rather than from hard 0/1 predictions.
y_score = clf.predict_proba(Xtest)[:, 1]
print("Accuracy:%.4g" % metrics.accuracy_score(y_test, y_pred))
# The label previously said "Train" although the score is for the hold-out split.
print("AUC Score(Test):%f" % metrics.roc_auc_score(y_test, y_score))
metrics.confusion_matrix(y_test, y_pred)
metrics.recall_score(y_test, y_pred, average='binary')

# Larger, depth-limited forest: 5-fold cross-validated accuracy.
clf = RandomForestClassifier(n_estimators=500, max_depth=6, min_samples_split=2, random_state=0)
scores = cross_val_score(clf, X, y, cv=5)
scores.mean()