# TEST

import pandas as pd
import numpy as np
import re
import sklearn

import xgboost as xgb
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

import plotly.offline as py
py.init_notebook_mode(connected=True)
import plotly.graph_objs as go
import plotly.tools as tls
from collections import Counter

import gc

import warnings
warnings.filterwarnings('ignore')

from sklearn.ensemble import (RandomForestClassifier,AdaBoostClassifier,GradientBoostingClassifier,ExtraTreesClassifier)

from sklearn.svm import SVC
# from sklearn.cross_validation import KFold
# The cross_validation package was retired long ago; its contents moved to model_selection.
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold

# Read the training and test CSV files (train first, then test).
train, test = (pd.read_csv(csv_path) for csv_path in ('train.csv', 'test.csv'))

print('the train shape is:', train.shape, ',the test shape is:', test.shape)
train.head()

# Keep a reference to the test-set PassengerId column, and group both frames
# so later preprocessing loops can treat them uniformly.
Passengerid = test['PassengerId']
full_data = [train, test]

# Quick look at the column dtypes: counts per dtype, then the full listing.
Counter(train.dtypes.values)
train.dtypes

# Derived features: length of the passenger name and a 0/1 flag telling
# whether a cabin value is present.
for frame in (train, test):
    frame['Name_length'] = frame['Name'].str.len()
    # Test for a missing value directly instead of inferring it from the
    # Python type of each cell (NaN happens to be a float).
    frame['Has_Cabin'] = frame['Cabin'].notna().astype(int)

train.Embarked.unique()

# Flag passengers travelling alone (family size of exactly one).
for dataset in full_data:
    # FamilySize is not created anywhere earlier in this script, so derive it
    # here when absent. Assumes the standard Titanic SibSp and Parch columns
    # are present; the +1 counts the passenger themself.
    if 'FamilySize' not in dataset.columns:
        dataset['FamilySize'] = dataset['SibSp'] + dataset['Parch'] + 1
    dataset['IsAlone'] = (dataset['FamilySize'] == 1).astype(int)

# Missing-value imputation: Embarked gets 'S' (the original author labels this
# the mode) and Fare gets the median fare of the training set.
for frame in full_data:
    embarked_filled = frame['Embarked'].fillna('S')
    frame['Embarked'] = embarked_filled

    train_fare_median = train['Fare'].median()
    frame['Fare'] = frame['Fare'].fillna(train_fare_median)

# Bin the training fares into four quantile-based (equal-count) bins.
fare_bins = pd.qcut(train['Fare'], q=4)
train['CategoricalFare'] = fare_bins

train['CategoricalFare'].unique()

# Fill missing ages with random integers drawn from [mean - std, mean + std)
# of the observed ages in the same frame, then store Age as integers.
for dataset in full_data:
    age_avg = dataset['Age'].mean()
    age_std = dataset['Age'].std()
    age_is_null = dataset['Age'].isnull()
    age_null_count = age_is_null.sum()
    # randint expects integer bounds, so truncate the float statistics explicitly.
    age_null_random_list = np.random.randint(
        int(age_avg - age_std), int(age_avg + age_std), size=age_null_count
    )
    # Write through a single .loc call: the chained form
    # dataset['Age'][mask] = ... may assign into a temporary copy and is a
    # no-op under pandas Copy-on-Write.
    dataset.loc[age_is_null, 'Age'] = age_null_random_list
    dataset['Age'] = dataset['Age'].astype(int)

# Equal-width binning of the training ages into five intervals.
train['CageoricalAge'] = pd.cut(train['Age'], 5)

train['CageoricalAge'].unique()

# Name handling
def get_title(name):
    """Return the title embedded in a passenger name, or "" if none is found.

    The title is the first run of ASCII letters that is preceded by a space
    and immediately followed by a period, e.g. "Mr" in
    "Braund, Mr. Owen Harris".
    """
    # Raw string so that the escaped period is not treated as an (invalid)
    # string escape sequence.
    title_search = re.search(r' ([A-Za-z]+)\.', name)
    return title_search.group(1) if title_search else ""

# Extract a title for every passenger, fold uncommon titles into 'Rare',
# and map spelling variants onto their common form.
rare_titles = ['Lady', 'Countess','Capt', 'Col','Don', 'Dr', 'Major', 'Rev', 'Sir', 'Jonkheer', 'Dona']
title_variants = (('Mlle', 'Miss'), ('Ms', 'Miss'), ('Mme', 'Mrs'))

for frame in full_data:
    titles = frame['Name'].apply(get_title)
    titles = titles.replace(rare_titles, 'Rare')
    for variant, canonical in title_variants:
        titles = titles.replace(variant, canonical)
    frame['Title'] = titles

# Encode Sex, Title and Embarked as integer codes and bucket Fare and Age
# into ordinal bins.
title_mapping = {"Mr": 1, "Miss": 2, "Mrs": 3, "Master": 4, "Rare": 5}
sex_codes = {'female':0, 'male': 1}
embarked_codes = {'S': 0, 'C': 1, 'Q': 2}

# Right-closed bin edges: a value v lands in bin i when edges[i] < v <= edges[i + 1].
fare_edges = [-np.inf, 7.91, 14.454, 31, np.inf]
age_edges = [-np.inf, 16, 32, 48, 64, np.inf]

for frame in full_data:
    frame['Sex'] = frame['Sex'].map(sex_codes).astype(int)

    # Titles missing from the mapping are encoded as 0.
    frame['Title'] = frame['Title'].map(title_mapping).fillna(0)

    # Embarked encoding
    frame['Embarked'] = frame['Embarked'].map(embarked_codes).astype(int)

    # Fare encoding: four ordinal buckets (0..3)
    frame['Fare'] = pd.cut(frame['Fare'], bins=fare_edges, labels=False).astype(int)

    # Age encoding: five ordinal buckets (0..4)
    frame['Age'] = pd.cut(frame['Age'], bins=age_edges, labels=False)

# Feature selection: remove the columns listed in drop_elements from both
# frames, plus the two helper bin columns that exist only on the training frame.
drop_elements = ['PassengerId', 'Name', 'Ticket', 'Cabin', 'SibSp']
train_only_helpers = ['CageoricalAge', 'CategoricalFare']

train = train.drop(columns=drop_elements + train_only_helpers)
test = test.drop(columns=drop_elements)

# Correlation between the features, drawn as an annotated heatmap.
colormap = plt.cm.RdBu
plt.figure(figsize=(14,12))
plt.title('特征的相关性', y=1.05,size = 15)

feature_corr = train.astype(float).corr()
heatmap_options = dict(
    linewidths=0.1,
    vmax=1.0,
    square=True,
    cmap=colormap,
    linecolor='white',
    annot=True,
)
sns.heatmap(feature_corr, **heatmap_options)

# Split into training and hold-out sets, then evaluate random forests.
from sklearn import metrics
from sklearn.model_selection import cross_val_score

# Separate the target from the features. Leaving the target inside X would
# leak the label into the model. Assumes the target column is 'Survived'
# (standard Titanic train.csv) -- confirm against the data.
y = train['Survived']
X = train.drop(columns='Survived')
seed = 100
Xtrain, Xtest, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=seed)

clf = RandomForestClassifier(n_estimators=10, max_depth=None, min_samples_split=2, random_state=0)
scores = cross_val_score(clf, X, y, cv=5)
scores.mean()

# Fit on the training split only, so the hold-out metrics below are computed
# on rows the model has not seen.
clf.fit(Xtrain, y_train)

y_pred = clf.predict(Xtest)
# ROC AUC is a ranking metric: feed it the positive-class probabilities
# rather than the thresholded 0/1 predictions.
y_score = clf.predict_proba(Xtest)[:, 1]
print("Accuracy:%.4g"% metrics.accuracy_score(y_test,y_pred))
print("AUC Score(Test):%f"% metrics.roc_auc_score(y_test,y_score))
metrics.confusion_matrix(y_test,y_pred)

metrics.recall_score(y_test,y_pred,average='binary')

# A larger, depth-limited forest, scored with the same 5-fold cross-validation.
clf = RandomForestClassifier(n_estimators=500, max_depth=6, min_samples_split=2, random_state=0)
scores = cross_val_score(clf, X, y, cv=5)
scores.mean()

# You may also be interested in: (kaggle)