# Kaggle -- Titanic 学习

# -*-coding:utf-8-*-
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from pylab import *


#2.1各乘客等级的获救情况
def survived_by_pclass(data_train):
    """Show a stacked bar chart of survival counts per passenger class.

    Args:
        data_train: DataFrame providing the ``Pclass`` and ``Survived``
            columns; rows with ``Survived == 1`` and ``Survived == 0`` are
            counted separately for every ``Pclass`` value.

    Returns:
        None. The chart is displayed with ``plt.show()``.
    """
    pclass = data_train.Pclass
    outcome = data_train.Survived

    # One column per outcome, one row per passenger class.
    counts_by_class = pd.DataFrame({
        '获救': pclass[outcome == 1].value_counts(),
        '未获救': pclass[outcome == 0].value_counts(),
    })

    counts_by_class.plot(kind='bar', stacked=True)
    plt.title('各乘客等级的获救情况')
    plt.xlabel('乘客等级')
    plt.ylabel('人数')
    plt.show()
#2.2按性别查看获救情况  
def sysnax_data_by_sex(data_train):
    """Show stacked bars of the ``Survived`` value counts, split by sex.

    Args:
        data_train: DataFrame providing the ``Sex`` and ``Survived`` columns.

    Returns:
        None. The chart is displayed with ``plt.show()``.
    """
    outcome = data_train.Survived
    sex = data_train.Sex

    # Rows are the distinct Survived values; one column per sex.
    counts_by_sex = pd.DataFrame({
        '男性': outcome[sex == 'male'].value_counts(),
        '女性': outcome[sex == 'female'].value_counts(),
    })

    counts_by_sex.plot(kind='bar', stacked=True)
    plt.title('按性别查看获救情况')
    # NOTE(review): the bars are indexed by the Survived value, while the
    # x-axis label below reads "sex" -- confirm which one is intended.
    plt.xlabel('性别')
    plt.ylabel('人数')
    plt.show()
    
#2.3各种舱级别下的各性别获救情况
def sysnax_data_by_Pclass_sex(data_train):
    """Show four side-by-side bar charts of survival by sex and cabin class.

    The panels are, left to right: female / class 1-2, female / class 3,
    male / class 1-2, male / class 3. All panels share the y axis of the
    first one so the bar heights are directly comparable.

    Args:
        data_train: DataFrame providing the ``Sex``, ``Pclass`` and
            ``Survived`` columns.

    Returns:
        None. The figure is displayed with ``plt.show()``.
    """
    fig = plt.figure()
    fig.set(alpha=0.65)
    # A figure-level title; plt.title() before any subplot exists would
    # create an extra full-size axes underneath the four panels.
    fig.suptitle('根据舱等级和性别的获救情况')

    high_class = data_train.Pclass != 3
    panels = [
        ('female', high_class, '女性/高级舱', '#FA2479'),
        ('female', ~high_class, '女性/低级舱', 'pink'),
        ('male', high_class, '男性/高级舱', 'lightblue'),
        ('male', ~high_class, '男性/低级舱', 'steelblue'),
    ]

    first_ax = None
    for position, (sex, class_mask, legend_text, color) in enumerate(panels, start=1):
        ax = fig.add_subplot(1, 4, position, sharey=first_ax)
        if first_ax is None:
            first_ax = ax

        selected = data_train.Survived[(data_train.Sex == sex) & class_mask]
        # value_counts() orders by frequency, so force the order
        # [survived, not survived] to keep the tick labels truthful, and
        # fill a missing outcome with 0 instead of dropping its bar.
        counts = selected.value_counts().reindex([1, 0], fill_value=0)
        counts.plot(kind='bar', ax=ax, color=color)

        ax.set_xticklabels(['获救', '未获救'], rotation=0)
        # legend() expects a sequence of labels; a bare string would be
        # split into single characters.
        ax.legend([legend_text], loc='best')

    plt.show()
    
#2.4各登船港口获救情况
def sysnax_data_by_embarked(data_train):
    """Show a stacked bar chart of survival counts per embarkation port.

    Args:
        data_train: DataFrame providing the ``Embarked`` and ``Survived``
            columns.

    Returns:
        None. The chart is displayed with ``plt.show()``.
    """
    port = data_train.Embarked
    outcome = data_train.Survived

    # One column per outcome, one row per embarkation port.
    counts_by_port = pd.DataFrame({
        '获救': port[outcome == 1].value_counts(),
        '未获救': port[outcome == 0].value_counts(),
    })

    counts_by_port.plot(kind='bar', stacked=True)
    plt.title('各登陆港口获救情况')
    plt.xlabel('登陆港口')
    plt.ylabel('获救人数')

    plt.show()
    
#2.5堂兄、妹,孩子父母有几人对获救情况的影响
def sysnax_data_by_sibsp(data_train):
    """Print passenger counts grouped by family-size column and outcome.

    Two tables are printed, one grouped by (``SibSp``, ``Survived``) and one
    by (``Parch``, ``Survived``); each holds the number of non-null
    ``PassengerId`` values in the group.

    Args:
        data_train: DataFrame providing the ``SibSp``, ``Parch``,
            ``Survived`` and ``PassengerId`` columns.

    Returns:
        None. The tables are written to standard output.
    """
    for family_column in ('SibSp', 'Parch'):
        grouped = data_train.groupby([family_column, 'Survived'])
        passenger_counts = grouped['PassengerId'].count()
        print(passenger_counts.to_frame())
    
#2.6按cabin分析获救情况
def sysnax_data_by_cabin(data_train):
    """Show stacked survival counts for passengers with and without a cabin.

    Args:
        data_train: DataFrame providing the ``Cabin`` and ``Survived``
            columns; a null ``Cabin`` counts as "no cabin record".

    Returns:
        None. The chart is displayed with ``plt.show()``.
    """
    has_cabin = data_train.Cabin.notnull()
    outcome = data_train.Survived

    counts = pd.DataFrame({
        '有': outcome[has_cabin].value_counts(),
        '没有': outcome[~has_cabin].value_counts(),
    })
    # Transposed so each bar is a cabin group, stacked by Survived value.
    counts = counts.T

    counts.plot(kind='bar', stacked=True)
    plt.title('按Cabin查看是否获救情况')
    plt.xlabel('Cabin有无')
    plt.ylabel('人数')

    plt.show()
    
def sysnax_data_by_plot(data_train):
    """Build a 2x3 overview figure of the passenger attribute distributions.

    Panels: Survived counts, Pclass counts, Age against Survived (scatter),
    Age density per passenger class, and Embarked counts.

    The figure is only built here; no ``plt.show()`` is issued, so the caller
    decides when to display it. The per-attribute survival analyses live in
    the separate functions of this file (``survived_by_pclass``,
    ``sysnax_data_by_sex``, ``sysnax_data_by_Pclass_sex``,
    ``sysnax_data_by_embarked``, ``sysnax_data_by_sibsp``,
    ``sysnax_data_by_cabin``) and can be called with the same DataFrame.

    Args:
        data_train: DataFrame providing the ``Survived``, ``Pclass``,
            ``Age`` and ``Embarked`` columns.

    Returns:
        None.
    """
    # Use a font with CJK glyphs so the Chinese labels render.
    # plt.rcParams is the same object as matplotlib.rcParams.
    plt.rcParams['font.sans-serif'] = ['SimHei']
    fig = plt.figure()
    fig.set(alpha=0.2)

    plt.subplot2grid((2, 3), (0, 0))
    data_train.Survived.value_counts().plot(kind='bar')
    plt.title('1 survied')
    plt.ylabel('people num')

    plt.subplot2grid((2, 3), (0, 1))
    data_train.Pclass.value_counts().plot(kind='bar')
    plt.title('pclass')
    plt.ylabel('num')

    plt.subplot2grid((2, 3), (0, 2))
    plt.scatter(data_train.Survived, data_train.Age)
    plt.title('age/survived')
    plt.ylabel('age')
    # The on/off flag is passed positionally: its keyword was renamed from
    # ``b`` to ``visible`` across matplotlib releases.
    plt.grid(True, which='major', axis='y')

    plt.subplot2grid((2, 3), (1, 0))
    for pclass in (1, 2, 3):
        data_train.Age[data_train.Pclass == pclass].plot(kind='kde')
    plt.title('各等级的年龄分布')
    plt.xlabel('年龄')
    plt.ylabel('密度')
    plt.legend(('头等舱', '2等舱', '3等舱'), loc='best')

    plt.subplot2grid((2, 3), (1, 1))
    data_train.Embarked.value_counts().plot(kind='bar')
    plt.title('各登岸口上船人数')
    plt.ylabel('人数')
    
from sklearn.ensemble import RandomForestRegressor   
# 3.1使用RandomForestRegressor填补缺失的年龄属性
def set_missing_ages(df):
    """Fill missing ``Age`` values with a RandomForestRegressor prediction.

    The regressor is trained on the rows whose ``Age`` is known, using
    ``Fare``, ``Parch``, ``SibSp`` and ``Pclass`` as features, and then
    predicts ``Age`` for the rows where it is null.

    Args:
        df: DataFrame providing the ``Age``, ``Fare``, ``Parch``, ``SibSp``
            and ``Pclass`` columns. It is modified in place.

    Returns:
        A ``(df, rfr)`` tuple: the same DataFrame with ``Age`` filled in and
        the fitted regressor, so it can be reused on other data.
    """
    age_df = df[['Age', 'Fare', 'Parch', 'SibSp', 'Pclass']]
    age_missing = age_df.Age.isnull()

    # ``.values`` replaces DataFrame.as_matrix(), which no longer exists in
    # current pandas releases.
    known_age = age_df[~age_missing].values
    unknown_age = age_df[age_missing].values

    # Column 0 is the target (Age); the remaining columns are the features.
    y = known_age[:, 0]
    X = known_age[:, 1:]

    rfr = RandomForestRegressor(random_state=0, n_estimators=2, n_jobs=-1)
    rfr.fit(X, y)

    # predict() rejects an empty matrix, so skip it when nothing is missing.
    if len(unknown_age):
        df.loc[age_missing, 'Age'] = rfr.predict(unknown_age[:, 1:])

    return df, rfr
    
#3.2填补Cabin数据
def set_cabin_type(data_train):
    """Collapse ``Cabin`` into a two-valued ``'Yes'`` / ``'No'`` flag.

    Args:
        data_train: DataFrame providing a ``Cabin`` column. It is modified
            in place.

    Returns:
        The same DataFrame, with ``Cabin`` set to ``'Yes'`` where a value was
        present and ``'No'`` where it was null.
    """
    has_cabin = data_train['Cabin'].notnull()
    data_train.loc[has_cabin, 'Cabin'] = 'Yes'
    data_train.loc[~has_cabin, 'Cabin'] = 'No'
    return data_train
#3.3逻辑回归建模时,要求输入的特征都是数值性特征,因此将类目类型进行特征因子化
def set_feature_num(data_train):
    """One-hot encode the categorical columns and drop the raw text columns.

    ``Cabin``, ``Embarked``, ``Sex`` and ``Pclass`` are each expanded with
    ``pd.get_dummies`` (prefix = column name) and appended, in that order,
    to the right of the existing columns. The original four columns plus
    ``Name`` and ``Ticket`` are then removed.

    Args:
        data_train: DataFrame providing the ``Pclass``, ``Name``, ``Sex``,
            ``Ticket``, ``Cabin`` and ``Embarked`` columns.

    Returns:
        A new DataFrame with the indicator columns added and the six source
        columns dropped.
    """
    categorical_columns = ['Cabin', 'Embarked', 'Sex', 'Pclass']
    indicator_frames = [
        pd.get_dummies(data_train[column], prefix=column)
        for column in categorical_columns
    ]

    widened = pd.concat([data_train] + indicator_frames, axis=1)
    return widened.drop(categorical_columns + ['Name', 'Ticket'], axis=1)
  
from sklearn.preprocessing import StandardScaler
#3.4将年龄与fare标准化
def set_feature_normalized(data_train):
    """Standardize the ``Age`` and ``Fare`` columns with ``StandardScaler``.

    The scaler is re-fitted for each column, so each column is transformed
    using its own statistics.

    Args:
        data_train: DataFrame providing the ``Age`` and ``Fare`` columns. It
            is modified in place.

    Returns:
        The same DataFrame with both columns replaced by their scaled values.
    """
    scaler = StandardScaler()
    for column in ('Age', 'Fare'):
        # The scaler expects a 2-D input, hence the single-column reshape.
        as_column_vector = data_train[column].values.reshape(-1, 1)
        data_train[column] = scaler.fit_transform(as_column_vector)
    return data_train
    
    
# 3.简单数据预处理
def pre_deal(data_train):
    """Run the preprocessing pipeline on the training data.

    Steps, in order: fill missing ages (``set_missing_ages``), reduce
    ``Cabin`` to a Yes/No flag (``set_cabin_type``), one-hot encode the
    categorical columns (``set_feature_num``) and standardize ``Age`` and
    ``Fare`` (``set_feature_normalized``). The first rows of the result are
    printed.

    Args:
        data_train: The raw training DataFrame.

    Returns:
        A ``(data, rfr)`` tuple: the preprocessed DataFrame and the age
        regressor returned by ``set_missing_ages``.
    """
    with_ages, rfr = set_missing_ages(data_train)
    with_cabin_flag = set_cabin_type(with_ages)
    numeric_only = set_feature_num(with_cabin_flag)
    prepared = set_feature_normalized(numeric_only)

    print((prepared.head()))
    return prepared, rfr

from sklearn import linear_model
# 4.逻辑回归建模
def build_model_by_logic(data_train):
    """Fit an L1-regularized logistic regression on the preprocessed data.

    The model columns are selected by regular expression, ``Survived`` is
    used as the label and every other selected column as a feature. A table
    pairing each feature name with its fitted coefficient is printed; a
    positive coefficient means the feature is positively associated with the
    predicted class.

    Args:
        data_train: Preprocessed DataFrame containing ``Survived`` and the
            numeric feature columns matched by the pattern below.

    Returns:
        The fitted ``LogisticRegression`` classifier.
    """
    # Select the label plus every feature column by name pattern.
    train_df = data_train.filter(regex='Survived|Age*|SibSp|Parch|Fare*|Cabin_.*|Embarked_.*|Sex_.*|Pclass_.*')

    # Pick the label by name rather than by position so the result does not
    # depend on the column order of the input frame.
    y = train_df['Survived'].values
    feature_df = train_df.drop('Survived', axis=1)
    # Cast to float so indicator columns of any dtype end up in one numeric
    # matrix; ``.values`` replaces the removed DataFrame.as_matrix().
    X = feature_df.astype(float).values

    # The L1 penalty needs a solver that supports it; liblinear does.
    clf = linear_model.LogisticRegression(C=1.0, penalty='l1', tol=1e-6, solver='liblinear')
    clf.fit(X, y)

    # coef_ has one row per decision function; for this binary label there
    # is a single row holding one weight per feature.
    a = pd.DataFrame({'columns': list(feature_df.columns), 'coef': clf.coef_[0]})
    print(a)

    return clf
    
    
    
# 5.预测模型
def predict_by_logic(clf, rfr, data_train):
    """Predict survival for ``test.csv`` and write ``predict_result.csv``.

    The test set is read from ``test.csv`` and put through the same steps as
    the training data: missing ``Fare`` set to 0, missing ``Age`` predicted
    with ``rfr``, ``Cabin`` reduced to Yes/No, categorical columns one-hot
    encoded, ``Age`` and ``Fare`` standardized. The result is written as a
    two-column CSV (``PassengerId``, ``Survived``).

    Args:
        clf: Fitted classifier exposing ``predict``.
        rfr: Fitted age regressor exposing ``predict``; it receives the
            ``Fare``, ``Parch``, ``SibSp`` and ``Pclass`` columns.
        data_train: Not used by this function; kept for interface
            compatibility.

    Returns:
        None.
    """
    data_test = pd.read_csv(r'test.csv')

    data_test.loc[data_test.Fare.isnull(), 'Fare'] = 0

    # Fill missing ages with the regressor; predict() rejects an empty
    # matrix, so skip the step when no age is missing.
    age_missing = data_test.Age.isnull()
    if age_missing.any():
        age_features = data_test.loc[age_missing, ['Fare', 'Parch', 'SibSp', 'Pclass']].values
        data_test.loc[age_missing, 'Age'] = rfr.predict(age_features)

    data_test = set_cabin_type(data_test)

    # One-hot encode the categorical columns, then drop the raw columns.
    categorical_columns = ['Cabin', 'Embarked', 'Sex', 'Pclass']
    indicator_frames = [
        pd.get_dummies(data_test[column], prefix=column)
        for column in categorical_columns
    ]
    df_test = pd.concat([data_test] + indicator_frames, axis=1)
    df_test.drop(['Pclass', 'Name', 'Sex', 'Ticket', 'Cabin', 'Embarked'], axis=1, inplace=True)

    # NOTE(review): the scaler is fitted on the test set itself, so the test
    # features are not scaled with the training statistics -- the training
    # scaler is not reachable through this function's parameters. Confirm
    # whether it should be passed in instead.
    ss = StandardScaler()
    for column in ('Age', 'Fare'):
        df_test[column] = ss.fit_transform(df_test[column].values.reshape(-1, 1))

    # Select the model columns by name pattern ('Survived' is simply absent
    # from the test set).
    test_df = df_test.filter(regex='Survived|Age*|SibSp|Parch|Fare*|Cabin_.*|Embarked_.*|Sex_.*|Pclass_.*')

    # Hand the classifier a plain float matrix; ``.values`` replaces the
    # removed DataFrame.as_matrix().
    predict_result = clf.predict(test_df.astype(float).values)

    result = pd.DataFrame({'PassengerId': data_test['PassengerId'].values, 'Survived': predict_result.astype(np.int32)})
    result.to_csv('predict_result.csv', index=False)
    
    
    
   
if __name__ == '__main__':

    # Step 1: load the training set with pandas and preview the first rows.
    data_train = pd.read_csv(r'train.csv')
    print(data_train.head())

    # Step 2 (optional, not run by default): exploratory plots are available
    # through sysnax_data_by_plot(data_train).

    # Step 3: basic preprocessing; also yields the age regressor.
    data_train, rfr = pre_deal(data_train)

    # Step 4: fit the logistic-regression model.
    clf = build_model_by_logic(data_train)

    # Step 5: predict on the test set and write the result file.
    predict_by_logic(clf, rfr, data_train)
    
    
    

# 你可能感兴趣的:(python, ai, kaggle, titanic, 机器学习, 逻辑回归)