# -*-coding:utf-8-*-
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from pylab import *
# 2.1 Survival outcome for each passenger class
def survived_by_pclass(data_train):
    """Show a stacked bar chart of survival counts per passenger class.

    Args:
        data_train: DataFrame with ``Pclass`` and ``Survived`` columns;
            ``Survived`` is compared against 0 and 1.

    Returns:
        None. The chart is displayed with ``plt.show()``.
    """
    survived_mask = data_train.Survived == 1
    perished_mask = data_train.Survived == 0
    counts_by_outcome = {
        '获救': data_train.Pclass[survived_mask].value_counts(),
        '未获救': data_train.Pclass[perished_mask].value_counts(),
    }
    # One bar per class (the frame index), one stacked segment per outcome.
    pd.DataFrame(counts_by_outcome).plot(kind='bar', stacked=True)
    plt.title('各乘客等级的获救情况')
    plt.xlabel('乘客等级')
    plt.ylabel('人数')
    plt.show()
# 2.2 Survival outcome by sex
def sysnax_data_by_sex(data_train):
    """Show a stacked bar chart of survival counts split by sex.

    The frame that is plotted is indexed by the ``Survived`` value, with one
    column per sex, so each bar is a survival outcome and each stacked
    segment is a sex.

    Args:
        data_train: DataFrame with ``Sex`` ('male'/'female') and
            ``Survived`` columns.

    Returns:
        None. The chart is displayed with ``plt.show()``.
    """
    survived_m = data_train.Survived[data_train.Sex == 'male'].value_counts()
    survived_f = data_train.Survived[data_train.Sex == 'female'].value_counts()
    df = pd.DataFrame({'男性': survived_m, '女性': survived_f})
    df.plot(kind='bar', stacked=True)
    plt.title('按性别查看获救情况')
    # The x-axis carries the Survived values (the frame index), not the sex;
    # sex is shown by the legend colours. The label previously said "性别".
    plt.xlabel('是否获救')
    plt.ylabel('人数')
    plt.show()
# 2.3 Survival outcome by sex within each cabin-class group
def sysnax_data_by_Pclass_sex(data_train):
    """Show four side-by-side bar charts of survival by sex and class group.

    Panels, left to right: female / classes other than 3, female / class 3,
    male / classes other than 3, male / class 3. All panels share one y-axis.

    Args:
        data_train: DataFrame with ``Sex``, ``Pclass`` and ``Survived``
            columns; ``Survived`` is expected to hold 0/1.

    Returns:
        None. The figure is displayed with ``plt.show()``.
    """
    # (sex, high-class group?, legend text, bar colour), left to right.
    panels = (
        ('female', True, '女性/高级舱', '#FA2479'),
        ('female', False, '女性/低级舱', 'pink'),
        ('male', True, '男性/高级舱', 'lightblue'),
        ('male', False, '男性/低级舱', 'steelblue'),
    )

    fig = plt.figure()
    fig.set(alpha=0.65)
    # A figure-level title: plt.title() here would implicitly create an extra
    # full-size axes underneath the four subplots.
    fig.suptitle('根据舱等级和性别的获救情况')

    first_ax = None
    for position, (sex, high_class, legend_text, colour) in enumerate(panels, start=1):
        ax = fig.add_subplot(1, 4, position, sharey=first_ax)
        if first_ax is None:
            first_ax = ax

        if high_class:
            class_mask = data_train.Pclass != 3
        else:
            class_mask = data_train.Pclass == 3
        # Combine both conditions into one aligned mask instead of chaining
        # two boolean selections.
        counts = data_train.Survived[(data_train.Sex == sex) & class_mask].value_counts()
        # value_counts() orders by frequency, so pin the order to
        # [survived, not survived] to match the fixed tick labels below.
        counts = counts.reindex([1, 0], fill_value=0)

        counts.plot(kind='bar', ax=ax, color=colour)
        ax.set_xticklabels(['获救', '未获救'], rotation=0)
        # legend() needs a list; a bare string is treated as a sequence of
        # single-character labels.
        ax.legend([legend_text], loc='best')
    plt.show()
# 2.4 Survival outcome for each port of embarkation
def sysnax_data_by_embarked(data_train):
    """Show a stacked bar chart of survival counts per embarkation port.

    Args:
        data_train: DataFrame with ``Embarked`` and ``Survived`` columns;
            ``Survived`` is compared against 0 and 1.

    Returns:
        None. The chart is displayed with ``plt.show()``.
    """
    port = data_train.Embarked
    outcome = data_train.Survived
    counts_by_outcome = {
        '获救': port[outcome == 1].value_counts(),
        '未获救': port[outcome == 0].value_counts(),
    }
    # One bar per port (the frame index), one stacked segment per outcome.
    pd.DataFrame(counts_by_outcome).plot(kind='bar', stacked=True)
    plt.title('各登陆港口获救情况')
    plt.xlabel('登陆港口')
    plt.ylabel('获救人数')
    plt.show()
# 2.5 Survival counts grouped by SibSp and by Parch
def sysnax_data_by_sibsp(data_train):
    """Print passenger counts per (SibSp, Survived) and (Parch, Survived).

    Two tables are printed, SibSp first and then Parch. Each table counts
    the non-null ``PassengerId`` values in every group.

    Args:
        data_train: DataFrame with ``SibSp``, ``Parch``, ``Survived`` and
            ``PassengerId`` columns.

    Returns:
        None.
    """
    for family_column in ('SibSp', 'Parch'):
        grouped = data_train.groupby([family_column, 'Survived'])
        table = pd.DataFrame(grouped.count()['PassengerId'])
        print(table)
# 2.6 Survival outcome by whether a Cabin value is recorded
def sysnax_data_by_cabin(data_train):
    """Show a stacked bar chart of survival counts by Cabin presence.

    Args:
        data_train: DataFrame with ``Cabin`` and ``Survived`` columns.

    Returns:
        None. The chart is displayed with ``plt.show()``.
    """
    cabin_missing = pd.isnull(data_train.Cabin)
    cabin_present = pd.notnull(data_train.Cabin)
    counts = pd.DataFrame({
        '有': data_train.Survived[cabin_present].value_counts(),
        '没有': data_train.Survived[cabin_missing].value_counts(),
    })
    # Transpose so each bar is a Cabin group and the stacked segments are
    # the Survived values.
    counts.transpose().plot(kind='bar', stacked=True)
    plt.title('按Cabin查看是否获救情况')
    plt.xlabel('Cabin有无')
    plt.ylabel('人数')
    plt.show()
def sysnax_data_by_plot(data_train):
    """Build a 2x3 overview figure of basic passenger attribute distributions.

    Panels: Survived counts, Pclass counts, Age against Survived (scatter),
    Age density per passenger class (KDE), and Embarked counts. The sixth
    grid cell is left empty.

    The figure is only built here, not displayed; call ``plt.show()``
    afterwards to see it. The per-attribute survival charts (sections
    2.1-2.6) are separate functions in this module (``survived_by_pclass``,
    ``sysnax_data_by_sex``, ``sysnax_data_by_Pclass_sex``,
    ``sysnax_data_by_embarked``, ``sysnax_data_by_sibsp``,
    ``sysnax_data_by_cabin``) and can be called independently.

    Side effects:
        Updates the global Matplotlib rcParams so Chinese labels render
        with the SimHei font.

    Args:
        data_train: DataFrame with ``Survived``, ``Pclass``, ``Age`` and
            ``Embarked`` columns.

    Returns:
        None.
    """
    # 1. Distribution of the passenger attributes
    plt.rcParams['font.sans-serif'] = ['SimHei']
    # SimHei has no glyph for the Unicode minus sign; the KDE x-axis extends
    # below zero, so fall back to the ASCII hyphen for negative ticks.
    plt.rcParams['axes.unicode_minus'] = False

    fig = plt.figure()
    fig.set(alpha=0.2)
    grid_shape = (2, 3)

    ax = plt.subplot2grid(grid_shape, (0, 0))
    data_train.Survived.value_counts().plot(kind='bar', ax=ax)
    ax.set_title('1 survied')
    ax.set_ylabel('people num')

    ax = plt.subplot2grid(grid_shape, (0, 1))
    data_train.Pclass.value_counts().plot(kind='bar', ax=ax)
    ax.set_title('pclass')
    ax.set_ylabel('num')

    ax = plt.subplot2grid(grid_shape, (0, 2))
    ax.scatter(data_train.Survived, data_train.Age)
    ax.set_title('age/survived')
    ax.set_ylabel('age')
    # Pass the on/off flag positionally: its keyword was renamed from ``b``
    # to ``visible`` in newer Matplotlib releases.
    ax.grid(True, which='major', axis='y')

    ax = plt.subplot2grid(grid_shape, (1, 0))
    for pclass in (1, 2, 3):
        data_train.Age[data_train.Pclass == pclass].plot(kind='kde', ax=ax)
    ax.set_title('各等级的年龄分布')
    ax.set_xlabel('年龄')
    ax.set_ylabel('密度')
    ax.legend(('头等舱', '2等舱', '3等舱'), loc='best')

    ax = plt.subplot2grid(grid_shape, (1, 1))
    data_train.Embarked.value_counts().plot(kind='bar', ax=ax)
    ax.set_title('各登岸口上船人数')
    ax.set_ylabel('人数')
from sklearn.ensemble import RandomForestRegressor
# 3.1 Fill in missing Age values with a RandomForestRegressor
def set_missing_ages(df):
    """Impute missing ``Age`` values with a random-forest regression.

    A ``RandomForestRegressor`` is fitted on the rows whose ``Age`` is known,
    using ``Fare``, ``Parch``, ``SibSp`` and ``Pclass`` as features, and its
    predictions are written into the rows whose ``Age`` is null.

    Args:
        df: DataFrame with ``Age``, ``Fare``, ``Parch``, ``SibSp`` and
            ``Pclass`` columns. It is modified in place.

    Returns:
        A tuple ``(df, rfr)``: the same DataFrame with ``Age`` filled in and
        the fitted regressor, so it can be reused on other data.
    """
    # Feature order matters: anything reusing ``rfr`` must pass the columns
    # in this same order.
    feature_columns = ['Fare', 'Parch', 'SibSp', 'Pclass']
    age_missing = df.Age.isnull()

    # 1. Split off the passengers whose age is known; they are the training set.
    known = df.loc[~age_missing]
    # ``.values`` replaces ``as_matrix()``, which newer pandas no longer provides.
    y = known['Age'].values
    X = known[feature_columns].values

    # 2. Fit the regressor.
    rfr = RandomForestRegressor(random_state=0, n_estimators=2, n_jobs=-1)
    rfr.fit(X, y)

    # 3. Predict and fill in the unknown ages. Skipped when nothing is
    #    missing, because predict() rejects an empty sample array.
    if age_missing.any():
        unknown_features = df.loc[age_missing, feature_columns].values
        df.loc[age_missing, 'Age'] = rfr.predict(unknown_features)
    return df, rfr
# 3.2 Reduce Cabin to a present/absent flag
def set_cabin_type(data_train):
    """Replace ``Cabin`` with 'Yes' where a value exists and 'No' where null.

    Args:
        data_train: DataFrame with a ``Cabin`` column. It is modified in
            place.

    Returns:
        The same DataFrame object, with ``Cabin`` holding only 'Yes'/'No'.
    """
    has_cabin = data_train['Cabin'].notnull()
    data_train['Cabin'] = has_cabin.map({True: 'Yes', False: 'No'})
    return data_train
# 3.3 Logistic regression needs numeric inputs, so one-hot encode the
# categorical columns.
def set_feature_num(data_train):
    """One-hot encode the categorical columns and drop the text columns.

    Indicator columns named ``<column>_<value>`` are appended for ``Cabin``,
    ``Embarked``, ``Sex`` and ``Pclass`` (in that order); then those four
    source columns plus ``Name`` and ``Ticket`` are removed.

    Args:
        data_train: DataFrame containing the ``Pclass``, ``Name``, ``Sex``,
            ``Ticket``, ``Cabin`` and ``Embarked`` columns. Not modified.

    Returns:
        A new DataFrame with the remaining original columns followed by the
        indicator columns.
    """
    encoded_columns = ('Cabin', 'Embarked', 'Sex', 'Pclass')
    indicator_frames = [
        pd.get_dummies(data_train[name], prefix=name) for name in encoded_columns
    ]
    combined = pd.concat([data_train] + indicator_frames, axis=1)
    return combined.drop(
        ['Pclass', 'Name', 'Sex', 'Ticket', 'Cabin', 'Embarked'], axis=1
    )
from sklearn.preprocessing import StandardScaler
# 3.4 Standardize Age and Fare
def set_feature_normalized(data_train):
    """Standardize the ``Age`` and ``Fare`` columns with ``StandardScaler``.

    The scaler is re-fitted for each column, so each column is scaled using
    its own statistics from ``data_train``.

    Args:
        data_train: DataFrame with numeric ``Age`` and ``Fare`` columns. It
            is modified in place.

    Returns:
        The same DataFrame object with both columns replaced by their
        scaled values.
    """
    scaler = StandardScaler()
    for column in ('Age', 'Fare'):
        # The scaler expects a 2-D (n_samples, 1) array.
        column_vector = data_train[column].values.reshape(-1, 1)
        data_train[column] = scaler.fit_transform(column_vector)
    return data_train
# 3. Basic data preprocessing
def pre_deal(data_train):
    """Run the preprocessing steps in order and print a preview.

    Steps: impute missing ages (``set_missing_ages``), flag cabin presence
    (``set_cabin_type``), one-hot encode categoricals (``set_feature_num``)
    and standardize Age/Fare (``set_feature_normalized``).

    Args:
        data_train: Raw training DataFrame as expected by those helpers.

    Returns:
        A tuple ``(prepared, rfr)``: the preprocessed DataFrame and the
        regressor fitted during age imputation.
    """
    # 3.1 Fill in missing ages; keep the fitted regressor for the caller.
    with_ages, rfr = set_missing_ages(data_train)
    # 3.2 Collapse Cabin to 'Yes'/'No'.
    with_cabin_flag = set_cabin_type(with_ages)
    # 3.3 Turn the categorical columns into numeric indicator columns.
    numeric_only = set_feature_num(with_cabin_flag)
    # 3.4 Standardize Age and Fare.
    prepared = set_feature_normalized(numeric_only)
    print((prepared.head()))
    return prepared, rfr
from sklearn import linear_model
# 4. Logistic-regression model
def build_model_by_logic(data_train):
    """Fit an L1-regularised logistic regression that predicts ``Survived``.

    Prints a table pairing each feature column with its fitted coefficient
    (a positive coefficient means the feature pushes towards ``Survived``).

    Args:
        data_train: Preprocessed DataFrame containing ``Survived`` plus the
            feature columns selected by the regex below.

    Returns:
        The fitted ``LogisticRegression`` classifier.
    """
    # 1. Select the label and feature columns by regex. The pattern is kept
    #    identical to the one used at prediction time so both pick the same
    #    features. Note 'Age*' / 'Fare*' are regexes, not globs: they match
    #    any column name containing 'Ag' / 'Far'.
    train_df = data_train.filter(regex='Survived|Age*|SibSp|Parch|Fare*|Cabin_.*|Embarked_.*|Sex_.*|Pclass_.*')

    # 2. Labels, taken by name rather than assuming Survived is column 0.
    y = train_df['Survived'].values
    # 3. Features. Cast to float so indicator columns of any dtype
    #    (uint8/bool) end up in one numeric array.
    features = train_df.drop('Survived', axis=1)
    X = features.astype(float).values

    # 4. Fit. The L1 penalty needs a solver that supports it; 'liblinear'
    #    does, while the current scikit-learn default solver does not.
    clf = linear_model.LogisticRegression(
        C=1.0, penalty='l1', solver='liblinear', tol=1e-6
    )
    clf.fit(X, y)

    # coef_ has shape (1, n_features) here; flatten it so each row of the
    # table holds a plain number.
    a = pd.DataFrame({
        'columns': list(features.columns),
        'coef': clf.coef_.ravel(),
    })
    print(a)
    return clf
# 5. Predict on the test set
def predict_by_logic(clf, rfr, data_train, test_path='test.csv',
                     output_path='predict_result.csv'):
    """Preprocess the test set, predict survival and write the result CSV.

    The test data goes through the same steps as the training data: missing
    ``Fare`` is set to 0, missing ``Age`` is predicted with ``rfr``, ``Cabin``
    is flagged, categoricals are one-hot encoded and Age/Fare standardized.

    NOTE(review): Age/Fare are standardized with a scaler fitted on the test
    data itself (via ``set_feature_normalized``), not with the training
    statistics. This is unchanged from the original; reusing the training
    scaler would need it to be passed in.

    Args:
        clf: Fitted classifier exposing ``predict``.
        rfr: Fitted age regressor; expects the features in the order
            Fare, Parch, SibSp, Pclass.
        data_train: Unused; kept so existing callers keep working.
        test_path: CSV file to read the test passengers from.
        output_path: CSV file to write ``PassengerId``/``Survived`` to.

    Returns:
        None. The predictions are written to ``output_path``.
    """
    # 1. Read the test data.
    data_test = pd.read_csv(test_path)
    data_test.loc[data_test.Fare.isnull(), 'Fare'] = 0

    # 2. Fill in missing ages with the regressor fitted on the training set.
    #    Skipped when nothing is missing, because predict() rejects an empty
    #    sample array.
    age_missing = data_test.Age.isnull()
    if age_missing.any():
        age_features = data_test.loc[
            age_missing, ['Fare', 'Parch', 'SibSp', 'Pclass']
        ].values
        data_test.loc[age_missing, 'Age'] = rfr.predict(age_features)

    # 3. Same feature engineering as the training data, via the shared
    #    helpers instead of an inline copy of their logic.
    data_test = set_cabin_type(data_test)
    df_test = set_feature_num(data_test)
    df_test = set_feature_normalized(df_test)

    # 4. Select the feature columns with the same regex used for training
    #    (the test set has no Survived column, so only features match).
    test_df = df_test.filter(regex='Survived|Age*|SibSp|Parch|Fare*|Cabin_.*|Embarked_.*|Sex_.*|Pclass_.*')

    # 5. Predict on a plain float array and write the result.
    predict_result = clf.predict(test_df.astype(float).values)
    result = pd.DataFrame({
        'PassengerId': data_test['PassengerId'].values,
        'Survived': predict_result.astype(np.int32),
    })
    result.to_csv(output_path, index=False)
if __name__ == '__main__':
    # Step 1: load the raw training set with pandas and preview it.
    raw_train = pd.read_csv(r'train.csv')
    print(raw_train.head())
    # Step 2 (optional, disabled): exploratory plots, e.g.
    # sysnax_data_by_plot(raw_train).
    # Step 3: basic preprocessing; also yields the fitted age regressor.
    prepared_train, age_model = pre_deal(raw_train)
    # Step 4: fit the logistic-regression model.
    classifier = build_model_by_logic(prepared_train)
    # Step 5: predict on the test set and write the result file.
    predict_by_logic(classifier, age_model, prepared_train)