import pandas as pd
# Read the labelled training table and the unlabelled test table from the
# current working directory.
data_train = pd.read_csv('train.csv')
data_test = pd.read_csv('test.csv')
PassengerId | Survived | Pclass | Name | Sex | Age | SibSp | Parch | Ticket | Fare | Cabin | Embarked | |
0 | 1 | 0 | 3 | Braund, Mr. Owen Harris | male | 22.0 | 1 | 0 | A/5 21171 | 7.2500 | NaN | S |
1 | 2 | 1 | 1 | Cumings, Mrs. John Bradley (Florence Briggs Th... | female | 38.0 | 1 | 0 | PC 17599 | 71.2833 | C85 | C |
2 | 3 | 1 | 3 | Heikkinen, Miss. Laina | female | 26.0 | 0 | 0 | STON/O2. 3101282 | 7.9250 | NaN | S |
3 | 4 | 1 | 1 | Futrelle, Mrs. Jacques Heath (Lily May Peel) | female | 35.0 | 1 | 0 | 113803 | 53.1000 | C123 | S |
4 | 5 | 0 | 3 | Allen, Mr. William Henry | male | 35.0 | 0 | 0 | 373450 | 8.0500 | NaN | S |
PassengerId | Pclass | Name | Sex | Age | SibSp | Parch | Ticket | Fare | Cabin | Embarked | |
0 | 892 | 3 | Kelly, Mr. James | male | 34.5 | 0 | 0 | 330911 | 7.8292 | NaN | Q |
1 | 893 | 3 | Wilkes, Mrs. James (Ellen Needs) | female | 47.0 | 1 | 0 | 363272 | 7.0000 | NaN | S |
2 | 894 | 2 | Myles, Mr. Thomas Francis | male | 62.0 | 0 | 0 | 240276 | 9.6875 | NaN | Q |
3 | 895 | 3 | Wirz, Mr. Albert | male | 27.0 | 0 | 0 | 315154 | 8.6625 | NaN | S |
4 | 896 | 3 | Hirvonen, Mrs. Alexander (Helga E Lindqvist) | female | 22.0 | 1 | 1 | 3101298 | 12.2875 | NaN | S |
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
PassengerId 891 non-null int64
Survived 891 non-null int64
Pclass 891 non-null int64
Name 891 non-null object
Sex 891 non-null object
Age 714 non-null float64
SibSp 891 non-null int64
Parch 891 non-null int64
Ticket 891 non-null object
Fare 891 non-null float64
Cabin 204 non-null object
Embarked 889 non-null object
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB
PassengerId | Survived | Pclass | Age | SibSp | Parch | Fare | |
count | 891.000000 | 891.000000 | 891.000000 | 714.000000 | 891.000000 | 891.000000 | 891.000000 |
mean | 446.000000 | 0.383838 | 2.308642 | 29.699118 | 0.523008 | 0.381594 | 32.204208 |
std | 257.353842 | 0.486592 | 0.836071 | 14.526497 | 1.102743 | 0.806057 | 49.693429 |
min | 1.000000 | 0.000000 | 1.000000 | 0.420000 | 0.000000 | 0.000000 | 0.000000 |
25% | 223.500000 | 0.000000 | 2.000000 | 20.125000 | 0.000000 | 0.000000 | 7.910400 |
50% | 446.000000 | 0.000000 | 3.000000 | 28.000000 | 0.000000 | 0.000000 | 14.454200 |
75% | 668.500000 | 1.000000 | 3.000000 | 38.000000 | 1.000000 | 0.000000 | 31.000000 |
max | 891.000000 | 1.000000 | 3.000000 | 80.000000 | 8.000000 | 6.000000 | 512.329200 |
def simplify_ages(df):
    """Bucket the numeric ``Age`` column into labelled age groups.

    Missing ages are replaced with -0.5 first so that they fall into the
    ``'Unknown'`` bucket. Bin edges are right-inclusive (the ``pandas.cut``
    default), e.g. an age of exactly 5 is ``'Baby'`` and 18 is ``'Teenager'``.

    Args:
        df: DataFrame containing a numeric ``Age`` column.

    Returns:
        A copy of ``df`` whose ``Age`` column is categorical with the labels
        Unknown / Baby / Child / Teenager / Student / Young Adult / Adult /
        Senior. The input frame is left unmodified, so the function can be
        called again on the same raw frame.
    """
    age_bins = (-1, 0, 5, 12, 18, 25, 35, 60, 120)
    age_labels = ['Unknown', 'Baby', 'Child', 'Teenager', 'Student',
                  'Young Adult', 'Adult', 'Senior']
    # Work on a copy: overwriting the caller's numeric column with categories
    # would make a second call on the same frame fail in fillna(-0.5).
    result = df.copy()
    result['Age'] = pd.cut(result['Age'].fillna(-0.5), age_bins, labels=age_labels)
    return result
def simplify_cabin(df):
    """Reduce the ``Cabin`` column to its first character.

    Missing cabins become ``'N'``. Blank cabin strings, which have no first
    character, are also mapped to ``'N'`` instead of raising ``IndexError``.

    Args:
        df: DataFrame containing a string ``Cabin`` column.

    Returns:
        A copy of ``df`` with ``Cabin`` replaced by a single-character code.
        The input frame is left unmodified.
    """
    result = df.copy()
    first_char = result['Cabin'].fillna('N').str[0]
    # .str[0] yields NaN for an empty string; treat that like a missing cabin.
    result['Cabin'] = first_char.fillna('N')
    return result
def simplify_fare(df):
    """Bucket the numeric ``Fare`` column into labelled fare bands.

    Missing fares are replaced with -0.5 first so that they fall into the
    ``'Unknown'`` bucket. Bin edges are right-inclusive (the ``pandas.cut``
    default), so a fare of exactly 0 is also labelled ``'Unknown'``.

    Args:
        df: DataFrame containing a numeric ``Fare`` column.

    Returns:
        A copy of ``df`` whose ``Fare`` column is categorical with the labels
        Unknown / 1_quartile / 2_quartile / 3_quartile / 4_quartile. The
        input frame is left unmodified, so the function can be called again
        on the same raw frame.
    """
    fare_bins = (-1, 0, 8, 15, 31, 1000)
    fare_labels = ['Unknown', '1_quartile', '2_quartile', '3_quartile', '4_quartile']
    # Work on a copy: overwriting the caller's numeric column with categories
    # would make a second call on the same frame fail in fillna(-0.5).
    result = df.copy()
    result['Fare'] = pd.cut(result['Fare'].fillna(-0.5), fare_bins, labels=fare_labels)
    return result
def simplify_drop(df):
    """Return ``df`` without the ``Name``, ``Ticket`` and ``Embarked`` columns."""
    unused_columns = ['Name', 'Ticket', 'Embarked']
    return df.drop(columns=unused_columns)
def transform_features(df):
    """Run the age, cabin, fare and column-drop simplifications in order.

    Each step receives the frame returned by the previous one; the result of
    the last step is returned.
    """
    pipeline = (simplify_ages, simplify_cabin, simplify_fare, simplify_drop)
    for step in pipeline:
        df = step(df)
    return df
# Load the training CSV again (presumably so the transform starts from raw
# columns) and run the feature simplification on both tables.
data_train = transform_features(pd.read_csv('train.csv'))
data_test = transform_features(data_test)
PassengerId | Survived | Pclass | Sex | Age | SibSp | Parch | Fare | Cabin | |
0 | 1 | 0 | 3 | male | Student | 1 | 0 | 1_quartile | N |
1 | 2 | 1 | 1 | female | Adult | 1 | 0 | 4_quartile | C |
2 | 3 | 1 | 3 | female | Young Adult | 0 | 0 | 1_quartile | N |
3 | 4 | 1 | 1 | female | Young Adult | 1 | 0 | 4_quartile | C |
4 | 5 | 0 | 3 | male | Young Adult | 0 | 0 | 2_quartile | N |
选取我们需要的那几个列作为输入,对于船票号(Ticket)、客舱号(Cabin)和姓名(Name)我就舍弃了,姓名没什么用
PassengerId 0
Pclass 0
Sex 0
Age 86
SibSp 0
Parch 0
Fare 1
Embarked 0
dtype: int64
# Impute the training table: missing ages get the mean of the observed ages,
# missing embarkation ports get 'S'.
# NOTE(review): data_tr is not defined in the visible part of this file;
# presumably it is a column subset of the raw training data — confirm.
age_mean = data_tr['Age'].mean()
data_tr['Age'] = data_tr['Age'].fillna(age_mean)
data_tr['Embarked'] = data_tr['Embarked'].fillna('S')
PassengerId 0
Survived 0
Pclass 0
Sex 0
Age 0
SibSp 0
Parch 0
Fare 0
Embarked 0
dtype: int64
用数值编码性别和登船港口(S、C、Q 等),并用均值填充缺失的年龄和票价,因为随机森林的输入需要数值,字符不行
# Impute the test table with its own column means. Each mean gets its own
# name so that the Fare mean is not stored in a variable called age_mean.
age_mean = data_te['Age'].mean()
data_te['Age'] = data_te['Age'].fillna(age_mean)
fare_mean = data_te['Fare'].mean()
data_te['Fare'] = data_te['Fare'].fillna(fare_mean)

# Integer-encode the two string columns with one shared mapping so the test
# and training tables cannot drift apart. A value missing from a mapping
# becomes NaN and makes astype(int) raise instead of passing through.
sex_codes = {'female': 0, 'male': 1}
embarked_codes = {'S': 0, 'C': 1, 'Q': 2}
for frame in (data_te, data_tr):
    frame['Sex'] = frame['Sex'].map(sex_codes).astype(int)
    frame['Embarked'] = frame['Embarked'].map(embarked_codes).astype(int)
# Bar plots of survival rate per binned feature, split by sex.
# NOTE(review): `sns` is not imported in the visible part of this file;
# presumably `import seaborn as sns` ran in a cell not shown — confirm.
for feature in ('Age', 'Cabin', 'Fare'):
    sns.barplot(x=feature, y='Survived', hue='Sex', data=data_train)
from sklearn.model_selection import train_test_split
# PassengerId is dropped together with the label: the id is not used as a
# model input.
X_all = data_tr.drop(columns=['PassengerId', 'Survived'])
y_all = data_tr['Survived']
# Hold out 20% of the labelled rows as the evaluation split.
p = 0.2
X_train, X_test, y_train, y_test = train_test_split(
    X_all, y_all, test_size=p, random_state=23)
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import make_scorer, accuracy_score
from sklearn.model_selection import GridSearchCV
# Tune a random forest with an exhaustive grid search scored by accuracy.
clf = RandomForestClassifier()
parameters = {
    'n_estimators': [4, 6, 9],
    # 'auto' was removed from this list: for classifiers it was an alias of
    # 'sqrt', and newer scikit-learn releases reject the value.
    'max_features': ['log2', 'sqrt'],
    # Split-quality criteria: entropy and Gini impurity.
    'criterion': ['entropy', 'gini'],
    'max_depth': [2, 3, 5, 10],
    'min_samples_split': [2, 3, 5],
    'min_samples_leaf': [1, 5, 8],
}
acc_scorer = make_scorer(accuracy_score)
grid_obj = GridSearchCV(clf, parameters, scoring=acc_scorer)
grid_obj = grid_obj.fit(X_train, y_train)
# Continue with the estimator for the best-scoring parameter combination.
clf = grid_obj.best_estimator_
RandomForestClassifier(bootstrap=True, class_weight=None, criterion='entropy',
max_depth=5, max_features='sqrt', max_leaf_nodes=None,
min_impurity_decrease=0.0, min_impurity_split=None,
min_samples_leaf=1, min_samples_split=3,
min_weight_fraction_leaf=0.0, n_estimators=4,
n_jobs=None, oob_score=False, random_state=None,
# Score the tuned model on the held-out split. The validation predictions get
# their own name so they are not overwritten, unused, by the test-set
# predictions below.
val_predictions = clf.predict(X_test)
print(accuracy_score(y_test, val_predictions))

# Predict the unlabelled test table; PassengerId is not a model input.
predictions = clf.predict(data_te.drop('PassengerId', axis=1))
# NOTE(review): if this frame is written out as a Kaggle submission, the id
# column is presumably expected to be named 'PassengerId' — confirm.
output = pd.DataFrame({'Passengers': data_te['PassengerId'], 'Survived': predictions})
Passengers | Survived | |
0 | 892 | 0 |
1 | 893 | 0 |
2 | 894 | 0 |
3 | 895 | 0 |
4 | 896 | 0 |
结果是 0.77990