import pandas as pa
import numpy as np
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier
from sklearn.cross_validation import KFold
# Load the Titanic training set into a DataFrame.
filename = "train.csv"
titanic = pa.read_csv(filename)
# Preview the first rows; the value is only displayed in an interactive
# session or notebook, a plain script discards it.
titanic.head()
3.观察数据的简单数据特征
# Summary statistics of the numeric columns. The single-argument call form of
# print behaves the same on Python 2 and Python 3.
print(titanic.describe())
结果:
PassengerId Survived Pclass Age SibSp \
count 891.000000 891.000000 891.000000 714.000000 891.000000
mean 446.000000 0.383838 2.308642 29.699118 0.523008
std 257.353842 0.486592 0.836071 14.526497 1.102743
min 1.000000 0.000000 1.000000 0.420000 0.000000
25% 223.500000 0.000000 2.000000 NaN 0.000000
50% 446.000000 0.000000 3.000000 NaN 0.000000
75% 668.500000 1.000000 3.000000 NaN 1.000000
max 891.000000 1.000000 3.000000 80.000000 8.000000
Parch Fare
count 891.000000 891.000000
mean 0.381594 32.204208
std 0.806057 49.693429
min 0.000000 0.000000
25% 0.000000 7.910400
50% 0.000000 14.454200
75% 0.000000 31.000000
max 6.000000 512.329200
# Replace missing ages with the median age so that every row has a usable
# numeric value, then print the summary statistics again.
titanic["Age"] = titanic["Age"].fillna(titanic["Age"].median())
print(titanic.describe())
结果:
PassengerId Survived Pclass Age SibSp \
count 891.000000 891.000000 891.000000 891.000000 891.000000
mean 446.000000 0.383838 2.308642 29.361582 0.523008
std 257.353842 0.486592 0.836071 13.019697 1.102743
min 1.000000 0.000000 1.000000 0.420000 0.000000
25% 223.500000 0.000000 2.000000 22.000000 0.000000
50% 446.000000 0.000000 3.000000 28.000000 0.000000
75% 668.500000 1.000000 3.000000 35.000000 1.000000
max 891.000000 1.000000 3.000000 80.000000 8.000000
Parch Fare
count 891.000000 891.000000
mean 0.381594 32.204208
std 0.806057 49.693429
min 0.000000 0.000000
25% 0.000000 7.910400
50% 0.000000 14.454200
75% 0.000000 31.000000
max 6.000000 512.329200
# List the distinct values of the two string-valued columns before encoding
# them as numbers.
print(titanic["Sex"].unique())
print(titanic["Embarked"].unique())
结果:
['male' 'female']
['S' 'C' 'Q' nan]
2) 然后,将相应字符串的位置附上对应的 Int/float 值:
titanic.loc[titanic["Sex"]=="male","Sex"] = 0;
# Encode the remaining categories in place: female -> 1,
# embarkation ports S / C / Q -> 0 / 1 / 2.
titanic.loc[titanic["Sex"] == "female", "Sex"] = 1
titanic.loc[titanic["Embarked"] == "S", "Embarked"] = 0
titanic.loc[titanic["Embarked"] == "C", "Embarked"] = 1
titanic.loc[titanic["Embarked"] == "Q", "Embarked"] = 2
# Preview the encoded rows (only displayed in an interactive session).
titanic.head()
结果:
def data_proprocess(filename="train.csv"):
    """Load a Titanic CSV file and make its columns usable by the classifiers.

    Missing ``Age`` values (and missing ``Fare`` values, when the column is
    present) are replaced by the column median, missing ``Embarked`` values
    by ``'S'``, and the ``Sex`` / ``Embarked`` strings are replaced by
    integer codes: male=0, female=1; S=0, C=1, Q=2.

    Parameters
    ----------
    filename : str, optional
        Path of the CSV file to read. Defaults to ``"train.csv"``, which is
        what the function read when it took no argument.

    Returns
    -------
    pandas.DataFrame
        The loaded table with the columns above cleaned and encoded. Any
        ``Sex`` / ``Embarked`` value outside the known categories becomes NaN.
    """
    import pandas as pa

    titanic = pa.read_csv(filename)

    # Impute numeric gaps with the median of the file being loaded.
    titanic["Age"] = titanic["Age"].fillna(titanic["Age"].median())
    if "Fare" in titanic.columns:
        titanic["Fare"] = titanic["Fare"].fillna(titanic["Fare"].median())
    titanic["Embarked"] = titanic["Embarked"].fillna("S")

    # Series.map builds a new numeric column instead of writing integers into
    # a string column cell by cell, so the result has a numeric dtype.
    titanic["Sex"] = titanic["Sex"].map({"male": 0, "female": 1})
    titanic["Embarked"] = titanic["Embarked"].map({"S": 0, "C": 1, "Q": 2})
    return titanic
def classify_LinearRegression(titanic):
    """Estimate how well a linear regression predicts ``Survived``.

    The rows are split into 3 consecutive (unshuffled) folds; for each fold a
    ``LinearRegression`` is fitted on the other two folds and used to predict
    the held-out rows. Predictions above 0.5 count as class 1, the rest as
    class 0.

    Parameters
    ----------
    titanic : pandas.DataFrame
        Table holding the predictor columns listed in ``predictors`` plus a
        ``Survived`` column, all numeric.

    Returns
    -------
    float
        Fraction of rows whose thresholded prediction equals ``Survived``.
    """
    import numpy as np
    from sklearn.linear_model import LinearRegression

    predictors = ["Pclass", "Sex", "Age", "SibSp", "Parch", "Fare", "Embarked"]
    features = titanic[predictors]
    target = titanic["Survived"]

    try:
        from sklearn.model_selection import KFold
        # No shuffling, so random_state would have no effect; it is omitted
        # because the model_selection KFold may reject it when shuffle is off.
        folds = KFold(n_splits=3).split(features)
    except ImportError:
        # Older scikit-learn releases only ship the cross_validation module.
        from sklearn.cross_validation import KFold
        folds = KFold(titanic.shape[0], n_folds=3, random_state=1)

    alg = LinearRegression()
    fold_predictions = []
    for train, test in folds:
        alg.fit(features.iloc[train, :], target.iloc[train])
        fold_predictions.append(alg.predict(features.iloc[test, :]))

    # Unshuffled folds are consecutive row ranges, so concatenating them in
    # order lines the predictions up with the rows of `titanic`.
    predictions = np.concatenate(fold_predictions, axis=0)
    predicted_labels = (predictions > 0.5).astype(int)

    # Accuracy is the share of matching labels. Summing the predicted values
    # of the matching rows would only count the correctly predicted 1s.
    return float(np.mean(predicted_labels == target.values))
def classify_LogisticRegression(titanic):
    """Return the mean 3-fold cross-validation score of a logistic regression.

    Parameters
    ----------
    titanic : pandas.DataFrame
        Table holding the predictor columns listed in ``predictors`` plus a
        ``Survived`` column, all numeric.

    Returns
    -------
    float
        Mean of the three scores returned by ``cross_val_score``.
    """
    from sklearn.linear_model import LogisticRegression
    try:
        from sklearn.model_selection import cross_val_score
    except ImportError:
        # Older scikit-learn releases only ship the cross_validation module.
        from sklearn.cross_validation import cross_val_score

    predictors = ["Pclass", "Sex", "Age", "SibSp", "Parch", "Fare", "Embarked"]
    alg = LogisticRegression(random_state=1)
    scores = cross_val_score(alg, titanic[predictors], titanic["Survived"], cv=3)
    return scores.mean()
# Evaluate both models, each on a freshly loaded and preprocessed table.
# The single-argument call form of print works on Python 2 and Python 3.
print("LinearRegression Classification result is :")
print(classify_LinearRegression(data_proprocess()))
print("LogisticRegression Classification result is :")
print(classify_LogisticRegression(data_proprocess()))
LinearRegression Classification result is :
0.261503928171
LogisticRegression Classification result is :
0.787878787879
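从结果可以看出,逻辑回归在这个分类问题上的精度更高。注意:上面线性回归的 0.26 并不是真实的准确率——代码中的 sum(predictions[predictions == titanic['Survived']]) 只累加了"预测正确且预测值为 1"的样本;如果按"预测正确的样本所占比例"来计算,线性回归的准确率约为 0.78,只比逻辑回归略低。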
def classify_RandomForestClassifier(train_data, test_data,
                                    output_path="logistic_regression_predictions.csv"):
    """Cross-validate a random forest, then predict the test set and save it.

    The forest is scored with 3-fold cross-validation on ``train_data``,
    refitted on all of ``train_data``, and used to predict ``Survived`` for
    ``test_data``. The predictions are written as a two-column CSV
    (``PassengerId``, ``Survived``).

    Parameters
    ----------
    train_data : pandas.DataFrame
        Table with the predictor columns and a ``Survived`` column.
    test_data : pandas.DataFrame
        Table with the predictor columns and a ``PassengerId`` column.
    output_path : str, optional
        Where to write the predictions. The default keeps the file name the
        function has always written, even though the model is a random forest.

    Returns
    -------
    float
        Mean of the three cross-validation scores on ``train_data``.
    """
    import numpy as np
    import pandas as pa
    from sklearn.ensemble import RandomForestClassifier
    try:
        from sklearn.model_selection import cross_val_score
    except ImportError:
        # Older scikit-learn releases only ship the cross_validation module.
        from sklearn.cross_validation import cross_val_score

    predictors = ["Pclass", "Sex", "Age", "SibSp", "Parch", "Fare", "Embarked"]
    clf = RandomForestClassifier(n_estimators=10, max_depth=None,
                                 min_samples_split=2, random_state=0)
    scores = cross_val_score(clf, train_data[predictors],
                             train_data["Survived"], cv=3)

    # Refit on the complete training table before predicting the test rows.
    clf.fit(train_data[predictors], train_data["Survived"])
    predict_result = clf.predict(test_data[predictors])

    # .values replaces Series.as_matrix(), which newer pandas no longer has.
    result = pa.DataFrame({
        'PassengerId': test_data['PassengerId'].values,
        'Survived': predict_result.astype(np.int32),
    })
    result.to_csv(output_path, index=False)
    return scores.mean()
# Preprocess both splits, then train the random forest and write predictions.
# The single-argument call form of print works on Python 2 and Python 3.
print("train")
titanic_train = data_proprocess("train.csv")
print("test")
titanic_test = data_proprocess("test.csv")
# Print the mean cross-validation score; as a bare expression it would be
# silently discarded when this runs as a script.
print(classify_RandomForestClassifier(titanic_train, titanic_test))