Python 源码:
#coding=utf-8
import pandas as pd
#-------------data split
from sklearn.cross_validation import train_test_split
#-------------feature transfer
from sklearn.feature_extraction import DictVectorizer
#-------------
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
#-------------
from sklearn.metrics import classification_report
#-------------download data
# Fetch the Titanic passenger table over HTTP; read_csv returns a pandas DataFrame.
titanic = pd.read_csv('http://biostat.mc.vanderbilt.edu/wiki/pub/Main/DataSets/titanic.txt')
# Preview the first rows to check that the columns were parsed as expected.
print(titanic.head())
# info() writes its column/dtype/non-null summary to stdout itself and returns
# None, so it is called directly; wrapping it in print would add a stray "None".
titanic.info()
#-------------feature selection
# Predictors: passenger class, age and sex. Target: the 'survived' column.
# .copy() gives X its own data, so the age imputation below modifies X reliably
# instead of operating on a slice of `titanic` (SettingWithCopyWarning).
X = titanic[['pclass', 'age', 'sex']].copy()
y = titanic['survived']
# info() prints its own summary and returns None, so the label and the summary
# are emitted as two separate statements.
print('bf processing\n')
X.info()
#-------------feature processing
# Impute missing ages with the column mean (mean() skips NaN by default).
# Assigning the filled column back avoids an in-place update through a
# temporary object.
X['age'] = X['age'].fillna(X['age'].mean())
print('af processing\n')
# Call info() (the original printed the bound method itself) so the
# post-imputation non-null counts are actually shown.
X.info()
#-------------data split
#75% training set,25% testing set
# random_state pins the shuffle so the same rows land in each partition on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=33)
#-------------feature transfer from String to int
# DictVectorizer one-hot encodes string-valued features and passes numeric
# values through; sparse=False makes it return a dense array.
vec = DictVectorizer(sparse=False)
# 'records' (one dict per row) is the documented orient value; the abbreviated
# spelling 'record' is rejected by newer pandas releases.
# The feature mapping is learned from the training rows only...
X_train = vec.fit_transform(X_train.to_dict(orient='records'))
print(vec.get_feature_names())
# ...and then re-used unchanged to encode the test rows.
X_test = vec.transform(X_test.to_dict(orient='records'))
#-------------training DT
# Each model is built with default hyper-parameters, fitted on the training
# partition (fit() returns the estimator itself) and then used to label the
# held-out test rows.
dtc = DecisionTreeClassifier().fit(X_train, y_train)
dtc_y_predict = dtc.predict(X_test)
#-------------training RFC
rfc = RandomForestClassifier().fit(X_train, y_train)
rfc_y_predict = rfc.predict(X_test)
#-------------training GBC
gbc = GradientBoostingClassifier().fit(X_train, y_train)
gbc_y_predict = gbc.predict(X_test)
#-------------performance DT
# Display names for the two target classes, shared by all three reports.
# NOTE(review): assumes the 'survived' column encodes 0 = died, 1 = survived -- confirm.
target_labels = ['died', 'survived']
# score() re-predicts on the test set and returns the mean accuracy.
dtc_accuracy = dtc.score(X_test, y_test)
print(' '.join(['The Accuracy DT is', str(dtc_accuracy)]))
print(classification_report(y_test, dtc_y_predict, target_names=target_labels))
#-------------performance RFC
rfc_accuracy = rfc.score(X_test, y_test)
print(' '.join(['The Accuracy RFC is', str(rfc_accuracy)]))
print(classification_report(y_test, rfc_y_predict, target_names=target_labels))
#-------------performance GBC
gbc_accuracy = gbc.score(X_test, y_test)
print(' '.join(['The Accuracy GBC is', str(gbc_accuracy)]))
print(classification_report(y_test, gbc_y_predict, target_names=target_labels))
Result:
The Accuracy DT is 0.781155015198
precision recall f1-score support
died 0.78 0.91 0.84 202
survived 0.80 0.58 0.67 127
avg / total 0.78 0.78 0.77 329
The Accuracy RFC is 0.784194528875
precision recall f1-score support
died 0.78 0.91 0.84 202
survived 0.80 0.58 0.68 127
avg / total 0.79 0.78 0.78 329
The Accuracy GBC is 0.790273556231
precision recall f1-score support
died 0.78 0.92 0.84 202
survived 0.82 0.58 0.68 127
avg / total 0.80 0.79 0.78 329
基线系统通常指使用经典模型搭建的机器学习系统。研发人员每提出一个新模型，都需要在多个具有代表性的数据集上与基线系统进行性能对比测试。随机森林分类模型就经常以基线系统的身份出现在科研论文，甚至公开的数据竞赛中。