与网上的其他内容均一样
import pandas as pd
# Remote location of the Titanic passenger list, parsed below as CSV.
titanic_url = 'http://biostat.mc.vanderbilt.edu/wiki/pub/Main/DataSets/titanic.txt'
titanic = pd.read_csv(titanic_url)
# Preview the first rows to check that the columns were parsed as expected.
titanic.head()
.dataframe thead tr:only-child th { text-align: right; } .dataframe thead th { text-align: left; } .dataframe tbody tr th { vertical-align: top; }
|
row.names |
pclass |
survived |
name |
age |
embarked |
home.dest |
room |
ticket |
boat |
sex |
0 |
1 |
1st |
1 |
Allen, Miss Elisabeth Walton |
29.0000 |
Southampton |
St Louis, MO |
B-5 |
24160 L221 |
2 |
female |
1 |
2 |
1st |
0 |
Allison, Miss Helen Loraine |
2.0000 |
Southampton |
Montreal, PQ / Chesterville, ON |
C26 |
NaN |
NaN |
female |
2 |
3 |
1st |
0 |
Allison, Mr Hudson Joshua Creighton |
30.0000 |
Southampton |
Montreal, PQ / Chesterville, ON |
C26 |
NaN |
(135) |
male |
3 |
4 |
1st |
0 |
Allison, Mrs Hudson J.C. (Bessie Waldo Daniels) |
25.0000 |
Southampton |
Montreal, PQ / Chesterville, ON |
C26 |
NaN |
NaN |
female |
4 |
5 |
1st |
1 |
Allison, Master Hudson Trevor |
0.9167 |
Southampton |
Montreal, PQ / Chesterville, ON |
C22 |
NaN |
11 |
male |
# Summarize the dtypes and non-null counts of the raw data.
titanic.info()

# Select the features and the target. The explicit .copy() makes X an
# independent DataFrame, so modifying it below neither aliases `titanic`
# nor raises SettingWithCopyWarning.
X = titanic[['pclass', 'age', 'sex']].copy()
y = titanic['survived']
X.info()
X.head()

# Impute missing ages with the column mean. The result is assigned back to
# the column instead of calling fillna(inplace=True) on the X['age']
# selection: that chained in-place form operates on a temporary object and
# is not guaranteed to update X (it does not under pandas copy-on-write).
X['age'] = X['age'].fillna(X['age'].mean())
X.info()
RangeIndex: 1313 entries, 0 to 1312
Data columns (total 3 columns):
pclass 1313 non-null object
age 1313 non-null float64
sex 1313 non-null object
dtypes: float64(1), object(2)
memory usage: 30.9+ KB
D:\Program Files\Anaconda35\lib\site-packages\pandas\core\generic.py:3660: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame
See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
self._update_inplace(new_data)
from sklearn.model_selection import train_test_split

# Hold out a quarter of the samples for evaluation; the fixed seed keeps the
# partition identical from run to run.
split_parts = train_test_split(X, y, random_state=33, test_size=0.25)
X_train, X_test, y_train, y_test = split_parts
X_train.head()
X_train.info()
Int64Index: 984 entries, 1086 to 1044
Data columns (total 3 columns):
pclass 984 non-null object
age 984 non-null float64
sex 984 non-null object
dtypes: float64(1), object(2)
memory usage: 30.8+ KB
from sklearn.feature_extraction import DictVectorizer

# DictVectorizer consumes one dict per sample: string-valued features are
# expanded into "name=value" indicator columns, numeric ones are kept as-is.
vec = DictVectorizer()

# orient='records' yields a list with one {column: value} dict per row.
# The full spelling is required: the abbreviated 'record' form was
# deprecated and is rejected by newer pandas releases.
# Fit the vocabulary on the training rows only, then reuse it for the test
# rows so both matrices share the same column layout.
X_train = vec.fit_transform(X_train.to_dict(orient='records'))
X_test = vec.transform(X_test.to_dict(orient='records'))
vec.feature_names_
['age', 'pclass=1st', 'pclass=2nd', 'pclass=3rd', 'sex=female', 'sex=male']
from sklearn.metrics import classification_report
from sklearn.tree import DecisionTreeClassifier

# Train a decision tree with default settings; fit() returns the estimator
# itself, so construction and training can be chained.
dtc = DecisionTreeClassifier().fit(X_train, y_train)
y_predict = dtc.predict(X_test)

# Mean accuracy on the held-out samples.
test_accuracy = dtc.score(X_test, y_test)
print(test_accuracy)
0.781155015198
# classification_report takes the ground truth first and the predictions
# second. Passing them the other way round swaps precision with recall and
# computes the support counts from the predictions instead of the true labels.
print(classification_report(y_test, y_predict, target_names=['died', 'survived']))
precision recall f1-score support
died 0.91 0.78 0.84 236
survived 0.58 0.80 0.67 93
avg / total 0.81 0.78 0.79 329