import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.linear_model import LogisticRegression
df = pd.read_csv('/Users/gaoliang/Documents/Kaggle/titanic/train.csv')
df.head()
| | PassengerId | Survived | Pclass | Name | Sex | Age | SibSp | Parch | Ticket | Fare | Cabin | Embarked |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1 | 0 | 3 | Braund, Mr. Owen Harris | male | 22.0 | 1 | 0 | A/5 21171 | 7.2500 | NaN | S |
1 | 2 | 1 | 1 | Cumings, Mrs. John Bradley (Florence Briggs Th... | female | 38.0 | 1 | 0 | PC 17599 | 71.2833 | C85 | C |
2 | 3 | 1 | 3 | Heikkinen, Miss. Laina | female | 26.0 | 0 | 0 | STON/O2. 3101282 | 7.9250 | NaN | S |
3 | 4 | 1 | 1 | Futrelle, Mrs. Jacques Heath (Lily May Peel) | female | 35.0 | 1 | 0 | 113803 | 53.1000 | C123 | S |
4 | 5 | 0 | 3 | Allen, Mr. William Henry | male | 35.0 | 0 | 0 | 373450 | 8.0500 | NaN | S |
df.isna().sum()
PassengerId 0
Survived 0
Pclass 0
Name 0
Sex 0
Age 177
SibSp 0
Parch 0
Ticket 0
Fare 0
Cabin 687
Embarked 2
dtype: int64
# Check whether the dataset is balanced by counting the unique values of the target variable:
df.Survived.value_counts()
0 549
1 342
Name: Survived, dtype: int64
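For reference, the same check can be viewed as proportions: since 549 of the 891 passengers did not survive, a trivial model that predicts 0 for everyone would already reach about 61.6% accuracy, which is a useful baseline to keep in mind. A minimal sketch:
# class proportions (also the accuracy of a majority-class baseline)
df.Survived.value_counts(normalize=True)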
df.info()
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 PassengerId 891 non-null int64
1 Survived 891 non-null int64
2 Pclass 891 non-null int64
3 Name 891 non-null object
4 Sex 891 non-null object
5 Age 714 non-null float64
6 SibSp 891 non-null int64
7 Parch 891 non-null int64
8 Ticket 891 non-null object
9 Fare 891 non-null float64
10 Cabin 204 non-null object
11 Embarked 889 non-null object
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB
df.shape
(891, 12)
# We do not need column PassengerId, so we drop it as follows:
df.drop(columns = ['PassengerId'], inplace = True)
# Note: by default DataFrame.drop() does not change the original data; it returns a new copy,
# which is why we either pass inplace=True (as above) or reassign the result:
# df = df.drop(columns = ['PassengerId'])
df.head()
| | Survived | Pclass | Name | Sex | Age | SibSp | Parch | Ticket | Fare | Cabin | Embarked |
|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0 | 3 | Braund, Mr. Owen Harris | male | 22.0 | 1 | 0 | A/5 21171 | 7.2500 | NaN | S |
1 | 1 | 1 | Cumings, Mrs. John Bradley (Florence Briggs Th... | female | 38.0 | 1 | 0 | PC 17599 | 71.2833 | C85 | C |
2 | 1 | 3 | Heikkinen, Miss. Laina | female | 26.0 | 0 | 0 | STON/O2. 3101282 | 7.9250 | NaN | S |
3 | 1 | 1 | Futrelle, Mrs. Jacques Heath (Lily May Peel) | female | 35.0 | 1 | 0 | 113803 | 53.1000 | C123 | S |
4 | 0 | 3 | Allen, Mr. William Henry | male | 35.0 | 0 | 0 | 373450 | 8.0500 | NaN | S |
df.drop(columns = ['Cabin','Ticket','Name'],inplace = True)
# change Fare to Price
df.rename(columns = {'Fare':'Price'},inplace = True)
df.head()
| | Survived | Pclass | Sex | Age | SibSp | Parch | Price | Embarked |
|---|---|---|---|---|---|---|---|---|
0 | 0 | 3 | male | 22.0 | 1 | 0 | 7.2500 | S |
1 | 1 | 1 | female | 38.0 | 1 | 0 | 71.2833 | C |
2 | 1 | 3 | female | 26.0 | 0 | 0 | 7.9250 | S |
3 | 1 | 1 | female | 35.0 | 1 | 0 | 53.1000 | S |
4 | 0 | 3 | male | 35.0 | 0 | 0 | 8.0500 | S |
# Let's plot a histogram of Price (formerly Fare):
df.Price.hist(bins=100)
![Histogram of Price](output_11_1.png)
# Next, we need to encode the values of column 'Sex' as numbers (female -> 0, male -> 1).
# Solution 1: boolean indexing with .loc
df.loc[df['Sex'] == 'female','Sex'] = 0
df.loc[df['Sex'] == 'male','Sex'] = 1
df.head(5)
| | Survived | Pclass | Sex | Age | SibSp | Parch | Price | Embarked |
|---|---|---|---|---|---|---|---|---|
0 | 0 | 3 | 1 | 22.0 | 1 | 0 | 7.2500 | S |
1 | 1 | 1 | 0 | 38.0 | 1 | 0 | 71.2833 | C |
2 | 1 | 3 | 0 | 26.0 | 0 | 0 | 7.9250 | S |
3 | 1 | 1 | 0 | 35.0 | 1 | 0 | 53.1000 | S |
4 | 0 | 3 | 1 | 35.0 | 0 | 0 | 8.0500 | S |
# Solution 2: a named function applied with .apply()
df2 = df.copy()
def Sex2Num(Sex_String):
    if Sex_String == 'female':
        return 0
    elif Sex_String == 'male':
        return 1
    else:
        return Sex_String
df2['Sex'] = df2['Sex'].apply(Sex2Num)
df2.head(3)
| | Survived | Pclass | Sex | Age | SibSp | Parch | Price | Embarked |
|---|---|---|---|---|---|---|---|---|
0 | 0 | 3 | 1 | 22.0 | 1 | 0 | 7.2500 | S |
1 | 1 | 1 | 0 | 38.0 | 1 | 0 | 71.2833 | C |
2 | 1 | 3 | 0 | 26.0 | 0 | 0 | 7.9250 | S |
# Solution 3: a lambda applied with .apply()
df3 = df.copy()
df3['Sex'] = df3['Sex'].apply(lambda x:0 if x == 'female' else 1 if x == 'male' else x)
df3.head(3)
| | Survived | Pclass | Sex | Age | SibSp | Parch | Price | Embarked |
|---|---|---|---|---|---|---|---|---|
0 | 0 | 3 | 1 | 22.0 | 1 | 0 | 7.2500 | S |
1 | 1 | 1 | 0 | 38.0 | 1 | 0 | 71.2833 | C |
2 | 1 | 3 | 0 | 26.0 | 0 | 0 | 7.9250 | S |
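For completeness, Series.map() with a dictionary is another common idiom for this kind of recoding. Unlike Solutions 2 and 3, it turns any value missing from the dictionary into NaN, so it must be applied to the original string column; since df has already been encoded by Solution 1, the sketch below works on a freshly loaded copy (the name df_raw is mine):
# a sketch on a freshly loaded copy; df itself was already encoded by Solution 1 above
df_raw = pd.read_csv('/Users/gaoliang/Documents/Kaggle/titanic/train.csv')
df_raw['Sex'] = df_raw['Sex'].map({'female': 0, 'male': 1})
df_raw['Sex'].head(3)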
pandas.get_dummies() allows us to convert a categorical variable with k possible values into k new binary variables, called dummy variables. This conversion is also called one-hot encoding in computer science. Below, we convert column Embarked into dummies.
df = pd.get_dummies(df, columns = ['Embarked'])
df.head(10)
| | Survived | Pclass | Sex | Age | SibSp | Parch | Price | Embarked_C | Embarked_Q | Embarked_S |
|---|---|---|---|---|---|---|---|---|---|---|
0 | 0 | 3 | 1 | 22.0 | 1 | 0 | 7.2500 | 0 | 0 | 1 |
1 | 1 | 1 | 0 | 38.0 | 1 | 0 | 71.2833 | 1 | 0 | 0 |
2 | 1 | 3 | 0 | 26.0 | 0 | 0 | 7.9250 | 0 | 0 | 1 |
3 | 1 | 1 | 0 | 35.0 | 1 | 0 | 53.1000 | 0 | 0 | 1 |
4 | 0 | 3 | 1 | 35.0 | 0 | 0 | 8.0500 | 0 | 0 | 1 |
5 | 0 | 3 | 1 | NaN | 0 | 0 | 8.4583 | 0 | 1 | 0 |
6 | 0 | 1 | 1 | 54.0 | 0 | 0 | 51.8625 | 0 | 0 | 1 |
7 | 0 | 3 | 1 | 2.0 | 3 | 1 | 21.0750 | 0 | 0 | 1 |
8 | 1 | 3 | 0 | 27.0 | 0 | 2 | 11.1333 | 0 | 0 | 1 |
9 | 1 | 2 | 0 | 14.0 | 1 | 0 | 30.0708 | 1 | 0 | 0 |
We then need to drop one of the created dummies to avoid the multicollinearity problem. Let’s drop the most frequent one, Embarked_S.
df.drop(columns = 'Embarked_S', inplace = True)
df.head()
| | Survived | Pclass | Sex | Age | SibSp | Parch | Price | Embarked_C | Embarked_Q |
|---|---|---|---|---|---|---|---|---|---|
0 | 0 | 3 | 1 | 22.0 | 1 | 0 | 7.2500 | 0 | 0 |
1 | 1 | 1 | 0 | 38.0 | 1 | 0 | 71.2833 | 1 | 0 |
2 | 1 | 3 | 0 | 26.0 | 0 | 0 | 7.9250 | 0 | 0 |
3 | 1 | 1 | 0 | 35.0 | 1 | 0 | 53.1000 | 0 | 0 |
4 | 0 | 3 | 1 | 35.0 | 0 | 0 | 8.0500 | 0 | 0 |
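As an aside, pd.get_dummies() can also drop a dummy for you via drop_first=True. Note that it drops the first category alphabetically (here that would be Embarked_C), not necessarily the most frequent one, which is why we dropped Embarked_S explicitly above. A sketch of the one-step version:
# one-step alternative (would drop Embarked_C instead of Embarked_S)
# df = pd.get_dummies(df, columns = ['Embarked'], drop_first = True)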
Suppose we want to move column Pclass to after column Parch. We can do so in two ways: with DataFrame.reindex(columns=[the columns in the order that you want]), or by simply selecting the columns in the new order, as done below (a reindex sketch follows the output).
df = df[['Survived','Sex','Age','SibSp','Parch','Pclass','Price','Embarked_C','Embarked_Q']]
# hint: we can use df.columns.to_list() to first produce the old column order, then copy & edit
df.head()
| | Survived | Sex | Age | SibSp | Parch | Pclass | Price | Embarked_C | Embarked_Q |
|---|---|---|---|---|---|---|---|---|---|
0 | 0 | 1 | 22.0 | 1 | 0 | 3 | 7.2500 | 0 | 0 |
1 | 1 | 0 | 38.0 | 1 | 0 | 1 | 71.2833 | 1 | 0 |
2 | 1 | 0 | 26.0 | 0 | 0 | 3 | 7.9250 | 0 | 0 |
3 | 1 | 0 | 35.0 | 1 | 0 | 1 | 53.1000 | 0 | 0 |
4 | 0 | 1 | 35.0 | 0 | 0 | 3 | 8.0500 | 0 | 0 |
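The reindex route mentioned above produces the same ordering. A sketch (shown commented out, since df has already been reordered):
# df = df.reindex(columns = ['Survived','Sex','Age','SibSp','Parch','Pclass','Price','Embarked_C','Embarked_Q'])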
# We need to separate the features from the label, since scikit-learn takes them as separate inputs.
# Separate the data into the feature matrix and the target array
X = df.drop(columns=['Survived'])
y = df['Survived']
# Next, split train and test
X_train, X_test, y_train, y_test = train_test_split(X, y,
test_size=0.2, # reserve 20% data for testing
random_state=365)
# (Not required in our class) The following avoids the well-known SettingWithCopyWarning
# that pandas may raise when we later assign to the DataFrame slices returned by train_test_split():
# X_train = X_train.copy()
# X_test = X_test.copy()
print(X_train.shape)
print(X_test.shape)
(712, 8)
(179, 8)
# Any missing data?
df.isna().sum()
Survived 0
Pclass 0
Sex 0
Age 177
SibSp 0
Parch 0
Price 0
Embarked_C 0
Embarked_Q 0
dtype: int64
# This dataset has missing values in column Age, which we need to impute first
X_train_Age_mean = X_train['Age'].mean()
X_train['Age'] = X_train['Age'].fillna(X_train_Age_mean)
# Verify that there's no more missing values:
X_train.Age.isna().sum()
0
# Important: apply exactly the same data wrangling to the test dataset, using the mean computed from the training data (never from the test data)!
X_test['Age'] = X_test['Age'].fillna(X_train_Age_mean)
# Note that df itself still shows the missing Age values; we only imputed the X_train/X_test copies
df.isna().sum()
Survived 0
Pclass 0
Sex 0
Age 177
SibSp 0
Parch 0
Price 0
Embarked_C 0
Embarked_Q 0
dtype: int64
X_test.Age.isna().sum()
0
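For reference, scikit-learn's SimpleImputer does the same mean imputation and stores the training mean for you, which makes the fit-on-train, apply-to-test discipline explicit. A minimal sketch under the same split (not part of the original lecture; Age is already filled here, so it is shown only for the pattern):
from sklearn.impute import SimpleImputer
imputer = SimpleImputer(strategy='mean')                      # learns the column mean during fit
X_train[['Age']] = imputer.fit_transform(X_train[['Age']])    # fit on the training data only
X_test[['Age']] = imputer.transform(X_test[['Age']])          # reuse the training mean on the test data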
# Let's try logistic regression as the learning algorithm
# First, load the package
from sklearn.linear_model import LogisticRegression
# Next, set the hyperparameters of this classifier
clf_lr = LogisticRegression(
    penalty='none',   # turn off regularization, which scikit-learn applies (L2) by default; we study it later
                      # (newer scikit-learn versions spell this option as penalty=None)
    max_iter=1000)    # the solver did not converge within the default 100 iterations
# Next, fit (a.k.a. train) this model over the train dataset
clf_lr.fit(X_train,y_train)
LogisticRegression(max_iter=1000, penalty='none')
# Run this code cell to observe the coefficients of the trained model:
coef_lr = pd.DataFrame(clf_lr.coef_[0],index=X_train.columns,columns=['coefficient'])
coef_lr.transpose()
| | Pclass | Sex | Age | SibSp | Parch | Price | Embarked_C | Embarked_Q |
|---|---|---|---|---|---|---|---|---|
coefficient | -0.995441 | -2.602657 | -0.032009 | -0.346223 | -0.059504 | 0.003378 | 0.246601 | 0.346033 |
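Because logistic regression models the log-odds, exponentiating a coefficient gives the multiplicative change in the odds of survival for a one-unit increase in that feature. A small sketch using the coef_lr table above:
# odds ratios: e.g. exp(-2.60) ≈ 0.07, i.e. being male multiplies the odds of survival by about 0.07
np.exp(coef_lr).transpose()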
One weakness of the scikit-learn package, compared to R packages, is that it focuses more on prediction and less on complete statistical reporting.
For example, LogisticRegression does not report p-values. If you need them, try the statsmodels package as follows:
import statsmodels.api as sm
logit_model=sm.Logit(y_train.astype(float),sm.add_constant(X_train.astype(float)))
result=logit_model.fit()
print(result.summary())
Optimization terminated successfully.
Current function value: 0.452236
Iterations 6
Logit Regression Results
==============================================================================
Dep. Variable: Survived No. Observations: 712
Model: Logit Df Residuals: 703
Method: MLE Df Model: 8
Date: Tue, 11 Oct 2022 Pseudo R-squ.: 0.3193
Time: 11:24:58 Log-Likelihood: -321.99
converged: True LL-Null: -473.03
Covariance Type: nonrobust LLR p-value: 1.491e-60
==============================================================================
coef std err z P>|z| [0.025 0.975]
------------------------------------------------------------------------------
const 4.2980 0.577 7.446 0.000 3.167 5.429
Pclass -0.9946 0.157 -6.337 0.000 -1.302 -0.687
Sex -2.6030 0.219 -11.902 0.000 -3.032 -2.174
Age -0.0320 0.008 -3.761 0.000 -0.049 -0.015
SibSp -0.3462 0.123 -2.818 0.005 -0.587 -0.105
Parch -0.0598 0.139 -0.431 0.666 -0.331 0.212
Price 0.0034 0.003 1.247 0.213 -0.002 0.009
Embarked_C 0.2466 0.260 0.947 0.344 -0.264 0.757
Embarked_Q 0.3443 0.376 0.915 0.360 -0.393 1.082
==============================================================================
# Now back to LogisticRegression in scikit-learn. Let's evaluate the performance of the
# trained model. To do so, we first use the trained model to predict the test dataset.
y_predict = clf_lr.predict(X_test)
# Then, compare the predicted values with the truth to get accuracy
from sklearn.metrics import accuracy_score
accuracy_score(y_test, y_predict).round(4)
0.8156
# Observe the confusion matrix
from sklearn.metrics import confusion_matrix
print(confusion_matrix(y_test, y_predict))
[[96 12]
[21 50]]
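Accuracy alone hides the asymmetry visible in the confusion matrix (21 survivors are missed versus 12 non-survivors misclassified). scikit-learn's classification_report adds per-class precision and recall; a sketch:
from sklearn.metrics import classification_report
print(classification_report(y_test, y_predict))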
When it comes to creating a trained algorithm (a.k.a. a trained model, or simply a model), there are many possible choices.
There is no free lunch: no single choice dominates all the others. Otherwise, we would not see so many of them in today's analytics practice.
In this lecture, we try a few popular learning algorithms and discuss their pros and cons. We leave the topic of hyperparameter tuning, as well as the state-of-the-art boosting-based algorithms (which always require hyperparameter tuning), to the next lecture.
# A template for implementing various supervised learning algorithms
# I assume that, prior to running this code, we have already pre-processed the data
# Load the learning algorithm
from sklearn.linear_model import LogisticRegression
# Set the hyperparameters of this algorithm
clf = LogisticRegression(penalty='none', max_iter=1000)
# Fit the model over the train data
clf.fit(X_train,y_train)
# Use the fitted model to predict the test data
y_predict = clf.predict(X_test)
# Obtain performance metrics
accuracy = accuracy_score(y_test, y_predict).round(4)
print(f"The accuracy is: {accuracy:.2%}")
print("The confusion matrix is:")
cm = confusion_matrix(y_test, y_predict)
print(cm)
# Save the model and the performance metrics for later comparison.
# Here I use suffix "lr" because we just tried logistic regression.
# Change the suffix when you switch to a new learning algorithm!
clf_lr = clf
accuracy_lr = accuracy
cm_lr = cm
The accuracy is: 81.56%
The confusion matrix is:
[[96 12]
[21 50]]
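Since the same fit/predict/evaluate steps repeat for every algorithm below, the template can also be wrapped in a small helper function. A sketch, assuming X_train, X_test, y_train and y_test are already prepared (the function name is mine, not part of the lecture):
def fit_and_evaluate(clf):
    # fit on the training split, then report accuracy and the confusion matrix on the test split
    clf.fit(X_train, y_train)
    y_predict = clf.predict(X_test)
    accuracy = accuracy_score(y_test, y_predict).round(4)
    cm = confusion_matrix(y_test, y_predict)
    print(f"The accuracy is: {accuracy:.2%}")
    print("The confusion matrix is:")
    print(cm)
    return clf, accuracy, cm

# example: clf_lr, accuracy_lr, cm_lr = fit_and_evaluate(LogisticRegression(penalty='none', max_iter=1000))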
# k-Nearest Neighbors (kNN)
from sklearn.neighbors import KNeighborsClassifier
clf = KNeighborsClassifier(n_neighbors=3)
clf.fit(X_train,y_train)
y_predict = clf.predict(X_test)
accuracy = accuracy_score(y_test, y_predict).round(4)
print(f"The accuracy is: {accuracy:.2%}")
print("The confusion matrix is:")
cm = confusion_matrix(y_test, y_predict)
print(cm)
# save the results for later comparison
clf_knn = clf
accuracy_knn = accuracy
cm_knn = cm
The accuracy is: 70.95%
The confusion matrix is:
[[92 16]
[36 35]]
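One likely reason kNN lags behind here is that its distance computation is dominated by features with wide ranges (Price varies far more than the 0/1 dummies). Standardizing the features before fitting usually helps; a sketch, not part of the original lecture:
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
# scale every feature to zero mean and unit variance before the distance computation
clf_knn_scaled = make_pipeline(StandardScaler(), KNeighborsClassifier(n_neighbors=3))
clf_knn_scaled.fit(X_train, y_train)
accuracy_score(y_test, clf_knn_scaled.predict(X_test)).round(4)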
# Decision Trees
from sklearn.tree import DecisionTreeClassifier
clf = DecisionTreeClassifier(max_depth=2)
clf.fit(X_train,y_train)
y_predict = clf.predict(X_test)
accuracy = accuracy_score(y_test, y_predict).round(4)
print(f"The accuracy is: {accuracy:.2%}")
print("The confusion matrix is:")
cm = confusion_matrix(y_test, y_predict)
print(cm)
# save the results for later comparison
clf_dt = clf
accuracy_dt = accuracy
cm_dt = cm
The accuracy is: 77.65%
The confusion matrix is:
[[104 4]
[ 36 35]]
One advantage of decision tree learning is that the trained model is often intuitive to human beings. Therefore, despite its often inferior predictive performance, especially on large and complicated datasets, analysts use it a lot in practice to understand the data and to communicate with others. Let's plot the tree we just trained.
from sklearn import tree
import matplotlib.pyplot as plt
# warning: if the tree is too big to read, limit the max_depth of the tree during training
plt.figure(figsize=(15,10)) # set plot size (denoted in inches)
tree.plot_tree(clf_dt,
feature_names=X_train.columns,
filled = True,
fontsize=12)
plt.show()
![The trained decision tree](output_47_0.png)
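If the rendered figure is hard to read, the same tree can also be printed as indented text with sklearn.tree.export_text; a sketch:
from sklearn.tree import export_text
print(export_text(clf_dt, feature_names=list(X_train.columns)))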
from sklearn.ensemble import RandomForestClassifier
clf = RandomForestClassifier(max_depth=2, random_state=0)
clf.fit(X_train,y_train)
y_predict = clf.predict(X_test)
accuracy = accuracy_score(y_test, y_predict).round(4)
print(f"The accuracy is: {accuracy:.2%}")
print("The confusion matrix is:")
cm = confusion_matrix(y_test, y_predict)
print(cm)
# save the results for later comparison
clf_rf = clf
accuracy_rf = accuracy
cm_rf = cm
The accuracy is: 79.33%
The confusion matrix is:
[[107 1]
[ 36 35]]
A handy feature of RandomForestClassifier is that it provides a robust ranking of the relative importance of all input variables.
importances = clf_rf.feature_importances_
pd.Series(importances, index=X_train.columns).sort_values(ascending=False)
Sex 0.322660
Price 0.215141
Pclass 0.201329
Age 0.122027
SibSp 0.070367
Parch 0.031325
Embarked_C 0.028627
Embarked_Q 0.008524
dtype: float64
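With the models and metrics saved under different suffixes, a quick side-by-side comparison of the four classifiers is straightforward. A sketch using the variables defined above:
pd.Series({'Logistic regression': accuracy_lr,
           'kNN': accuracy_knn,
           'Decision tree': accuracy_dt,
           'Random forest': accuracy_rf},
          name='test accuracy').sort_values(ascending=False)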
The scikit-learn package
The scikit-learn package contains a large selection of traditional supervised learning algorithms, with excellent documentation and coding examples.
I expect you to be able to use the learning algorithms covered in this lecture: logistic regression, k-nearest neighbors (kNN), decision trees, and random forests.
(Models NOT required for this course) It is a good idea for you to at least read a bit about the following learning algorithms:
You should also be familiar with the concept of regularization that is commonly used in machine learning (see Part 4 of this lecture).
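For reference ahead of Part 4: in LogisticRegression, regularization is controlled by the penalty and C arguments, where C is the inverse of the regularization strength (smaller C means a stronger penalty). A minimal sketch, not something you are expected to tune yet:
# L2-regularized logistic regression; compare with the penalty='none' model used above
clf_l2 = LogisticRegression(penalty='l2', C=1.0, max_iter=1000)
clf_l2.fit(X_train, y_train)
accuracy_score(y_test, clf_l2.predict(X_test)).round(4)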