import pandas as pd
import numpy as np
data = pd.DataFrame({
    'x0': [1, 2, 3, 4, 5],
    'x1': [0.01, -0.01, 0.25, -4.1, 0.],
    'x2': [-1.5, 0., 3.6, 1.3, -2.1]
})
data
   x0    x1   x2
0   1  0.01 -1.5
1   2 -0.01  0.0
2   3  0.25  3.6
3   4 -4.10  1.3
4   5  0.00 -2.1
data.columns
Index(['x0', 'x1', 'x2'], dtype='object')
data.values
array([[ 1. , 0.01, -1.5 ],
[ 2. , -0.01, 0. ],
[ 3. , 0.25, 3.6 ],
[ 4. , -4.1 , 1.3 ],
[ 5. , 0. , -2.1 ]])
df2 = pd.DataFrame(data.values, columns=['one', 'two', 'three'])
df2
   one   two  three
0  1.0  0.01   -1.5
1  2.0 -0.01    0.0
2  3.0  0.25    3.6
3  4.0 -4.10    1.3
4  5.0  0.00   -2.1
df3 = data.copy()
df3['strings'] = ['a', 'b', 'c', 'd', 'e']
df3
   x0    x1   x2 strings
0   1  0.01 -1.5       a
1   2 -0.01  0.0       b
2   3  0.25  3.6       c
3   4 -4.10  1.3       d
4   5  0.00 -2.1       e
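Mixing dtypes like this has a cost: .values must now fall back to an ndarray of Python objects rather than a homogeneous numeric array:
df3.values
array([[1, 0.01, -1.5, 'a'],
       [2, -0.01, 0.0, 'b'],
       [3, 0.25, 3.6, 'c'],
       [4, -4.1, 1.3, 'd'],
       [5, 0.0, -2.1, 'e']], dtype=object)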
model_cols = ['x0', 'x1']
data.loc[:, model_cols].values
array([[ 1. , 0.01],
[ 2. , -0.01],
[ 3. , 0.25],
[ 4. , -4.1 ],
[ 5. , 0. ]])
data['category'] = pd.Categorical(['a', 'b', 'a', 'a', 'b'],
                                  categories=['a', 'b'])
data
   x0    x1   x2 category
0   1  0.01 -1.5        a
1   2 -0.01  0.0        b
2   3  0.25  3.6        a
3   4 -4.10  1.3        a
4   5  0.00 -2.1        b
dummies = pd.get_dummies(data.category, prefix='category')
data_with_dummies = data.drop('category', axis=1).join(dummies)
data_with_dummies
   x0    x1   x2  category_a  category_b
0   1  0.01 -1.5           1           0
1   2 -0.01  0.0           0           1
2   3  0.25  3.6           1           0
3   4 -4.10  1.3           1           0
4   5  0.00 -2.1           0           1
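As an aside, when a model will already include an intercept, one dummy column is redundant; pd.get_dummies accepts drop_first=True to omit the first level. A minimal sketch (not run above):
pd.get_dummies(data.category, prefix='category', drop_first=True)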
data = pd.DataFrame({
    'x0': [1, 2, 3, 4, 5],
    'x1': [0.01, -0.01, 0.25, -4.1, 0.],
    'y': [-1.5, 0., 3.6, 1.3, -2.]})
data
   x0    x1    y
0   1  0.01 -1.5
1   2 -0.01  0.0
2   3  0.25  3.6
3   4 -4.10  1.3
4   5  0.00 -2.0
import patsy
y, x = patsy.dmatrices('y ~ x0 + x1', data)
y
DesignMatrix with shape (5, 1)
     y
  -1.5
   0.0
   3.6
   1.3
  -2.0
  Terms:
    'y' (column 0)
x
DesignMatrix with shape (5, 3)
  Intercept  x0     x1
          1   1   0.01
          1   2  -0.01
          1   3   0.25
          1   4  -4.10
          1   5   0.00
  Terms:
    'Intercept' (column 0)
    'x0' (column 1)
    'x1' (column 2)
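These DesignMatrix objects are NumPy ndarray subclasses, so they can be passed straight into numerical routines such as np.linalg.lstsq below; np.asarray strips the patsy metadata when a plain array is needed:
np.asarray(y)
np.asarray(x)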
coef, resid, _, _ = np.linalg.lstsq(x, y, rcond=None)
coef
array([[ 0.31290976],
[-0.07910564],
[-0.26546384]])
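The column names from the design matrix are kept in its design_info attribute and can be reattached to the coefficients:
coef = pd.Series(coef.squeeze(), index=x.design_info.column_names)
coef
Intercept    0.312910
x0          -0.079106
x1          -0.265464
dtype: float64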
y, x = patsy.dmatrices('y ~ x0 + np.log(np.abs(x1) + 1)', data)
x
DesignMatrix with shape (5, 3)
  Intercept  x0  np.log(np.abs(x1) + 1)
          1   1                 0.00995
          1   2                 0.00995
          1   3                 0.22314
          1   4                 1.62924
          1   5                 0.00000
  Terms:
    'Intercept' (column 0)
    'x0' (column 1)
    'np.log(np.abs(x1) + 1)' (column 2)
y, x = patsy.dmatrices('y ~ standardize(x0) + center(x1)', data)
x
DesignMatrix with shape (5, 3)
  Intercept  standardize(x0)  center(x1)
          1         -1.41421        0.78
          1         -0.70711        0.76
          1          0.00000        1.02
          1          0.70711       -3.33
          1          1.41421        0.77
  Terms:
    'Intercept' (column 0)
    'standardize(x0)' (column 1)
    'center(x1)' (column 2)
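standardize and center are stateful transforms: patsy saves the statistics of the original dataset so that the same centering and scaling can be applied to out-of-sample data with patsy.build_design_matrices. A sketch, assuming a hypothetical new_data frame with the same columns:
new_data = pd.DataFrame({
    'x0': [6, 7, 8, 9],
    'x1': [3.1, -0.5, 0, 2.3],
    'y': [1, 2, 3, 4]})
new_x = patsy.build_design_matrices([x.design_info], new_data)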
data = pd.DataFrame({'key1': ['a', 'a', 'b', 'b', 'a', 'b', 'a', 'b'],
                     'key2': [0, 1, 0, 1, 0, 1, 0, 0],
                     'v1': [1, 2, 3, 4, 5, 6, 7, 8],
                     'v2': [-1, 0, 2.5, -0.5, 4.0, -1.2, 0.2, -1.7]})
y, x = patsy.dmatrices('v2 ~ key1', data)
x
DesignMatrix with shape (8, 2)
  Intercept  key1[T.b]
          1          0
          1          0
          1          1
          1          1
          1          0
          1          1
          1          0
          1          1
  Terms:
    'Intercept' (column 0)
    'key1' (column 1)
y, x = patsy.dmatrices('v2 ~ key1 + 0', data)
x
DesignMatrix with shape (8, 2)
  key1[a]  key1[b]
        1        0
        1        0
        0        1
        0        1
        1        0
        0        1
        1        0
        0        1
  Terms:
    'key1' (columns 0:2)
y, x = patsy.dmatrices('v2 ~ C(key2)', data)
x
DesignMatrix with shape (8, 2)
  Intercept  C(key2)[T.1]
          1             0
          1             1
          1             0
          1             1
          1             0
          1             1
          1             0
          1             0
  Terms:
    'Intercept' (column 0)
    'C(key2)' (column 1)
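The dropped (reference) level can also be chosen explicitly: patsy's Treatment contrast takes a reference argument. An illustrative variant, not run above:
y, x = patsy.dmatrices("v2 ~ C(key1, Treatment(reference='b'))", data)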
data['key2'] = data['key2'].map({0: 'zero', 1: 'one'})
data
  key1  key2  v1   v2
0    a  zero   1 -1.0
1    a   one   2  0.0
2    b  zero   3  2.5
3    b   one   4 -0.5
4    a  zero   5  4.0
5    b   one   6 -1.2
6    a  zero   7  0.2
7    b  zero   8 -1.7
y, x = patsy.dmatrices('v2 ~ key1 + key2', data)
x
DesignMatrix with shape (8, 3)
  Intercept  key1[T.b]  key2[T.zero]
          1          0             1
          1          0             0
          1          1             1
          1          1             0
          1          0             1
          1          1             0
          1          0             1
          1          1             1
  Terms:
    'Intercept' (column 0)
    'key1' (column 1)
    'key2' (column 2)
y, x = patsy.dmatrices('v2 ~ key1 + key2 + key1:key2', data)
x
DesignMatrix with shape (8, 4)
  Intercept  key1[T.b]  key2[T.zero]  key1[T.b]:key2[T.zero]
          1          0             1                       0
          1          0             0                       0
          1          1             1                       1
          1          1             0                       0
          1          0             1                       0
          1          1             0                       0
          1          0             1                       0
          1          1             1                       1
  Terms:
    'Intercept' (column 0)
    'key1' (column 1)
    'key2' (column 2)
    'key1:key2' (column 3)
import statsmodels.api as sm
import statsmodels.formula.api as smf
def dnorm(mean, variance, size=1):
    # draw samples from a normal distribution with the given mean and variance
    if isinstance(size, int):
        size = size,
    return mean + np.sqrt(variance) * np.random.randn(*size)
np.random.seed(12345)
N = 100
x = np.c_[dnorm(0, 0.4, size=N),
          dnorm(0, 0.6, size=N),
          dnorm(0, 0.2, size=N)]
eps = dnorm(0, 0.1, size=N)
beta = [0.1, 0.3, 0.5]
y = np.dot(x, beta) + eps
x[:5]
array([[-0.12946849, -1.21275292, 0.50422488],
[ 0.30291036, -0.43574176, -0.25417986],
[-0.32852189, -0.02530153, 0.13835097],
[-0.35147471, -0.71960511, -0.25821463],
[ 1.2432688 , -0.37379916, -0.52262905]])
x_model = sm.add_constant(x)
x_model[:5]
array([[ 1. , -0.12946849, -1.21275292, 0.50422488],
[ 1. , 0.30291036, -0.43574176, -0.25417986],
[ 1. , -0.32852189, -0.02530153, 0.13835097],
[ 1. , -0.35147471, -0.71960511, -0.25821463],
[ 1. , 1.2432688 , -0.37379916, -0.52262905]])
model = sm.OLS(y, x)
results = model.fit()
results.params
array([0.17826108, 0.22303962, 0.50095093])
print(results.summary())
OLS Regression Results
==============================================================================
Dep. Variable: y R-squared: 0.430
Model: OLS Adj. R-squared: 0.413
Method: Least Squares F-statistic: 24.42
Date: Fri, 09 Nov 2018 Prob (F-statistic): 7.44e-12
Time: 11:17:59 Log-Likelihood: -34.305
No. Observations: 100 AIC: 74.61
Df Residuals: 97 BIC: 82.42
Df Model: 3
Covariance Type: nonrobust
==============================================================================
coef std err t P>|t| [0.025 0.975]
------------------------------------------------------------------------------
x1 0.1783 0.053 3.364 0.001 0.073 0.283
x2 0.2230 0.046 4.818 0.000 0.131 0.315
x3 0.5010 0.080 6.237 0.000 0.342 0.660
==============================================================================
Omnibus: 4.662 Durbin-Watson: 2.201
Prob(Omnibus): 0.097 Jarque-Bera (JB): 4.098
Skew: 0.481 Prob(JB): 0.129
Kurtosis: 3.243 Cond. No. 1.74
==============================================================================
Warnings:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
data = pd.DataFrame(x, columns=['col0', 'col1', 'col2'])
data['y'] = y
data[:5]
       col0      col1      col2         y
0 -0.129468 -1.212753  0.504225  0.427863
1  0.302910 -0.435742 -0.254180 -0.673480
2 -0.328522 -0.025302  0.138351 -0.090878
3 -0.351475 -0.719605 -0.258215 -0.489494
4  1.243269 -0.373799 -0.522629 -0.128941
results = smf.ols('y ~ col0 + col1 + col2', data=data).fit()
results.params
Intercept 0.033559
col0 0.176149
col1 0.224826
col2 0.514808
dtype: float64
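The fitted result can also generate predictions; results.predict evaluates the model on new (or, here, the original) data:
results.predict(data[:5])
0   -0.002327
1   -0.141904
2    0.041226
3   -0.323070
4   -0.100535
dtype: float64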
train = pd.read_csv('datasets/titanic/train.csv')
test = pd.read_csv('datasets/titanic/test.csv')
train[:4]
   PassengerId  Survived  Pclass  \
0            1         0       3
1            2         1       1
2            3         1       3
3            4         1       1

                                                Name     Sex   Age  SibSp  \
0                            Braund, Mr. Owen Harris    male  22.0      1
1  Cumings, Mrs. John Bradley (Florence Briggs Th...  female  38.0      1
2                             Heikkinen, Miss. Laina  female  26.0      0
3       Futrelle, Mrs. Jacques Heath (Lily May Peel)  female  35.0      1

   Parch            Ticket     Fare Cabin Embarked
0      0         A/5 21171   7.2500   NaN        S
1      0          PC 17599  71.2833   C85        C
2      0  STON/O2. 3101282   7.9250   NaN        S
3      0            113803  53.1000  C123        S
train.isnull().sum()
PassengerId 0
Survived 0
Pclass 0
Name 0
Sex 0
Age 177
SibSp 0
Parch 0
Ticket 0
Fare 0
Cabin 687
Embarked 2
dtype: int64
test.isnull().sum()
PassengerId 0
Pclass 0
Name 0
Sex 0
Age 86
SibSp 0
Parch 0
Ticket 0
Fare 1
Cabin 327
Embarked 0
dtype: int64
impute_value = train['Age'].median()
train['Age'] = train['Age'].fillna(impute_value)
test['Age'] = test['Age'].fillna(impute_value)
train['IsFemale'] = (train['Sex'] == 'female').astype(int)
test['IsFemale'] = (test['Sex'] == 'female').astype(int)
predictors = ['Pclass', 'IsFemale', 'Age']
X_train = train[predictors].values
X_test = test[predictors].values
Y_train = train['Survived'].values
X_train
array([[ 3., 0., 22.],
[ 1., 1., 38.],
[ 3., 1., 26.],
...,
[ 3., 1., 28.],
[ 1., 0., 26.],
[ 3., 0., 32.]])
Y_train
array([0, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1,
1, 1, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1,
1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 0, 0, 1,
1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 0,
1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 1, 0, 0, 0,
0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0,
0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 1, 0, 1, 1, 1, 1, 0, 0,
1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 1, 0, 0, 0, 1, 1, 0, 1, 0,
1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1,
0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 1, 0, 0,
0, 0, 0, 1, 1, 1, 0, 1, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0,
1, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1,
0, 1, 1, 1, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 1, 1, 0, 1, 0, 1, 1, 1,
1, 0, 0, 0, 1, 0, 0, 1, 1, 0, 1, 1, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0,
0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0,
0, 1, 1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 1, 1, 1, 0, 1, 1, 0,
0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 1, 1,
0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 1, 1, 0, 0, 1, 0, 1, 0, 0,
1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0, 1, 0, 1, 1, 0, 1, 0,
0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1,
1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0,
1, 1, 0, 1, 1, 0, 1, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1, 0,
0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 1, 1, 0, 0, 1, 0, 0, 1, 1, 0, 1,
1, 0, 0, 1, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1,
1, 1, 0, 0, 1, 1, 0, 1, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0,
0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 1,
0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0,
0, 0, 0, 1, 0, 1, 1, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0,
1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1,
0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0,
0, 0, 1, 1, 1, 1, 1, 0, 1, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 1, 0,
1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 1,
0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0,
0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0,
0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 1,
0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 1,
1, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1, 0, 0, 1, 1, 0, 0, 0, 1,
1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0], dtype=int64)
from sklearn.linear_model import LogisticRegression
model = LogisticRegression()
model.fit(X_train, Y_train)
LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
verbose=0, warm_start=False)
y_predict = model.predict(X_test)
y_predict
array([0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0,
1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1,
1, 0, 0, 0, 1, 1, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1,
1, 0, 0, 1, 1, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1,
1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0,
0, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0,
1, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1,
0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1,
1, 0, 1, 1, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1,
0, 1, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0,
1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1,
0, 0, 0, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1,
0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 1, 1, 0,
0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0,
0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0,
1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 1, 0,
0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 0, 1, 0, 0,
1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1,
0, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 1, 0, 0, 0],
dtype=int64)
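If the true test-set outcomes were available (they are not part of these Kaggle files), accuracy would be the fraction of matching predictions; a sketch assuming a hypothetical y_true array:
(y_true == y_predict).mean()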
from sklearn.linear_model import LogisticRegressionCV
model_cv = LogisticRegressionCV(Cs=10)
model_cv.fit(X_train, Y_train)
LogisticRegressionCV(Cs=10, class_weight=None, cv=None, dual=False,
fit_intercept=True, intercept_scaling=1.0, max_iter=100,
multi_class='ovr', n_jobs=1, penalty='l2', random_state=None,
refit=True, scoring=None, solver='lbfgs', tol=0.0001, verbose=0)
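Cross-validation can also be run directly with sklearn.model_selection.cross_val_score; for example, scoring a fixed-penalty model over four non-overlapping splits of the training data:
from sklearn.model_selection import cross_val_score
model = LogisticRegression(C=10)
scores = cross_val_score(model, X_train, Y_train, cv=4)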