import pandas as pd
import numpy as np
import seaborn as sns
# Fetch the 'titanic' example dataset through seaborn and keep it as the
# working DataFrame for the grouping / pivot examples that follow.
titanic = sns.load_dataset(name='titanic')
# Preview the first rows to see which columns are available.
titanic.head()
| | survived | pclass | sex | age | sibsp | parch | fare | embarked | class | who | adult_male | deck | embark_town | alive | alone |
| 0 | 0 | 3 | male | 22.0 | 1 | 0 | 7.2500 | S | Third | man | True | NaN | Southampton | no | False |
| 1 | 1 | 1 | female | 38.0 | 1 | 0 | 71.2833 | C | First | woman | False | C | Cherbourg | yes | False |
| 2 | 1 | 3 | female | 26.0 | 0 | 0 | 7.9250 | S | Third | woman | False | NaN | Southampton | yes | True |
| 3 | 1 | 1 | female | 35.0 | 1 | 0 | 53.1000 | S | First | woman | False | C | Southampton | yes | False |
| 4 | 0 | 3 | male | 35.0 | 0 | 0 | 8.0500 | S | Third | man | True | NaN | Southampton | no | True |
# Survival rate per sex: split the rows on 'sex', keep 'survived' as a
# one-column DataFrame (hence the double brackets), and average each group.
(
    titanic
    .groupby(by='sex')[['survived']]
    .mean()
)
| | survived |
| sex | |
| female | 0.742038 |
| male | 0.188908 |
# Mean survival rate for every (sex, class) pair, with the class level then
# pivoted into columns by unstack().
# `observed=False` is spelled out so the result keeps one entry per category
# level no matter which default pandas applies to categorical group keys; it
# has no effect on keys that are not categorical.
titanic.groupby(['sex', 'class'], observed=False)['survived'].aggregate('mean').unstack()
| class | First | Second | Third |
| sex | | | |
| female | 0.968085 | 0.921053 | 0.500000 |
| male | 0.368852 | 0.157407 | 0.135447 |
# Same sex-by-class table as the groupby/unstack version, written as a pivot
# table (pivot_table aggregates with the mean unless told otherwise).
# `observed=False` pins the handling of categorical keys explicitly instead of
# relying on the library default; it is a no-op for non-categorical keys.
titanic.pivot_table('survived', index='sex', columns='class', observed=False)
| class | First | Second | Third |
| sex | | | |
| female | 0.968085 | 0.921053 | 0.500000 |
| male | 0.368852 | 0.157407 | 0.135447 |
# Bin passenger ages into the two intervals (0, 18] and (18, 80].
# pd.cut returns a categorical Series aligned with `titanic`'s index, so it
# can be used directly as a grouping key below.
age = pd.cut(titanic['age'], bins=[0, 18, 80])

# Mean survival by (sex, age bin) on the rows and class on the columns.
# Arguments are named for readability, and `observed=False` is explicit
# because `age` is categorical: every bin is kept in the result regardless of
# the pandas default for categorical group keys.
titanic.pivot_table(
    values='survived',
    index=['sex', age],
    columns='class',
    observed=False,
)
| | class | First | Second | Third |
| sex | age | | | |
| female | (0, 18] | 0.909091 | 1.000000 | 0.511628 |
| | (18, 80] | 0.972973 | 0.900000 | 0.423729 |
| male | (0, 18] | 0.800000 | 0.600000 | 0.215686 |
| | (18, 80] | 0.375000 | 0.071429 | 0.133663 |
# Split fares into two quantile-based bins (below / above the median fare).
# pd.qcut returns a categorical Series aligned with `titanic`'s index.
fare = pd.qcut(titanic['fare'], q=2)

# Four-dimensional summary: (sex, age bin) on the rows, (fare bin, class) on
# the columns, mean survival in the cells.  `age` is the binned age Series
# built in the earlier cell.
# `observed=False` is explicit because the keys are categorical: combinations
# with no passengers stay in the table as NaN regardless of the pandas default
# for categorical group keys.
titanic.pivot_table(
    values='survived',
    index=['sex', age],
    columns=[fare, 'class'],
    observed=False,
)
| | fare | (-0.001, 14.454] | | | (14.454, 512.329] | | |
| | class | First | Second | Third | First | Second | Third |
| sex | age | | | | | | |
| female | (0, 18] | NaN | 1.000000 | 0.714286 | 0.909091 | 1.000000 | 0.318182 |
| | (18, 80] | NaN | 0.880000 | 0.444444 | 0.972973 | 0.914286 | 0.391304 |
| male | (0, 18] | NaN | 0.000000 | 0.260870 | 0.800000 | 0.818182 | 0.178571 |
| | (18, 80] | 0.0 | 0.098039 | 0.125000 | 0.391304 | 0.030303 | 0.192308 |
# Sample names with deliberately inconsistent capitalisation, wrapped in a
# Series so the vectorised string methods can be applied to them.
data = [
    'peter',
    'Paul',
    'MARY',
    'gUIDO',
]
names = pd.Series(data=data)
names
0 peter
1 Paul
2 MARY
3 gUIDO
dtype: object
# Vectorised string operation: apply capitalize() to every element of the
# Series through the .str accessor, returning a new Series.
names.str.capitalize()
0 Peter
1 Paul
2 Mary
3 Guido
dtype: object
import pandas as pd
# Build four equally shaped frames of uniform random numbers for the timing
# comparison.  All four are drawn in sequence from one seeded generator, so
# the data is reproducible.
nrows = 100_000
ncols = 100
rng = np.random.RandomState(42)
df1, df2, df3, df4 = [
    pd.DataFrame(rng.rand(nrows, ncols))
    for _ in range(4)
]
print(df1.shape)
(100000, 100)
# Time the same four-frame sum two ways: first with ordinary operator
# arithmetic, then with the identical expression passed as a string to
# pd.eval.  %timeit is an IPython magic, so these lines only run inside
# IPython/Jupyter.
%timeit df1 + df2 + df3 + df4
%timeit pd.eval('df1 + df2 + df3 + df4')
66.5 ms ± 924 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)
30.3 ms ± 599 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)
# Small three-column frame of uniform random values (drawn from the seeded
# `rng` created earlier) used for the eval()/query() examples below.
df = pd.DataFrame(
    data=rng.rand(1000, 3),
    columns=['A', 'B', 'C'],
)
df.head()
| | A | B | C |
| 0 | 0.615875 | 0.525167 | 0.047354 |
| 1 | 0.330858 | 0.412879 | 0.441564 |
| 2 | 0.689047 | 0.559068 | 0.230350 |
| 3 | 0.290486 | 0.695479 | 0.852587 |
| 4 | 0.424280 | 0.534344 | 0.245216 |
# Evaluate one expression three ways:
#   result1 - plain pandas arithmetic (the reference),
#   result2 - top-level pd.eval on a string that names `df` explicitly,
#   result3 - DataFrame.eval, where bare names in the string are columns of df.
result1 = (df['A'] + df['B']) / (df['C'] - 1)
result2 = pd.eval("(df.A + df.B) / (df.C - 1)")
result3 = df.eval('(A + B) / (C - 1)')
print(result3)
# Check BOTH string-evaluated results against the reference; previously
# result2 was computed but never compared.
np.allclose(result1, result2) and np.allclose(result1, result3)
0 -1.197761
1 -1.331822
2 -1.621667
3 -6.688481
4 -1.270064
...
995 -3.349773
996 -2.163240
997 -0.936554
998 -2.263292
999 -3.781258
Length: 1000, dtype: float64
True
# Assignment inside eval(): the expression string creates column 'D' from the
# existing columns, and inplace=True writes it into `df` itself rather than
# returning a modified copy.
df.eval(expr='D = (A + B) / C', inplace=True)
# Show the first rows, now including the new 'D' column.
df.head()
| | A | B | C | D |
| 0 | 0.615875 | 0.525167 | 0.047354 | 24.095868 |
| 1 | 0.330858 | 0.412879 | 0.441564 | 1.684325 |
| 2 | 0.689047 | 0.559068 | 0.230350 | 5.418335 |
| 3 | 0.290486 | 0.695479 | 0.852587 | 1.156439 |
| 4 | 0.424280 | 0.534344 | 0.245216 | 3.909296 |
# axis=1 averages across the columns, giving one mean per row of `df`.
column_mean = df.mean(axis=1)
# Reference result with ordinary pandas arithmetic.
result1 = df['A'].add(column_mean)
# Same computation through eval(); the '@' prefix makes the expression read
# the Python variable `column_mean` instead of looking for a column of that
# name.
result2 = df.eval(expr='A + @column_mean')
np.allclose(result1, result2)
True
# Rows where both A and B are below 0.5, selected with a boolean mask ...
result1 = df[(df['A'] < 0.5) & (df['B'] < 0.5)]
# ... and the same selection expressed as a string for pd.eval.
result2 = pd.eval(expr='df[(df.A < 0.5) & (df.B < 0.5)]')
# The two selections should contain the same values.
np.allclose(result1, result2)
True
# The same filter written with DataFrame.query(), where column names are used
# bare and the conditions are joined with the word 'and'.
result2 = df.query(expr='A < 0.5 and B < 0.5')
# Compare against `result1`, the mask-based selection from the previous cell.
np.allclose(result1, result2)
True
# Scalar threshold: the mean of column C.
Cmean = df.C.mean()
# Reference selection with a boolean mask: rows where A and B are both below
# that threshold.
result1 = df[(df['A'] < Cmean) & (df['B'] < Cmean)]
# Same selection through query(); '@Cmean' refers to the Python variable
# rather than to a column.
result2 = df.query(expr='A < @Cmean and B < @Cmean')
np.allclose(result1, result2)
True