Pandas数据处理4:高性能计算eval()和query()

import pandas as pd 
import numpy as np
#数据透视表(pivot table)
import seaborn as sns 
# Load the Titanic passenger table through seaborn's dataset loader and
# preview its first five rows.
titanic = sns.load_dataset(name='titanic')
titanic.head(n=5)
survived pclass sex age sibsp parch fare embarked class who adult_male deck embark_town alive alone
0 0 3 male 22.0 1 0 7.2500 S Third man True NaN Southampton no False
1 1 1 female 38.0 1 0 71.2833 C First woman False C Cherbourg yes False
2 1 3 female 26.0 0 0 7.9250 S Third woman False NaN Southampton yes True
3 1 1 female 35.0 1 0 53.1000 S First woman False C Southampton yes False
4 0 3 male 35.0 0 0 8.0500 S Third man True NaN Southampton no True
# Grouping: for example, compute the survival rate for each sex.
titanic.groupby(by='sex')[['survived']].agg('mean')
survived
sex
female 0.742038
male 0.188908
# A more involved case: survival rate broken down by sex and passenger class,
# with the class level moved into the columns.
# NOTE(review): 'class' is presumably a categorical column in seaborn's Titanic
# data - confirm. Recent pandas versions emit a FutureWarning when grouping by
# a categorical without an explicit `observed=`; observed=False is the current
# default, so stating it keeps the result unchanged and silences the warning.
titanic.groupby(['sex', 'class'], observed=False)['survived'].mean().unstack()
class First Second Third
sex
female 0.968085 0.921053 0.500000
male 0.368852 0.157407 0.135447
# DataFrame.pivot_table gives the same table as the groupby/unstack version.
# The aggregation defaults to the mean; it is spelled out here for clarity.
titanic.pivot_table(
    values='survived',
    index='sex',
    columns='class',
    aggfunc='mean',
)
class First Second Third
sex
female 0.968085 0.921053 0.500000
male 0.368852 0.157407 0.135447
# To bring age in as a third dimension, bin it into (0, 18] and (18, 80] with
# pd.cut and use the binned Series as a second index level.
age = pd.cut(titanic['age'], bins=[0, 18, 80])
titanic.pivot_table(
    values='survived',
    index=['sex', age],
    columns='class',
)
class First Second Third
sex age
female (0, 18] 0.909091 1.000000 0.511628
(18, 80] 0.972973 0.900000 0.423729
male (0, 18] 0.800000 0.600000 0.215686
(18, 80] 0.375000 0.071429 0.133663
# pd.qcut splits the fares into two quantile-based bins; the binned Series is
# used as an extra column level above the passenger class.
fare = pd.qcut(titanic['fare'], q=2)
titanic.pivot_table(
    values='survived',
    index=['sex', age],
    columns=[fare, 'class'],
)
fare (-0.001, 14.454] (14.454, 512.329]
class First Second Third First Second Third
sex age
female (0, 18] NaN 1.000000 0.714286 0.909091 1.000000 0.318182
(18, 80] NaN 0.880000 0.444444 0.972973 0.914286 0.391304
male (0, 18] NaN 0.000000 0.260870 0.800000 0.818182 0.178571
(18, 80] 0.0 0.098039 0.125000 0.391304 0.030303 0.192308
# Vectorized string operations: start from a Series of names with
# inconsistent casing.
data = [
    'peter',
    'Paul',
    'MARY',
    'gUIDO',
]
names = pd.Series(data=data)
names
0    peter
1     Paul
2     MARY
3    gUIDO
dtype: object
names.str.capitalize() # Capitalize each string (first letter upper-case, the rest lower-case); missing values are skipped automatically
0    Peter
1     Paul
2     Mary
3    Guido
dtype: object
#由于 Numexpr 在计算代数式时不需要为临时数组分配全部内存,因此计算比 NumPy 更高效,尤其适合处理大型数组。马上要介绍的 Pandas 的 eval() 和 query()工具其实也是基于 Numexpr 实现的。
import pandas as pd 
# Build four large frames of uniform random numbers from one seeded generator
# so the timing comparison that follows works on identical, reproducible data.
nrows = 100000
ncols = 100
rng = np.random.RandomState(42)
df1, df2, df3, df4 = [
    pd.DataFrame(rng.rand(nrows, ncols)) for _ in range(4)
]
print(df1.shape)
(100000, 100)
# NOTE: %timeit is an IPython/Jupyter magic, so these two lines only run in an
# interactive IPython session, not as plain Python.
# First line: ordinary pandas addition. Second line: the same sum passed as a
# string expression to pd.eval().
%timeit df1 + df2 + df3 + df4
%timeit pd.eval('df1 + df2 + df3 + df4')
# In the timings below, the eval() version of the expression is about twice as fast as the plain approach (the author also reports lower memory use), and the result is the same
66.5 ms ± 924 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)
30.3 ms ± 599 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)
 # Small 1000x3 frame of random values with columns A, B and C, drawn from the
 # same seeded generator; preview the first five rows.
 df = pd.DataFrame(data=rng.rand(1000, 3), columns=list('ABC'))
 df.head(n=5)
A B C
0 0.615875 0.525167 0.047354
1 0.330858 0.412879 0.441564
2 0.689047 0.559068 0.230350
3 0.290486 0.695479 0.852587
4 0.424280 0.534344 0.245216
# Column arithmetic with DataFrame.eval()
# The advantage of DataFrame.eval() is that the expression can refer to columns by name
# The same formula is written three ways below:
#   result1 - plain pandas indexing
#   result2 - top-level pd.eval(), which has to spell out df.<column>
#   result3 - DataFrame.eval(), which uses the bare column names
result1 = (df['A'] + df['B']) / (df['C'] - 1) 
result2 = pd.eval("(df.A + df.B) / (df.C - 1)")
result3 = df.eval('(A + B) / (C - 1)') 
print(result3)
# Only result1 and result3 are compared here; result2 is computed but not checked.
np.allclose(result1, result3)
0     -1.197761
1     -1.331822
2     -1.621667
3     -6.688481
4     -1.270064
         ...   
995   -3.349773
996   -2.163240
997   -0.936554
998   -2.263292
999   -3.781258
Length: 1000, dtype: float64





True
# Create a new column with df.eval(): the assignment inside the expression
# string defines column D, and inplace=True applies it to df itself (the
# preview below shows D present on df afterwards).
df.eval('D = (A + B) / C', inplace=True) 
df.head()
A B C D
0 0.615875 0.525167 0.047354 24.095868
1 0.330858 0.412879 0.441564 1.684325
2 0.689047 0.559068 0.230350 5.418335
3 0.290486 0.695479 0.852587 1.156439
4 0.424280 0.534344 0.245216 3.909296
# DataFrame.eval() can also use Python local variables through the @ prefix
# NOTE: df.mean(1) passes axis=1, i.e. one mean per row taken across the
# columns (including the D column added earlier), despite the name column_mean.
column_mean = df.mean(1) 
result1 = df['A'] + column_mean 
# @column_mean refers to the Python variable, while A refers to the column.
result2 = df.eval('A + @column_mean') 
np.allclose(result1, result2)
True
#query
# Row filtering written two ways: a plain pandas boolean mask, then the same
# expression passed as a string to pd.eval(). Both keep the rows where A and B
# are below 0.5.
result1 = df[(df.A < 0.5) & (df.B < 0.5)] 
result2 = pd.eval('df[(df.A < 0.5) & (df.B < 0.5)]') 
np.allclose(result1, result2)
True
# As with DataFrame.eval() shown earlier, this is an expression built from DataFrame columns, but it cannot be written with the DataFrame.eval() syntax, because the desired result is a filtered frame containing all of the DataFrame's columns.
# DataFrame.query() takes the filter as a string using bare column names and `and`.
result2 = df.query('A < 0.5 and B < 0.5') 
# result1 is the boolean-mask result computed in the previous cell.
np.allclose(result1, result2)
True
# query() also supports referencing local variables with the @ prefix
Cmean = df['C'].mean() 
# Plain boolean-mask version, used as the reference result.
result1 = df[(df.A < Cmean) & (df.B < Cmean)] 
# Same filter through query(); @Cmean pulls in the Python variable defined above.
result2 = df.query('A < @Cmean and B < @Cmean')
np.allclose(result1, result2)
True

你可能感兴趣的:(python数据处理手册,数据库,机器学习)