from dfply import data
diamonds = data.diamonds
import pandas as pd
print(diamonds.loc[:,['cut','price','x']].head(3))
print('------------------------------------------')
print(diamonds[['cut','price','x']].head(3))
print(diamonds.iloc[:,[1,6,7]].head(3))
cut price x
0 Ideal 326 3.95
1 Premium 326 3.89
2 Good 327 4.05
------------------------------------------
cut price x
0 Ideal 326 3.95
1 Premium 326 3.89
2 Good 327 4.05
cut price x
0 Ideal 326 3.95
1 Premium 326 3.89
2 Good 327 4.05
diamonds.iloc[[10,15],:].head(3)
|
carat |
cut |
color |
clarity |
depth |
table |
price |
x |
y |
z |
10 |
0.30 |
Good |
J |
SI1 |
64.0 |
55.0 |
339 |
4.25 |
4.28 |
2.73 |
15 |
0.32 |
Premium |
E |
I1 |
60.9 |
58.0 |
345 |
4.38 |
4.42 |
2.68 |
print(diamonds.iloc[[10,15],:].head(3))
carat cut color clarity depth table price x y z
10 0.30 Good J SI1 64.0 55.0 339 4.25 4.28 2.73
15 0.32 Premium E I1 60.9 58.0 345 4.38 4.42 2.68
from dfply import *
diamonds >> drop(1, X.price, ['x', 'y']) >> head(2)
|
carat |
color |
clarity |
depth |
table |
z |
0 |
0.23 |
E |
SI2 |
61.5 |
55.0 |
2.43 |
1 |
0.21 |
E |
SI1 |
59.8 |
61.0 |
2.31 |
(diamonds >>
select(X.carat, X.cut, X.color) >>
drop(X.cut) >>
head(3))
|
carat |
color |
0 |
0.23 |
E |
1 |
0.21 |
E |
2 |
0.23 |
E |
diamonds.head()
|
carat |
cut |
color |
clarity |
depth |
table |
price |
x |
y |
z |
0 |
0.23 |
Ideal |
E |
SI2 |
61.5 |
55.0 |
326 |
3.95 |
3.98 |
2.43 |
1 |
0.21 |
Premium |
E |
SI1 |
59.8 |
61.0 |
326 |
3.89 |
3.84 |
2.31 |
2 |
0.23 |
Good |
E |
VS1 |
56.9 |
65.0 |
327 |
4.05 |
4.07 |
2.31 |
3 |
0.29 |
Premium |
I |
VS2 |
62.4 |
58.0 |
334 |
4.20 |
4.23 |
2.63 |
4 |
0.31 |
Good |
J |
SI2 |
63.3 |
58.0 |
335 |
4.34 |
4.35 |
2.75 |
diamonds >> drop(1, X.price, ['x', 'y']) >> head(5)
|
carat |
color |
clarity |
depth |
table |
z |
0 |
0.23 |
E |
SI2 |
61.5 |
55.0 |
2.43 |
1 |
0.21 |
E |
SI1 |
59.8 |
61.0 |
2.31 |
2 |
0.23 |
E |
VS1 |
56.9 |
65.0 |
2.31 |
3 |
0.29 |
I |
VS2 |
62.4 |
58.0 |
2.63 |
4 |
0.31 |
J |
SI2 |
63.3 |
58.0 |
2.75 |
(diamonds >>
select(X.carat, X.cut, X.color) >>
select(~X.cut) >>
head(3))
|
carat |
color |
0 |
0.23 |
E |
1 |
0.21 |
E |
2 |
0.23 |
E |
df = (diamonds >>
select(X.carat, X.cut, X.price) >>
mask(X.carat > 0.21, X.cut=='Fair', X.price < 563))
df
|
carat |
cut |
price |
8 |
0.22 |
Fair |
337 |
10030 |
0.30 |
Fair |
416 |
28270 |
0.25 |
Fair |
361 |
31611 |
0.23 |
Fair |
369 |
31615 |
0.27 |
Fair |
371 |
40591 |
0.30 |
Fair |
496 |
40597 |
0.34 |
Fair |
497 |
46961 |
0.37 |
Fair |
527 |
48630 |
0.30 |
Fair |
536 |
df = (diamonds >>
select(X.carat, X.cut, X.price) >>
mask(X.carat > 0.21, X.cut=='Fair', X.price < 563) >>
arrange(X.carat,X.price))
df
|
carat |
cut |
price |
8 |
0.22 |
Fair |
337 |
31611 |
0.23 |
Fair |
369 |
28270 |
0.25 |
Fair |
361 |
31615 |
0.27 |
Fair |
371 |
10030 |
0.30 |
Fair |
416 |
40591 |
0.30 |
Fair |
496 |
48630 |
0.30 |
Fair |
536 |
40597 |
0.34 |
Fair |
497 |
46961 |
0.37 |
Fair |
527 |
(diamonds >>
select(starts_with('c')) >>
head(2))
|
carat |
cut |
color |
clarity |
0 |
0.23 |
Ideal |
E |
SI2 |
1 |
0.21 |
Premium |
E |
SI1 |
diamonds >> select(~starts_with('c')) >> head(2)
|
depth |
table |
price |
x |
y |
z |
0 |
61.5 |
55.0 |
326 |
3.95 |
3.98 |
2.43 |
1 |
59.8 |
61.0 |
326 |
3.89 |
3.84 |
2.31 |
(diamonds >>
select(contains('c')) >>
head(2))
|
carat |
cut |
color |
clarity |
price |
0 |
0.23 |
Ideal |
E |
SI2 |
326 |
1 |
0.21 |
Premium |
E |
SI1 |
326 |
diamonds >>head(2)
|
carat |
cut |
color |
clarity |
depth |
table |
price |
x |
y |
z |
0 |
0.23 |
Ideal |
E |
SI2 |
61.5 |
55.0 |
326 |
3.95 |
3.98 |
2.43 |
1 |
0.21 |
Premium |
E |
SI1 |
59.8 |
61.0 |
326 |
3.89 |
3.84 |
2.31 |
diamonds >> select(columns_to(X.price)) >> head(2)
|
carat |
cut |
color |
clarity |
depth |
table |
0 |
0.23 |
Ideal |
E |
SI2 |
61.5 |
55.0 |
1 |
0.21 |
Premium |
E |
SI1 |
59.8 |
61.0 |
diamonds >> select(columns_from(X.table)) >> head(2)
|
table |
price |
x |
y |
z |
0 |
55.0 |
326 |
3.95 |
3.98 |
2.43 |
1 |
61.0 |
326 |
3.89 |
3.84 |
2.31 |
diamonds >> select(columns_between(X.carat,X.table)) >> head(2)
|
carat |
cut |
color |
clarity |
depth |
table |
0 |
0.23 |
Ideal |
E |
SI2 |
61.5 |
55.0 |
1 |
0.21 |
Premium |
E |
SI1 |
59.8 |
61.0 |
diamonds >> row_slice([10,15])
|
carat |
cut |
color |
clarity |
depth |
table |
price |
x |
y |
z |
10 |
0.30 |
Good |
J |
SI1 |
64.0 |
55.0 |
339 |
4.25 |
4.28 |
2.73 |
15 |
0.32 |
Premium |
E |
I1 |
60.9 |
58.0 |
345 |
4.38 |
4.42 |
2.68 |
diamonds >> sample(frac=0.0001, replace=False)
|
carat |
cut |
color |
clarity |
depth |
table |
price |
x |
y |
z |
11849 |
1.04 |
Premium |
H |
VVS1 |
60.4 |
58.0 |
5102 |
6.58 |
6.53 |
3.96 |
22288 |
1.66 |
Ideal |
F |
SI2 |
62.6 |
59.0 |
10338 |
7.53 |
7.57 |
4.73 |
13016 |
1.20 |
Very Good |
I |
SI1 |
62.2 |
55.0 |
5408 |
6.75 |
6.92 |
4.25 |
2710 |
0.31 |
Ideal |
G |
SI1 |
61.6 |
55.0 |
563 |
4.34 |
4.37 |
2.68 |
25448 |
1.50 |
Premium |
F |
VS2 |
61.1 |
58.0 |
14199 |
7.36 |
7.33 |
4.49 |
diamonds >> sample(n=3, replace=True)
|
carat |
cut |
color |
clarity |
depth |
table |
price |
x |
y |
z |
13775 |
1.70 |
Fair |
D |
I1 |
64.7 |
56.0 |
5617 |
7.46 |
7.37 |
4.80 |
6043 |
0.32 |
Ideal |
E |
SI2 |
62.0 |
55.0 |
576 |
4.41 |
4.36 |
2.72 |
33208 |
0.33 |
Ideal |
H |
IF |
61.6 |
55.0 |
820 |
4.44 |
4.47 |
2.74 |
diamonds >> distinct(X.color)
|
carat |
cut |
color |
clarity |
depth |
table |
price |
x |
y |
z |
0 |
0.23 |
Ideal |
E |
SI2 |
61.5 |
55.0 |
326 |
3.95 |
3.98 |
2.43 |
3 |
0.29 |
Premium |
I |
VS2 |
62.4 |
58.0 |
334 |
4.20 |
4.23 |
2.63 |
4 |
0.31 |
Good |
J |
SI2 |
63.3 |
58.0 |
335 |
4.34 |
4.35 |
2.75 |
7 |
0.26 |
Very Good |
H |
SI1 |
61.9 |
55.0 |
337 |
4.07 |
4.11 |
2.53 |
12 |
0.22 |
Premium |
F |
SI1 |
60.4 |
61.0 |
342 |
3.88 |
3.84 |
2.33 |
25 |
0.23 |
Very Good |
G |
VVS2 |
60.4 |
58.0 |
354 |
3.97 |
4.01 |
2.41 |
28 |
0.23 |
Very Good |
D |
VS2 |
60.5 |
61.0 |
357 |
3.96 |
3.97 |
2.40 |
(diamonds
>> filter_by(X.cut == 'Ideal', X.color == 'E', X.table < 55, X.price < 500)
>> pull('carat'))
c:\users\administer\appdata\local\programs\python\python36-32\lib\site-packages\dfply\subset.py:87: FutureWarning:
.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing
See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#ix-indexer-is-deprecated
return df.ix[:, column]
26683 0.33
32297 0.34
40928 0.30
50623 0.30
50625 0.30
Name: carat, dtype: float64
diamonds >> mutate(x_plus_y=X.x + X.y) >> select(columns_from('x')) >> head(3)
|
x |
y |
z |
x_plus_y |
0 |
3.95 |
3.98 |
2.43 |
7.93 |
1 |
3.89 |
3.84 |
2.31 |
7.73 |
2 |
4.05 |
4.07 |
2.31 |
8.12 |
diamonds >> mutate(x_plus_y=X.x + X.y, y_div_z=(X.y / X.z)) >> select(columns_from('x')) >> head(3)
|
x |
y |
z |
x_plus_y |
y_div_z |
0 |
3.95 |
3.98 |
2.43 |
7.93 |
1.637860 |
1 |
3.89 |
3.84 |
2.31 |
7.73 |
1.662338 |
2 |
4.05 |
4.07 |
2.31 |
8.12 |
1.761905 |
diamonds >> transmute(x_plus_y=X.x + X.y, y_div_z=(X.y / X.z)) >> head(3)
|
x_plus_y |
y_div_z |
0 |
7.93 |
1.637860 |
1 |
7.73 |
1.662338 |
2 |
8.12 |
1.761905 |
diamonds >> arrange(X.table, ascending=False) >> head(5)
|
carat |
cut |
color |
clarity |
depth |
table |
price |
x |
y |
z |
24932 |
2.01 |
Fair |
F |
SI1 |
58.6 |
95.0 |
13387 |
8.32 |
8.31 |
4.87 |
50773 |
0.81 |
Fair |
F |
SI2 |
68.8 |
79.0 |
2301 |
5.26 |
5.20 |
3.58 |
51342 |
0.79 |
Fair |
G |
SI1 |
65.3 |
76.0 |
2362 |
5.52 |
5.13 |
3.35 |
52860 |
0.50 |
Fair |
E |
VS2 |
79.0 |
73.0 |
2579 |
5.21 |
5.18 |
4.09 |
49375 |
0.70 |
Fair |
H |
VS1 |
62.0 |
73.0 |
2100 |
5.65 |
5.54 |
3.47 |
(diamonds >> group_by(X.cut) >> arrange(X.price) >>
head(3) >> ungroup() >> mask(X.carat < 0.23))
|
carat |
cut |
color |
clarity |
depth |
table |
price |
x |
y |
z |
8 |
0.22 |
Fair |
E |
VS2 |
65.1 |
61.0 |
337 |
3.87 |
3.78 |
2.49 |
1 |
0.21 |
Premium |
E |
SI1 |
59.8 |
61.0 |
326 |
3.89 |
3.84 |
2.31 |
12 |
0.22 |
Premium |
F |
SI1 |
60.4 |
61.0 |
342 |
3.88 |
3.84 |
2.33 |
diamonds >> rename(CUT=X.cut, COLOR='color') >> head(2)
|
carat |
CUT |
COLOR |
clarity |
depth |
table |
price |
x |
y |
z |
0 |
0.23 |
Ideal |
E |
SI2 |
61.5 |
55.0 |
326 |
3.95 |
3.98 |
2.43 |
1 |
0.21 |
Premium |
E |
SI1 |
59.8 |
61.0 |
326 |
3.89 |
3.84 |
2.31 |
diamonds >> gather('variable', 'value', ['price', 'depth','x','y','z']) >> head(5)
|
carat |
cut |
color |
clarity |
table |
variable |
value |
0 |
0.23 |
Ideal |
E |
SI2 |
55.0 |
price |
326.0 |
1 |
0.21 |
Premium |
E |
SI1 |
61.0 |
price |
326.0 |
2 |
0.23 |
Good |
E |
VS1 |
65.0 |
price |
327.0 |
3 |
0.29 |
Premium |
I |
VS2 |
58.0 |
price |
334.0 |
4 |
0.31 |
Good |
J |
SI2 |
58.0 |
price |
335.0 |
diamonds >> gather('variable', 'value') >> head(5)
|
variable |
value |
0 |
carat |
0.23 |
1 |
carat |
0.21 |
2 |
carat |
0.23 |
3 |
carat |
0.29 |
4 |
carat |
0.31 |
,,,
print(d)
a b c
0 1 a True
1 2 b False
2 3 c NaN
d >> unite('united', X.a, 'b', 2, remove=False, na_action='maintain')
a b c united
0 1 a True 1_a_True
1 2 b False 2_b_False
2 3 c NaN NaN
d >> unite('united', ['a','b','c'], remove=True, na_action='ignore', sep='*')
united
0 1*a*True
1 2*b*False
2 3*c
d >> unite('united', d.columns, remove=True, na_action='as_string')
united
0 1_a_True
1 2_b_False
2 3_c_nan
''''''
File "", line 4
a b c
^
SyntaxError: invalid syntax
a = pd.DataFrame({
'x1':['A','B','C'],
'x2':[1,2,3]
})
b = pd.DataFrame({
'x1':['A','B','D'],
'x3':[True,False,True]
})
a
b
|
x1 |
x3 |
0 |
A |
True |
1 |
B |
False |
2 |
D |
True |
a >> inner_join(b, by='x1')
|
x1 |
x2 |
x3 |
0 |
A |
1 |
True |
1 |
B |
2 |
False |
a >> outer_join(b, by='x1')
|
x1 |
x2 |
x3 |
0 |
A |
1.0 |
True |
1 |
B |
2.0 |
False |
2 |
C |
3.0 |
NaN |
3 |
D |
NaN |
True |
a >> left_join(b, by='x1')
|
x1 |
x2 |
x3 |
0 |
A |
1 |
True |
1 |
B |
2 |
False |
2 |
C |
3 |
NaN |
a >> right_join(b, by='x1')
|
x1 |
x2 |
x3 |
0 |
A |
1.0 |
True |
1 |
B |
2.0 |
False |
2 |
D |
NaN |
True |
a >> semi_join(b, by='x1')
a >> anti_join(b, by='x1')
a >> bind_rows(b, join='inner')
|
x1 |
0 |
A |
1 |
B |
2 |
C |
0 |
A |
1 |
B |
2 |
D |
a >> bind_rows(b, join='outer')
c:\users\administer\appdata\local\programs\python\python36-32\lib\site-packages\dfply\join.py:279: FutureWarning: Sorting because non-concatenation axis is not aligned. A future version
of pandas will change to not sort by default.
To accept the future behavior, pass 'sort=False'.
To retain the current behavior and silence the warning, pass 'sort=True'.
df = pd.concat([df, other], join=join, ignore_index=ignore_index, axis=0)
|
x1 |
x2 |
x3 |
0 |
A |
1.0 |
NaN |
1 |
B |
2.0 |
NaN |
2 |
C |
3.0 |
NaN |
0 |
A |
NaN |
True |
1 |
B |
NaN |
False |
2 |
D |
NaN |
True |
a >> bind_cols(b)
|
x1 |
x2 |
x1 |
x3 |
0 |
A |
1 |
A |
True |
1 |
B |
2 |
B |
False |
2 |
C |
3 |
D |
True |
diamonds >> summarize(price_mean=X.price.mean(), price_std=X.price.std())
|
price_mean |
price_std |
0 |
3932.799722 |
3989.439738 |
diamonds >> group_by('cut') >> summarize(price_mean=X.price.mean(), price_std=X.price.std())
|
cut |
price_mean |
price_std |
0 |
Fair |
4358.757764 |
3560.386612 |
1 |
Good |
3928.864452 |
3681.589584 |
2 |
Ideal |
3457.541970 |
3808.401172 |
3 |
Premium |
4584.257704 |
4349.204961 |
4 |
Very Good |
3981.759891 |
3935.862161 |
diamonds >> summarize_each([np.mean, np.var], X.price, 'depth')
|
price_mean |
price_var |
depth_mean |
depth_var |
0 |
3932.799722 |
1.591533e+07 |
61.749405 |
2.052366 |
diamonds >> group_by(X.cut) >> summarize_each([np.mean, np.var], X.price, 4)
|
cut |
price_mean |
price_var |
depth_mean |
depth_var |
0 |
Fair |
4358.757764 |
1.266848e+07 |
64.041677 |
13.266319 |
1 |
Good |
3928.864452 |
1.355134e+07 |
62.365879 |
4.705224 |
2 |
Ideal |
3457.541970 |
1.450325e+07 |
61.709401 |
0.516274 |
3 |
Premium |
4584.257704 |
1.891421e+07 |
61.264673 |
1.342755 |
4 |
Very Good |
3981.759891 |
1.548973e+07 |
61.818275 |
1.900466 |
(diamonds >> mutate(price_lead=lead(X.price, 2), price_lag=lag(X.price, 2)) >>
select(X.price, -2, -1) >>
head(6))
|
price |
price_lead |
price_lag |
0 |
326 |
327.0 |
NaN |
1 |
326 |
334.0 |
NaN |
2 |
327 |
335.0 |
326.0 |
3 |
334 |
336.0 |
326.0 |
4 |
335 |
336.0 |
327.0 |
5 |
336 |
337.0 |
334.0 |
diamonds >> select(X.price) >> mutate(price_btwn=between(X.price, 330, 340)) >> head(6)
|
price |
price_btwn |
0 |
326 |
False |
1 |
326 |
False |
2 |
327 |
False |
3 |
334 |
True |
4 |
335 |
True |
5 |
336 |
True |
diamonds >> select(X.price) >> mutate(price_drank=dense_rank(X.price)) >> head(6)
|
price |
price_drank |
0 |
326 |
1.0 |
1 |
326 |
1.0 |
2 |
327 |
2.0 |
3 |
334 |
3.0 |
4 |
335 |
4.0 |
5 |
336 |
5.0 |
diamonds >> select(X.price) >> mutate(price_mrank=min_rank(X.price)) >> head(6)
|
price |
price_mrank |
0 |
326 |
1.0 |
1 |
326 |
1.0 |
2 |
327 |
3.0 |
3 |
334 |
4.0 |
4 |
335 |
5.0 |
5 |
336 |
6.0 |
diamonds >> select(X.price) >> mutate(price_cumsum=cumsum(X.price)) >> head(6)
|
price |
price_cumsum |
0 |
326 |
326 |
1 |
326 |
652 |
2 |
327 |
979 |
3 |
334 |
1313 |
4 |
335 |
1648 |
5 |
336 |
1984 |
diamonds >> select(X.price) >> mutate(price_cummean=cummean(X.price)) >> head(6)
|
price |
price_cummean |
0 |
326 |
326.000000 |
1 |
326 |
326.000000 |
2 |
327 |
326.333333 |
3 |
334 |
328.250000 |
4 |
335 |
329.600000 |
5 |
336 |
330.666667 |
diamonds >> select(X.price) >> mutate(price_cummax=cummax(X.price)) >> head(6)
|
price |
price_cummax |
0 |
326 |
326.0 |
1 |
326 |
326.0 |
2 |
327 |
327.0 |
3 |
334 |
334.0 |
4 |
335 |
335.0 |
5 |
336 |
336.0 |
diamonds >> select(X.price) >> mutate(price_cummin=cummin(X.price)) >> head(6)
|
price |
price_cummin |
0 |
326 |
326.0 |
1 |
326 |
326.0 |
2 |
327 |
326.0 |
3 |
334 |
326.0 |
4 |
335 |
326.0 |
5 |
336 |
326.0 |