dfply

import pandas as pd
from dfply import data

# The diamonds sample dataset that ships with dfply.
diamonds = data.diamonds

# Plain-pandas baseline: the same three columns picked three different ways.

# 1) label-based selection through .loc
print(diamonds.loc[:, ['cut', 'price', 'x']].head(3))

print('------------------------------------------')

# 2) indexing with a list of column names
print(diamonds[['cut', 'price', 'x']].head(3))

# 3) position-based selection through .iloc (positions 1, 6 and 7)
print(diamonds.iloc[:, [1, 6, 7]].head(3))
       cut  price     x
0    Ideal    326  3.95
1  Premium    326  3.89
2     Good    327  4.05
------------------------------------------
       cut  price     x
0    Ideal    326  3.95
1  Premium    326  3.89
2     Good    327  4.05
       cut  price     x
0    Ideal    326  3.95
1  Premium    326  3.89
2     Good    327  4.05
diamonds.iloc[[10,15],:].head(3)
carat cut color clarity depth table price x y z
10 0.30 Good J SI1 64.0 55.0 339 4.25 4.28 2.73
15 0.32 Premium E I1 60.9 58.0 345 4.38 4.42 2.68
print(diamonds.iloc[[10,15],:].head(3))
    carat      cut color clarity  depth  table  price     x     y     z
10   0.30     Good     J     SI1   64.0   55.0    339  4.25  4.28  2.73
15   0.32  Premium     E      I1   60.9   58.0    345  4.38  4.42  2.68
from dfply import *

diamonds >> drop(1, X.price, ['x', 'y']) >> head(2)
carat color clarity depth table z
0 0.23 E SI2 61.5 55.0 2.43
1 0.21 E SI1 59.8 61.0 2.31
# 'diamonds' is the original pandas DataFrame: keep three columns,
# then drop 'cut' again, leaving 'carat' and 'color'.
(
    diamonds
    >> select(X.carat, X.cut, X.color)
    >> drop(X.cut)
    >> head(3)
)
carat color
0 0.23 E
1 0.21 E
2 0.23 E
diamonds.head()
carat cut color clarity depth table price x y z
0 0.23 Ideal E SI2 61.5 55.0 326 3.95 3.98 2.43
1 0.21 Premium E SI1 59.8 61.0 326 3.89 3.84 2.31
2 0.23 Good E VS1 56.9 65.0 327 4.05 4.07 2.31
3 0.29 Premium I VS2 62.4 58.0 334 4.20 4.23 2.63
4 0.31 Good J SI2 63.3 58.0 335 4.34 4.35 2.75
diamonds >> drop(1, X.price, ['x', 'y']) >> head(5)
carat color clarity depth table z
0 0.23 E SI2 61.5 55.0 2.43
1 0.21 E SI1 59.8 61.0 2.31
2 0.23 E VS1 56.9 65.0 2.31
3 0.29 I VS2 62.4 58.0 2.63
4 0.31 J SI2 63.3 58.0 2.75
# Same result as drop(X.cut): inside select(), ~X.cut means "everything except cut".
(
    diamonds
    >> select(X.carat, X.cut, X.color)
    >> select(~X.cut)
    >> head(3)
)

carat color
0 0.23 E
1 0.21 E
2 0.23 E
# Keep three columns, then keep only rows that satisfy all three conditions.
df = (
    diamonds
    >> select(X.carat, X.cut, X.price)
    >> mask(
        X.carat > 0.21,
        X.cut == 'Fair',
        X.price < 563,
    )
)
df
carat cut price
8 0.22 Fair 337
10030 0.30 Fair 416
28270 0.25 Fair 361
31611 0.23 Fair 369
31615 0.27 Fair 371
40591 0.30 Fair 496
40597 0.34 Fair 497
46961 0.37 Fair 527
48630 0.30 Fair 536
# Same filter as before, then sort by carat and break ties by price.
df = (
    diamonds
    >> select(X.carat, X.cut, X.price)
    >> mask(
        X.carat > 0.21,
        X.cut == 'Fair',
        X.price < 563,
    )
    >> arrange(X.carat, X.price)
)
df
carat cut price
8 0.22 Fair 337
31611 0.23 Fair 369
28270 0.25 Fair 361
31615 0.27 Fair 371
10030 0.30 Fair 416
40591 0.30 Fair 496
48630 0.30 Fair 536
40597 0.34 Fair 497
46961 0.37 Fair 527
# starts_with('c'): keep the columns whose name begins with 'c'.
(
    diamonds
    >> select(starts_with('c'))
    >> head(2)
)
carat cut color clarity
0 0.23 Ideal E SI2
1 0.21 Premium E SI1
diamonds >> select(~starts_with('c')) >> head(2)
depth table price x y z
0 61.5 55.0 326 3.95 3.98 2.43
1 59.8 61.0 326 3.89 3.84 2.31
# contains('c'): keep the columns whose name has a 'c' anywhere in it.
(
    diamonds
    >> select(contains('c'))
    >> head(2)
)
carat cut color clarity price
0 0.23 Ideal E SI2 326
1 0.21 Premium E SI1 326
diamonds >>head(2)
carat cut color clarity depth table price x y z
0 0.23 Ideal E SI2 61.5 55.0 326 3.95 3.98 2.43
1 0.21 Premium E SI1 59.8 61.0 326 3.89 3.84 2.31
# columns_to(X.price): keep every column that comes before 'price' ('price' itself is excluded).
(
    diamonds
    >> select(columns_to(X.price))
    >> head(2)
)
carat cut color clarity depth table
0 0.23 Ideal E SI2 61.5 55.0
1 0.21 Premium E SI1 59.8 61.0
diamonds >> select(columns_from(X.table)) >> head(2)
table price x y z
0 55.0 326 3.95 3.98 2.43
1 61.0 326 3.89 3.84 2.31
diamonds >> select(columns_between(X.carat,X.table)) >> head(2)
carat cut color clarity depth table
0 0.23 Ideal E SI2 61.5 55.0
1 0.21 Premium E SI1 59.8 61.0
diamonds >> row_slice([10,15])#row_slice()函数选择切片行
carat cut color clarity depth table price x y z
10 0.30 Good J SI1 64.0 55.0 339 4.25 4.28 2.73
15 0.32 Premium E I1 60.9 58.0 345 4.38 4.42 2.68
diamonds >> sample(frac=0.0001, replace=False)
carat cut color clarity depth table price x y z
11849 1.04 Premium H VVS1 60.4 58.0 5102 6.58 6.53 3.96
22288 1.66 Ideal F SI2 62.6 59.0 10338 7.53 7.57 4.73
13016 1.20 Very Good I SI1 62.2 55.0 5408 6.75 6.92 4.25
2710 0.31 Ideal G SI1 61.6 55.0 563 4.34 4.37 2.68
25448 1.50 Premium F VS2 61.1 58.0 14199 7.36 7.33 4.49
diamonds >> sample(n=3, replace=True)
carat cut color clarity depth table price x y z
13775 1.70 Fair D I1 64.7 56.0 5617 7.46 7.37 4.80
6043 0.32 Ideal E SI2 62.0 55.0 576 4.41 4.36 2.72
33208 0.33 Ideal H IF 61.6 55.0 820 4.44 4.47 2.74
diamonds >> distinct(X.color)
carat cut color clarity depth table price x y z
0 0.23 Ideal E SI2 61.5 55.0 326 3.95 3.98 2.43
3 0.29 Premium I VS2 62.4 58.0 334 4.20 4.23 2.63
4 0.31 Good J SI2 63.3 58.0 335 4.34 4.35 2.75
7 0.26 Very Good H SI1 61.9 55.0 337 4.07 4.11 2.53
12 0.22 Premium F SI1 60.4 61.0 342 3.88 3.84 2.33
25 0.23 Very Good G VVS2 60.4 58.0 354 3.97 4.01 2.41
28 0.23 Very Good D VS2 60.5 61.0 357 3.96 3.97 2.40
# Filter rows on four conditions, then pull out the 'carat' column as a Series.
(
    diamonds
    >> filter_by(
        X.cut == 'Ideal',
        X.color == 'E',
        X.table < 55,
        X.price < 500,
    )
    >> pull('carat')
)
c:\users\administer\appdata\local\programs\python\python36-32\lib\site-packages\dfply\subset.py:87: FutureWarning: 
.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#ix-indexer-is-deprecated
  return df.ix[:, column]





26683    0.33
32297    0.34
40928    0.30
50623    0.30
50625    0.30
Name: carat, dtype: float64
diamonds >> mutate(x_plus_y=X.x + X.y) >> select(columns_from('x')) >> head(3)
x y z x_plus_y
0 3.95 3.98 2.43 7.93
1 3.89 3.84 2.31 7.73
2 4.05 4.07 2.31 8.12
diamonds >> mutate(x_plus_y=X.x + X.y, y_div_z=(X.y / X.z)) >> select(columns_from('x')) >> head(3)
x y z x_plus_y y_div_z
0 3.95 3.98 2.43 7.93 1.637860
1 3.89 3.84 2.31 7.73 1.662338
2 4.05 4.07 2.31 8.12 1.761905
diamonds >> transmute(x_plus_y=X.x + X.y, y_div_z=(X.y / X.z)) >> head(3)
x_plus_y y_div_z
0 7.93 1.637860
1 7.73 1.662338
2 8.12 1.761905
diamonds >> arrange(X.table, ascending=False) >> head(5)
carat cut color clarity depth table price x y z
24932 2.01 Fair F SI1 58.6 95.0 13387 8.32 8.31 4.87
50773 0.81 Fair F SI2 68.8 79.0 2301 5.26 5.20 3.58
51342 0.79 Fair G SI1 65.3 76.0 2362 5.52 5.13 3.35
52860 0.50 Fair E VS2 79.0 73.0 2579 5.21 5.18 4.09
49375 0.70 Fair H VS1 62.0 73.0 2100 5.65 5.54 3.47
# Group by cut, sort by price, take head(3) while still grouped, then ungroup
# and keep only the rows with carat below 0.23.
(
    diamonds
    >> group_by(X.cut)
    >> arrange(X.price)
    >> head(3)
    >> ungroup()
    >> mask(X.carat < 0.23)
)
carat cut color clarity depth table price x y z
8 0.22 Fair E VS2 65.1 61.0 337 3.87 3.78 2.49
1 0.21 Premium E SI1 59.8 61.0 326 3.89 3.84 2.31
12 0.22 Premium F SI1 60.4 61.0 342 3.88 3.84 2.33
diamonds >> rename(CUT=X.cut, COLOR='color') >> head(2)
carat CUT COLOR clarity depth table price x y z
0 0.23 Ideal E SI2 61.5 55.0 326 3.95 3.98 2.43
1 0.21 Premium E SI1 59.8 61.0 326 3.89 3.84 2.31
diamonds >> gather('variable', 'value', ['price', 'depth','x','y','z']) >> head(5)
#gather()函数将DataFrame中的指定列融合为两个键 :variable和value。
carat cut color clarity table variable value
0 0.23 Ideal E SI2 55.0 price 326.0
1 0.21 Premium E SI1 61.0 price 326.0
2 0.23 Good E VS1 65.0 price 327.0
3 0.29 Premium I VS2 58.0 price 334.0
4 0.31 Good J SI2 58.0 price 335.0
diamonds >> gather('variable', 'value') >> head(5)
variable value
0 carat 0.23
1 carat 0.21
2 carat 0.23
3 carat 0.29
4 carat 0.31
# unite(): paste several columns together into a single string column.
#
# NOTE: this cell originally wrapped the whole example in `,,,` ... `''''''`,
# which is not a valid Python string/comment delimiter, so running it failed
# with "SyntaxError: invalid syntax" on the printed table header. It also used
# `d` without defining it. The example is rewritten here as runnable code, with
# the expected output kept as comments.
import numpy as np
import pandas as pd

# Demo frame rebuilt from the printed table; column 'c' holds one missing value.
d = pd.DataFrame({
    'a': [1, 2, 3],
    'b': ['a', 'b', 'c'],
    'c': [True, False, np.nan],
})
print(d)
#    a  b      c
# 0  1  a   True
# 1  2  b  False
# 2  3  c    NaN

# Columns may be given as X.a, by name, or by position (2 -> 'c').
# na_action='maintain': a row with a missing input gets NaN in the united column.
print(d >> unite('united', X.a, 'b', 2, remove=False, na_action='maintain'))
#    a  b      c     united
# 0  1  a   True   1_a_True
# 1  2  b  False  2_b_False
# 2  3  c    NaN        NaN

# na_action='ignore': missing values are skipped; sep sets the separator.
print(d >> unite('united', ['a', 'b', 'c'], remove=True, na_action='ignore', sep='*'))
#       united
# 0   1*a*True
# 1  2*b*False
# 2        3*c

# na_action='as_string': missing values are written out as the text 'nan'.
print(d >> unite('united', d.columns, remove=True, na_action='as_string'))
#       united
# 0   1_a_True
# 1  2_b_False
# 2    3_c_nan
# Two small frames that share the key column 'x1'; 'C' exists only in a, 'D' only in b.
a = pd.DataFrame({'x1': list('ABC'), 'x2': [1, 2, 3]})
b = pd.DataFrame({'x1': list('ABD'), 'x3': [True, False, True]})
a
x1 x2
0 A 1
1 B 2
2 C 3
b
x1 x3
0 A True
1 B False
2 D True
a >> inner_join(b, by='x1')

x1 x2 x3
0 A 1 True
1 B 2 False
a >> outer_join(b, by='x1')

x1 x2 x3
0 A 1.0 True
1 B 2.0 False
2 C 3.0 NaN
3 D NaN True
a >> left_join(b, by='x1')
x1 x2 x3
0 A 1 True
1 B 2 False
2 C 3 NaN
a >> right_join(b, by='x1')
x1 x2 x3
0 A 1.0 True
1 B 2.0 False
2 D NaN True
a >> semi_join(b, by='x1')
#returns all of the rows in the left DataFrame that have a match in the right DataFrame in the by columns.

x1 x2
0 A 1
1 B 2
a >> anti_join(b, by='x1')
x1 x2
2 C 3
a >> bind_rows(b, join='inner')
x1
0 A
1 B
2 C
0 A
1 B
2 D
a >> bind_rows(b, join='outer')
c:\users\administer\appdata\local\programs\python\python36-32\lib\site-packages\dfply\join.py:279: FutureWarning: Sorting because non-concatenation axis is not aligned. A future version
of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.

To retain the current behavior and silence the warning, pass 'sort=True'.

  df = pd.concat([df, other], join=join, ignore_index=ignore_index, axis=0)
x1 x2 x3
0 A 1.0 NaN
1 B 2.0 NaN
2 C 3.0 NaN
0 A NaN True
1 B NaN False
2 D NaN True
a >> bind_cols(b)
x1 x2 x1 x3
0 A 1 A True
1 B 2 B False
2 C 3 D True
diamonds >> summarize(price_mean=X.price.mean(), price_std=X.price.std())
price_mean price_std
0 3932.799722 3989.439738
diamonds >> group_by('cut') >> summarize(price_mean=X.price.mean(), price_std=X.price.std())
#summarize() 可以很好的搭配group_by函数
cut price_mean price_std
0 Fair 4358.757764 3560.386612
1 Good 3928.864452 3681.589584
2 Ideal 3457.541970 3808.401172
3 Premium 4584.257704 4349.204961
4 Very Good 3981.759891 3935.862161
#summarize_each(function_list, *columns)更常用,它可以比较方便的计算数据框中的列的一些属性。
diamonds >> summarize_each([np.mean, np.var], X.price, 'depth')

price_mean price_var depth_mean depth_var
0 3932.799722 1.591533e+07 61.749405 2.052366
diamonds >> group_by(X.cut) >> summarize_each([np.mean, np.var], X.price, 4)
cut price_mean price_var depth_mean depth_var
0 Fair 4358.757764 1.266848e+07 64.041677 13.266319
1 Good 3928.864452 1.355134e+07 62.365879 4.705224
2 Ideal 3457.541970 1.450325e+07 61.709401 0.516274
3 Premium 4584.257704 1.891421e+07 61.264673 1.342755
4 Very Good 3981.759891 1.548973e+07 61.818275 1.900466
# lead(series, n) shifts the values up by n and pads the end with NaN;
# lag(series, n) shifts them down by n and pads the start with NaN.
(
    diamonds
    >> mutate(price_lead=lead(X.price, 2), price_lag=lag(X.price, 2))
    >> select(X.price, -2, -1)  # price plus the last two columns (the ones just added)
    >> head(6)
)
price price_lead price_lag
0 326 327.0 NaN
1 326 334.0 NaN
2 327 335.0 326.0
3 334 336.0 326.0
4 335 336.0 327.0
5 336 337.0 334.0
diamonds >> select(X.price) >> mutate(price_btwn=between(X.price, 330, 340)) >> head(6)
price price_btwn
0 326 False
1 326 False
2 327 False
3 334 True
4 335 True
5 336 True
#dense_rank() 函数
#计算重复的排序值或者秩
diamonds >> select(X.price) >> mutate(price_drank=dense_rank(X.price)) >> head(6)

price price_drank
0 326 1.0
1 326 1.0
2 327 2.0
3 334 3.0
4 335 4.0
5 336 5.0
diamonds >> select(X.price) >> mutate(price_mrank=min_rank(X.price)) >> head(6)
price price_mrank
0 326 1.0
1 326 1.0
2 327 3.0
3 334 4.0
4 335 5.0
5 336 6.0
diamonds >> select(X.price) >> mutate(price_cumsum=cumsum(X.price)) >> head(6)
price price_cumsum
0 326 326
1 326 652
2 327 979
3 334 1313
4 335 1648
5 336 1984
diamonds >> select(X.price) >> mutate(price_cummean=cummean(X.price)) >> head(6)
#cummean() 函数计算列的累积平均值。
price price_cummean
0 326 326.000000
1 326 326.000000
2 327 326.333333
3 334 328.250000
4 335 329.600000
5 336 330.666667
diamonds >> select(X.price) >> mutate(price_cummax=cummax(X.price)) >> head(6)
price price_cummax
0 326 326.0
1 326 326.0
2 327 327.0
3 334 334.0
4 335 335.0
5 336 336.0
diamonds >> select(X.price) >> mutate(price_cummin=cummin(X.price)) >> head(6)
price price_cummin
0 326 326.0
1 326 326.0
2 327 326.0
3 334 326.0
4 335 326.0
5 336 326.0

你可能感兴趣的:(python)