1. apply
Series.apply(func, convert_dtype=True, args=(), **kwds)
>>> import pandas as pd
>>> import numpy as np
>>> series = pd.Series([20, 21, 12], index=['London',
... 'New York','Helsinki'])
>>> series
London 20
New York 21
Helsinki 12
dtype: int64
>>> def square(x):
...     return x**2
>>> series.apply(square)
London 400
New York 441
Helsinki 144
dtype: int64
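The args and **kwds parameters forward extra positional/keyword arguments to func. A minimal sketch reusing the series above; the helper add_offset is illustrative, not part of the original:
>>> def add_offset(x, offset, scale=1):
...     return (x + offset) * scale
>>> series.apply(add_offset, args=(5,), scale=2)
London      50
New York    52
Helsinki    34
dtype: int64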
DataFrame.apply(func, axis=0, broadcast=None, raw=False, reduce=None, result_type=None, args=(), **kwds)
Each call to func receives a whole Series (a sketch demonstrating this follows the example below):
axis = 0: apply walks over each column of the DataFrame and combines the per-column results into a Series, which it returns
axis = 1: apply walks over each row of the DataFrame and combines the per-row results into a Series, which it returns
>>> df = pd.DataFrame([[4, 9],] * 3, columns=['A', 'B'])
>>> df
A B
0 4 9
1 4 9
2 4 9
>>> df.apply(np.sum, axis=0)
A 12
B 27
dtype: int64
>>> df.apply(np.sum, axis=1)
0 13
1 13
2 13
dtype: int64
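A quick way to confirm that func receives a Series on every call, reusing the df above (output shown for illustration):
>>> df.apply(lambda x: type(x).__name__, axis=1)
0    Series
1    Series
2    Series
dtype: object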
2. iloc, loc
>>> s2 = pd.Series(['a', 'b', 'c'], index=['one', 'two', 'three'])
>>> s2
one a
two b
three c
dtype: object
# loc indexes by label
>>> s2.loc['one']
'a'
# iloc indexes by integer position
>>> s2.iloc[0]
'a'
# [] falls back to integer position here, because the index labels are strings
>>> s2[0]
'a'
>>> s3 = pd.DataFrame({'a': [1, 2, 3], 'b': [4, 5, 6]}, index=['one', 'two', 'three'])
>>> s3
a b
one 1 4
two 2 5
three 3 6
# On a DataFrame, loc and iloc both index rows
>>> s3.iloc[0]
a 1
b 4
Name: one, dtype: int64
>>> s3.loc['one']
a 1
b 4
Name: one, dtype: int64
# [] indexes columns
>>> s3['a']
one 1
two 2
three 3
Name: a, dtype: int64
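Both accessors also slice rows; note that loc label slices include the end label while iloc integer slices do not. A sketch, assuming the s3 above:
>>> s3.loc['one':'two']
     a  b
one  1  4
two  2  5
>>> s3.iloc[0:2]
     a  b
one  1  4
two  2  5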
3. merge
DataFrame.merge(right, how='inner', on=None, left_on=None, right_on=None, left_index=False, right_index=False, sort=False, suffixes=('_x', '_y'), copy=True, indicator=False, validate=None)
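The frames A and B are not constructed in the original; a construction consistent with the values displayed below:
>>> A = pd.DataFrame({'lkey': ['foo', 'bar', 'baz', 'foo'],
...                   'value': [1, 2, 3, 4]})
>>> B = pd.DataFrame({'rkey': ['foo', 'bar', 'qux', 'bar'],
...                   'value': [5, 6, 7, 8]})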
>>> A
  lkey  value
0  foo      1
1  bar      2
2  baz      3
3  foo      4
>>> B
  rkey  value
0  foo      5
1  bar      6
2  qux      7
3  bar      8
>>> A.merge(B, left_on='lkey', right_on='rkey', how='outer')
lkey value_x rkey value_y
0 foo 1 foo 5
1 foo 4 foo 5
2 bar 2 bar 6
3 bar 2 bar 8
4 baz 3 NaN NaN
5 NaN NaN qux 7
4. sort_values, sort_index
DataFrame.sort_values(by, axis=0, ascending=True, inplace=False, kind='quicksort', na_position='last')
>>> p1 = pd.DataFrame({'col1': [1, 2, 3],
... 'col2': [4, 5, 6],
... 'col3': [7, 8, 9]})
>>> p1
col1 col2 col3
0 1 4 7
1 2 5 8
2 3 6 9
>>> p1.sort_values(by=0, ascending=False, axis=1)
col3 col2 col1
0 7 4 1
1 8 5 2
2 9 6 3
>>> p1.sort_values(by='col1', ascending=False, axis=0)
col1 col2 col3
2 3 6 9
1 2 5 8
0 1 4 7
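by also accepts a list of columns, with a matching list for ascending; ties on the first column then fall back to the next one. A sketch reusing p1:
>>> p1.sort_values(by=['col1', 'col2'], ascending=[False, True])
   col1  col2  col3
2     3     6     9
1     2     5     8
0     1     4     7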
DataFrame.sort_index(axis=0, level=None, ascending=True, inplace=False, kind='quicksort', na_position='last', sort_remaining=True, by=None)
>>> p1
col1 col2 col3
0 1 4 7
1 2 5 8
2 3 6 9
>>> p1.sort_index(axis=0, ascending=False)
col1 col2 col3
2 3 6 9
1 2 5 8
0 1 4 7
>>> p1.sort_index(axis=1, ascending=False)
col3 col2 col1
0 7 4 1
1 8 5 2
2 9 6 3
5. idxmax
Series.idxmax(axis=0, skipna=True, *args, **kwargs)
>>> s = pd.Series(data=[1, None, 4, 3, 4],
... index=['A', 'B', 'C', 'D', 'E'])
>>> s
A 1.0
B NaN
C 4.0
D 3.0
E 4.0
dtype: float64
>>> s.idxmax()
'C'
DataFrame.idxmax(axis=0, skipna=True)
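p2 is used here and in the sections below but never constructed in the original; a construction that matches the displayed values:
>>> p2 = pd.DataFrame({'col1': [1, 2, 3],
...                    'col2': [6, 1, 0],
...                    'col3': [2, 8, 1]})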
>>> p2
col1 col2 col3
0 1 6 2
1 2 1 8
2 3 0 1
>>> p2.idxmax()
col1 2
col2 0
col3 1
dtype: int64
>>> p2.idxmax(axis=1)
0 col2
1 col3
2 col1
dtype: object
6. Newly constructed features need to be converted to a NumPy dtype
result['citable doc per person'] = (result['Citable documents'] / result['Population']).astype(np.float64)
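A minimal, self-contained sketch of the same pattern; the result frame and its values here are hypothetical, and only the .astype(np.float64) cast mirrors the line above:
>>> result = pd.DataFrame({'Citable documents': [100, 250],
...                        'Population': [50, 100]})
>>> result['citable doc per person'] = (result['Citable documents']
...                                     / result['Population']).astype(np.float64)
>>> result['citable doc per person']
0    2.0
1    2.5
Name: citable doc per person, dtype: float64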
7. corr
Series.corr(other, method='pearson', min_periods=None)
>>> p2
col1 col2 col3
0 1 6 2
1 2 1 8
2 3 0 1
>>> p2.col2.corr(p2.col3)
-0.23281119015753007
DataFrame.corr(method='pearson', min_periods=1)
>>> p2
col1 col2 col3
0 1 6 2
1 2 1 8
2 3 0 1
>>> p2.corr()
col1 col2 col3
col1 1.000000 -0.933257 -0.132068
col2 -0.933257 1.000000 -0.232811
col3 -0.132068 -0.232811 1.000000
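method can also be 'spearman' or 'kendall'; a sketch on the same p2 (the matrix values below were computed by hand, and the exact display formatting may differ):
>>> p2.corr(method='spearman')
      col1  col2  col3
col1   1.0  -1.0  -0.5
col2  -1.0   1.0   0.5
col3  -0.5   0.5   1.0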
8. map
Accepts either a function or a dict-like object as its argument and returns the values obtained by mapping each element through that function or dict
>>> p2
col1 col2 col3
0 1 6 2
1 2 1 8
2 3 0 1
>>> mapp1 = {1: 10, 2: 12}
>>> p2.col1.map(mapp1)
0 10.0
1 12.0
2 NaN
Name: col1, dtype: float64
>>> p2
col1 col2 col3
0 1 6 2
1 2 1 8
2 3 0 1
>>> p2.col2.map(lambda x: x+1)
0 7
1 2
2 1
Name: col2, dtype: int64
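map also accepts a Series as the mapper, looking values up by its index; a sketch reusing p2:
>>> mapper = pd.Series(['ten', 'twelve'], index=[1, 2])
>>> p2.col1.map(mapper)
0       ten
1    twelve
2       NaN
Name: col1, dtype: object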
9. agg
DataFrameGroupBy.agg(arg, *args, **kwargs)
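The original df was presumably built with random values for column C; a construction that reproduces the frame shown below:
>>> df = pd.DataFrame({'A': [1, 1, 2, 2],
...                    'B': [1, 2, 3, 4],
...                    'C': [0.362838, 0.227877, 1.267767, -0.562860]})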
>>> df
A B C
0 1 1 0.362838
1 1 2 0.227877
2 2 3 1.267767
3 2 4 -0.562860
>>> df.groupby('A').agg(['min', 'max'])
B C
min max min max
A
1 1 2 0.227877 0.362838
2 3 4 -0.562860 1.267767
10. cut, qcut
pandas.cut(x, bins, right=True, labels=None, retbins=False, precision=3, include_lowest=False, duplicates='raise')
>>> pd.cut(np.array([1, 7, 5, 4, 6, 3]), 3)
[(0.994, 3.0], (5.0, 7.0], (3.0, 5.0], (3.0, 5.0], (5.0, 7.0], (0.994, 3.0]]
Categories (3, interval[float64]): [(0.994, 3.0] < (3.0, 5.0] < (5.0, 7.0]]
pandas.qcut(x, q, labels=None, retbins=False, precision=3, duplicates='raise')
>>> pd.qcut(range(5), 4)
[(-0.001, 1.0], (-0.001, 1.0], (1.0, 2.0], (2.0, 3.0], (3.0, 4.0]]
Categories (4, interval[float64]): [(-0.001, 1.0] < (1.0, 2.0] < (2.0, 3.0] < (3.0, 4.0]]
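labels replaces the interval categories with custom names; a sketch on the same data as the cut example above (the exact Categories display varies slightly across pandas versions):
>>> pd.cut(np.array([1, 7, 5, 4, 6, 3]), 3, labels=['low', 'mid', 'high'])
[low, high, mid, mid, high, low]
Categories (3, object): [low < mid < high]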