pandas

#pandas
import pandas as pd
import numpy as np
from pandas import Series,DataFrame
#Series
obj = pd.Series([4,7,-5,3])
obj
0    4
1    7
2   -5
3    3
dtype: int64
obj.values
array([ 4,  7, -5,  3], dtype=int64)
obj.index
RangeIndex(start=0, stop=4, step=1)
obj2 = pd.Series([4,7,-5,3],index = ['d','b','a','c'])
obj2
d    4
b    7
a   -5
c    3
dtype: int64
obj2.index
Index(['d', 'b', 'a', 'c'], dtype='object')
#索引取值
obj2['a']
-5
obj2['d'] = 6
obj2[['c','a','d']]
c    3
a   -5
d    6
dtype: int64
obj2[obj2>0]
d    6
b    7
c    3
dtype: int64
obj2*2
d    12
b    14
a   -10
c     6
dtype: int64
np.exp(obj2)
d     403.428793
b    1096.633158
a       0.006738
c      20.085537
dtype: float64
'b'in obj2
True
'r' in obj2
False
sdata = {'Ohio': 35000, 'Texas': 71000, 'Oregon': 16000, 'Utah': 5000}
obj3 = pd.Series(sdata)
obj3
Ohio      35000
Texas     71000
Oregon    16000
Utah       5000
dtype: int64
states = ['California', 'Ohio', 'Oregon', 'Texas']
obj4 = pd.Series(sdata,index=states)
obj4
California        NaN
Ohio          35000.0
Oregon        16000.0
Texas         71000.0
dtype: float64
#缺失值
pd.isnull(obj4)
California     True
Ohio          False
Oregon        False
Texas         False
dtype: bool
pd.notnull(obj4)
California    False
Ohio           True
Oregon         True
Texas          True
dtype: bool
obj4.isnull()
California     True
Ohio          False
Oregon        False
Texas         False
dtype: bool
#根据运算的索引标签自动对齐数据:
obj3 
Ohio      35000
Texas     71000
Oregon    16000
Utah       5000
dtype: int64
obj4
California        NaN
Ohio          35000.0
Oregon        16000.0
Texas         71000.0
dtype: float64
obj3 + obj4
California         NaN
Ohio           70000.0
Oregon         32000.0
Texas         142000.0
Utah               NaN
dtype: float64
obj4.name = 'population'
obj4.index.name = 'state'
obj4
state
California        NaN
Ohio          35000.0
Oregon        16000.0
Texas         71000.0
Name: population, dtype: float64
obj
0    4
1    7
2   -5
3    3
dtype: int64
obj.index = ['Bob', 'Steve', 'Jeff', 'Ryan']
obj
Bob      4
Steve    7
Jeff    -5
Ryan     3
dtype: int64
#DataFrame 表格型数据结构
# Column-oriented input for pd.DataFrame: each key names a column and each
# list holds that column's values in row order (all lists have length 6).
data = {'state': ['Ohio', 'Ohio', 'Ohio', 'Nevada', 'Nevada', 'Nevada'],
        'year': [2000, 2001, 2002, 2001, 2002, 2003],
        'pop': [1.5, 1.7, 3.6, 2.4, 2.9, 3.2]}
frame = pd.DataFrame(data)
frame
state year pop
0 Ohio 2000 1.5
1 Ohio 2001 1.7
2 Ohio 2002 3.6
3 Nevada 2001 2.4
4 Nevada 2002 2.9
5 Nevada 2003 3.2
frame.head()  #前五行
state year pop
0 Ohio 2000 1.5
1 Ohio 2001 1.7
2 Ohio 2002 3.6
3 Nevada 2001 2.4
4 Nevada 2002 2.9
pd.DataFrame(data,columns=['year','state','pop'])
year state pop
0 2000 Ohio 1.5
1 2001 Ohio 1.7
2 2002 Ohio 3.6
3 2001 Nevada 2.4
4 2002 Nevada 2.9
5 2003 Nevada 3.2
frame2 = pd.DataFrame(data, columns=['year', 'state', 'pop', 'debt'],
                      index=['one', 'two', 'three', 'four', 'five', 'six'])
frame2

year state pop debt
one 2000 Ohio 1.5 NaN
two 2001 Ohio 1.7 NaN
three 2002 Ohio 3.6 NaN
four 2001 Nevada 2.4 NaN
five 2002 Nevada 2.9 NaN
six 2003 Nevada 3.2 NaN
frame2.columns
Index(['year', 'state', 'pop', 'debt'], dtype='object')
frame2['state'] #返回Series
one        Ohio
two        Ohio
three      Ohio
four     Nevada
five     Nevada
six      Nevada
Name: state, dtype: object
frame2.year
one      2000
two      2001
three    2002
four     2001
five     2002
six      2003
Name: year, dtype: int64
frame2.loc['three']    #loc返回行数据
year     2002
state    Ohio
pop       3.6
debt      NaN
Name: three, dtype: object
frame2['debt'] = 16.5
frame2
year state pop debt
one 2000 Ohio 1.5 16.5
two 2001 Ohio 1.7 16.5
three 2002 Ohio 3.6 16.5
four 2001 Nevada 2.4 16.5
five 2002 Nevada 2.9 16.5
six 2003 Nevada 3.2 16.5
frame2['debt'] = np.arange(6.)
frame2
year state pop debt
one 2000 Ohio 1.5 0.0
two 2001 Ohio 1.7 1.0
three 2002 Ohio 3.6 2.0
four 2001 Nevada 2.4 3.0
five 2002 Nevada 2.9 4.0
six 2003 Nevada 3.2 5.0
val = pd.Series([-1.2,-1.5,-1.7],index = ['two','four','five'])
frame2['debt'] = val
frame2
year state pop debt
one 2000 Ohio 1.5 NaN
two 2001 Ohio 1.7 -1.2
three 2002 Ohio 3.6 NaN
four 2001 Nevada 2.4 -1.5
five 2002 Nevada 2.9 -1.7
six 2003 Nevada 3.2 NaN
frame2['eastern'] = frame2.state == 'Ohio'
frame2
year state pop debt eastern
one 2000 Ohio 1.5 NaN True
two 2001 Ohio 1.7 -1.2 True
three 2002 Ohio 3.6 NaN True
four 2001 Nevada 2.4 -1.5 False
five 2002 Nevada 2.9 -1.7 False
six 2003 Nevada 3.2 NaN False
del frame2['eastern']
frame2
year state pop debt
one 2000 Ohio 1.5 NaN
two 2001 Ohio 1.7 -1.2
three 2002 Ohio 3.6 NaN
four 2001 Nevada 2.4 -1.5
five 2002 Nevada 2.9 -1.7
six 2003 Nevada 3.2 NaN
#嵌套字典
pop = {'Nevada': {2001: 2.4, 2002: 2.9},
       'Ohio': {2000: 1.5, 2001: 1.7, 2002: 3.6}}
#外层字典的键作为列,内层键则作为行索引
frame3 = pd.DataFrame(pop)
frame3
Nevada Ohio
2000 NaN 1.5
2001 2.4 1.7
2002 2.9 3.6
#转置
frame3.T
2000 2001 2002
Nevada NaN 2.4 2.9
Ohio 1.5 1.7 3.6
frame3.index.name = 'year';frame3.columns.name = 'state'
frame3
state Nevada Ohio
year
2000 NaN 1.5
2001 2.4 1.7
2002 2.9 3.6
frame3.values
array([[nan, 1.5],
       [2.4, 1.7],
       [2.9, 3.6]])
frame2.values
array([[2000, 'Ohio', 1.5, nan],
       [2001, 'Ohio', 1.7, -1.2],
       [2002, 'Ohio', 3.6, nan],
       [2001, 'Nevada', 2.4, -1.5],
       [2002, 'Nevada', 2.9, -1.7],
       [2003, 'Nevada', 3.2, nan]], dtype=object)

pandas_第1张图片

#索引对象 Index
obj = pd.Series(range(3),index=['a','b','c'])
index = obj.index
index
Index(['a', 'b', 'c'], dtype='object')
index[1:]
Index(['b', 'c'], dtype='object')
labels = pd.Index(np.arange(3))
labels
Int64Index([0, 1, 2], dtype='int64')
obj2 = pd.Series([1.5,-2.5,0],index=labels)
obj2
0    1.5
1   -2.5
2    0.0
dtype: float64
obj2.index is labels
True
frame3
Nevada Ohio
2000 NaN 1.5
2001 2.4 1.7
2002 2.9 3.6
frame3.columns
Index(['Nevada', 'Ohio'], dtype='object')
'Ohio'in frame3.columns
True
dup_labels = pd.Index(['foo','foo', 'bar', 'bar'])
dup_labels
Index(['foo', 'foo', 'bar', 'bar'], dtype='object')

pandas_第2张图片

#基本功能
#重新索引
obj = pd.Series([4.5, 7.2, -5.3, 3.6], index=['d', 'b', 'a', 'c'])
obj
d    4.5
b    7.2
a   -5.3
c    3.6
dtype: float64
obj2 = obj.reindex(['a', 'b', 'c', 'd', 'e'])
obj2
a   -5.3
b    7.2
c    3.6
d    4.5
e    NaN
dtype: float64
obj3 = pd.Series(['blue','purple','yellow'],index=[0,2,4])
obj3
0      blue
2    purple
4    yellow
dtype: object
obj3.reindex(range(6),method='ffill')#插值处理
0      blue
1      blue
2    purple
3    purple
4    yellow
5    yellow
dtype: object
frame = pd.DataFrame(np.arange(9).reshape((3, 3)),
                         index=['a', 'c', 'd'],
                         columns=['Ohio', 'Texas', 'California'])
frame
Ohio Texas California
a 0 1 2
c 3 4 5
d 6 7 8
frame2 = frame.reindex(['a','b','c','d'])
frame2
Ohio Texas California
a 0.0 1.0 2.0
b NaN NaN NaN
c 3.0 4.0 5.0
d 6.0 7.0 8.0
states = ['Texas','Utah','California']
frame.reindex(columns=states)
Texas Utah California
a 1 NaN 2
c 4 NaN 5
d 7 NaN 8

pandas_第3张图片

#丢弃指定轴上的项 drop
obj = pd.Series(np.arange(5.), index=['a', 'b', 'c', 'd', 'e'])
obj
a    0.0
b    1.0
c    2.0
d    3.0
e    4.0
dtype: float64
new_obj = obj.drop('c')
new_obj
a    0.0
b    1.0
d    3.0
e    4.0
dtype: float64
obj.drop(['d','c'])
a    0.0
b    1.0
e    4.0
dtype: float64
data = pd.DataFrame(np.arange(16).reshape((4, 4)),
                        index=['Ohio', 'Colorado', 'Utah', 'New York'],
                        columns=['one', 'two', 'three', 'four'])

data
one two three four
Ohio 0 1 2 3
Colorado 4 5 6 7
Utah 8 9 10 11
New York 12 13 14 15
data.drop(['Colorado','Ohio'])
one two three four
Utah 8 9 10 11
New York 12 13 14 15
data.drop('two',axis=1) #axis指定行或列
one three four
Ohio 0 2 3
Colorado 4 6 7
Utah 8 10 11
New York 12 14 15
data.drop(['two','four'],axis = 'columns')
one three
Ohio 0 2
Colorado 4 6
Utah 8 10
New York 12 14
obj
a    0.0
b    1.0
c    2.0
d    3.0
e    4.0
dtype: float64
obj.drop('c',inplace=True) #inplace=True会就地修改原对象,被删除的数据无法恢复
obj
a    0.0
b    1.0
d    3.0
e    4.0
dtype: float64
obj
a    0.0
b    1.0
d    3.0
e    4.0
dtype: float64
#索引,选取和过滤
obj = pd.Series(np.arange(4),index=['a', 'b', 'c', 'd'])
obj
a    0
b    1
c    2
d    3
dtype: int32
obj['b']
1
obj[1]
1
obj[2:4]
c    2
d    3
dtype: int32
obj[['b','a','d']]
b    1
a    0
d    3
dtype: int32
obj[[1,3]]
b    1
d    3
dtype: int32
obj[obj<2]
a    0
b    1
dtype: int32
#利用标签的切片运算与普通的Python切片运算不同,其末端是包含的
obj['b':'c']
b    1
c    2
dtype: int32
obj['b':'c'] = 5
obj
a    0
b    5
c    5
d    3
dtype: int32
data = pd.DataFrame(np.arange(16).reshape((4, 4)),
                      index=['Ohio', 'Colorado', 'Utah', 'New York'],
                      columns=['one', 'two', 'three', 'four'])

data
one two three four
Ohio 0 1 2 3
Colorado 4 5 6 7
Utah 8 9 10 11
New York 12 13 14 15
data['two']
Ohio         1
Colorado     5
Utah         9
New York    13
Name: two, dtype: int32
data[['three','one']]
three one
Ohio 2 0
Colorado 6 4
Utah 10 8
New York 14 12
data[:2]
one two three four
Ohio 0 1 2 3
Colorado 4 5 6 7
data[data['three']>5]
one two three four
Colorado 4 5 6 7
Utah 8 9 10 11
New York 12 13 14 15
data<5
one two three four
Ohio True True True True
Colorado True False False False
Utah False False False False
New York False False False False
data[data<5] = 0
data
one two three four
Ohio 0 0 0 0
Colorado 0 5 6 7
Utah 8 9 10 11
New York 12 13 14 15
#利用loc和iloc进行选取;标签运算符
data.loc['Colorado',['two','three']]

two      5
three    6
Name: Colorado, dtype: int32
data.iloc[2,[3,0,1]]
four    11
one      8
two      9
Name: Utah, dtype: int32
data.iloc[2]
one       8
two       9
three    10
four     11
Name: Utah, dtype: int32
data.iloc[[1,2],[3,0,1]]
four one two
Colorado 7 0 5
Utah 11 8 9
data.loc[:'Utah','two']
Ohio        0
Colorado    5
Utah        9
Name: two, dtype: int32
data.iloc[:,:3][data.three>5]
one two three
Colorado 0 5 6
Utah 8 9 10
New York 12 13 14

pandas_第4张图片

#整数索引
ser = pd.Series(np.arange(3))
ser
0    0
1    1
2    2
dtype: int32
ser[-1]
---------------------------------------------------------------------------

KeyError                                  Traceback (most recent call last)

 in ()
----> 1 ser[-1]


C:\Anaconda\lib\site-packages\pandas\core\series.py in __getitem__(self, key)
    765         key = com._apply_if_callable(key, self)
    766         try:
--> 767             result = self.index.get_value(self, key)
    768 
    769             if not is_scalar(result):


C:\Anaconda\lib\site-packages\pandas\core\indexes\base.py in get_value(self, series, key)
   3116         try:
   3117             return self._engine.get_value(s, k,
-> 3118                                           tz=getattr(series.dtype, 'tz', None))
   3119         except KeyError as e1:
   3120             if len(self) > 0 and self.inferred_type in ['integer', 'boolean']:


pandas\_libs\index.pyx in pandas._libs.index.IndexEngine.get_value()


pandas\_libs\index.pyx in pandas._libs.index.IndexEngine.get_value()


pandas\_libs\index.pyx in pandas._libs.index.IndexEngine.get_loc()


pandas\_libs\hashtable_class_helper.pxi in pandas._libs.hashtable.Int64HashTable.get_item()


pandas\_libs\hashtable_class_helper.pxi in pandas._libs.hashtable.Int64HashTable.get_item()


KeyError: -1
ser2 = pd.Series(np.arange(3.), index=['a', 'b', 'c']) #非整数索引
ser2[-1]
2.0
ser[:1]
0    0
dtype: int32
ser.loc[:1] #注意区别
0    0
1    1
dtype: int32
#算术运算和数据对齐
s1 = pd.Series([7.3, -2.5, 3.4, 1.5], index=['a', 'c', 'd', 'e'])
s2 = pd.Series([-2.1, 3.6, -1.5, 4, 3.1],
               index=['a', 'c', 'e', 'f', 'g'])
s1
a    7.3
c   -2.5
d    3.4
e    1.5
dtype: float64
s2
a   -2.1
c    3.6
e   -1.5
f    4.0
g    3.1
dtype: float64
s1 + s2  #对齐操作
a    5.2
c    1.1
d    NaN
e    0.0
f    NaN
g    NaN
dtype: float64
df1 = pd.DataFrame(np.arange(9.).reshape((3, 3)), columns=list('bcd'),
                    index=['Ohio', 'Texas', 'Colorado'])
df2 = pd.DataFrame(np.arange(12.).reshape((4, 3)), columns=list('bde'),
                       index=['Utah', 'Ohio', 'Texas', 'Oregon'])
df1
b c d
Ohio 0.0 1.0 2.0
Texas 3.0 4.0 5.0
Colorado 6.0 7.0 8.0
df2
b d e
Utah 0.0 1.0 2.0
Ohio 3.0 4.0 5.0
Texas 6.0 7.0 8.0
Oregon 9.0 10.0 11.0
df1 + df2 #DataFrame对象相加,没有同时出现在两个对象中的行或列标签,对应结果为NaN
b c d e
Colorado NaN NaN NaN NaN
Ohio 3.0 NaN 6.0 NaN
Oregon NaN NaN NaN NaN
Texas 9.0 NaN 12.0 NaN
Utah NaN NaN NaN NaN
#在算术方法中填充值
df1 = pd.DataFrame(np.arange(12.).reshape((3, 4)),
                    columns=list('abcd'))
df2 = pd.DataFrame(np.arange(20.).reshape((4, 5)),
                    columns=list('abcde'))  
df2.loc[1,'b'] = np.nan

df1
a b c d
0 0.0 1.0 2.0 3.0
1 4.0 5.0 6.0 7.0
2 8.0 9.0 10.0 11.0
df2
a b c d e
0 0.0 1.0 2.0 3.0 4.0
1 5.0 NaN 7.0 8.0 9.0
2 10.0 11.0 12.0 13.0 14.0
3 15.0 16.0 17.0 18.0 19.0
df1 + df2
a b c d e
0 0.0 2.0 4.0 6.0 NaN
1 9.0 NaN 13.0 15.0 NaN
2 18.0 20.0 22.0 24.0 NaN
3 NaN NaN NaN NaN NaN
df1.add(df2,fill_value=0)   #指定填充值
a b c d e
0 0.0 2.0 4.0 6.0 4.0
1 9.0 5.0 13.0 15.0 9.0
2 18.0 20.0 22.0 24.0 14.0
3 15.0 16.0 17.0 18.0 19.0

pandas_第5张图片

#DataFrame和Series之间的运算
arr = np.arange(12).reshape(3,4)
arr
array([[ 0,  1,  2,  3],
       [ 4,  5,  6,  7],
       [ 8,  9, 10, 11]])
arr[0]
array([0, 1, 2, 3])
arr - arr[0]    #广播
array([[0, 0, 0, 0],
       [4, 4, 4, 4],
       [8, 8, 8, 8]])
frame = pd.DataFrame(np.arange(12.).reshape((4, 3)),
                      columns=list('bde'),
                      index=['Utah', 'Ohio', 'Texas', 'Oregon'])
series =frame.iloc[0]
frame
b d e
Utah 0.0 1.0 2.0
Ohio 3.0 4.0 5.0
Texas 6.0 7.0 8.0
Oregon 9.0 10.0 11.0
series
b    0.0
d    1.0
e    2.0
Name: Utah, dtype: float64
frame - series
b d e
Utah 0.0 0.0 0.0
Ohio 3.0 3.0 3.0
Texas 6.0 6.0 6.0
Oregon 9.0 9.0 9.0
series2 = pd.Series(range(3), index=['b', 'e', 'f'])
frame + series2
b d e f
Utah 0.0 NaN 3.0 NaN
Ohio 3.0 NaN 6.0 NaN
Texas 6.0 NaN 9.0 NaN
Oregon 9.0 NaN 12.0 NaN
series3 = frame['d']
frame
b d e
Utah 0.0 1.0 2.0
Ohio 3.0 4.0 5.0
Texas 6.0 7.0 8.0
Oregon 9.0 10.0 11.0
series3
Utah       1.0
Ohio       4.0
Texas      7.0
Oregon    10.0
Name: d, dtype: float64
frame.sub(series3,axis='index')
b d e
Utah -1.0 0.0 1.0
Ohio -1.0 0.0 1.0
Texas -1.0 0.0 1.0
Oregon -1.0 0.0 1.0
#函数应用和映射 ufuncs(元素级数组方法)
frame = pd.DataFrame(np.random.randn(4, 3), columns=list('bde'),
                      index=['Utah', 'Ohio', 'Texas', 'Oregon'])
frame
b d e
Utah -0.951265 -0.498273 -0.388690
Ohio 1.988546 0.370789 -0.488038
Texas 0.692938 -0.160944 0.654771
Oregon -1.314237 1.163286 -1.687210
np.abs(frame)
b d e
Utah 0.951265 0.498273 0.388690
Ohio 1.988546 0.370789 0.488038
Texas 0.692938 0.160944 0.654771
Oregon 1.314237 1.163286 1.687210
def f(x):
    """Return the spread of x: its maximum minus its minimum."""
    return x.max() - x.min()
frame.apply(f)  #默认列执行f
b    3.302783
d    1.661559
e    2.341980
dtype: float64
frame.apply(f,axis='columns')#行执行f
Utah      0.562574
Ohio      2.476585
Texas     0.853882
Oregon    2.850495
dtype: float64
def f(x):
    """Summarize x as a two-entry Series labelled 'min' and 'max'."""
    lowest = x.min()
    highest = x.max()
    extremes = [lowest, highest]
    return pd.Series(extremes, index=['min', 'max'])
frame.apply(f)     #传入的函数也可以返回由多个值组成的Series
b d e
min -1.314237 -0.498273 -1.687210
max 1.988546 1.163286 0.654771
def format(x):
    """Render a number as a string with exactly two decimal places.

    Note: the name shadows the built-in ``format``; it is kept because the
    statements that follow (``applymap(format)`` / ``map(format)``) use it.
    """
    return '%.2f' % x
frame.applymap(format)      #元素级函数
b d e
Utah -0.95 -0.50 -0.39
Ohio 1.99 0.37 -0.49
Texas 0.69 -0.16 0.65
Oregon -1.31 1.16 -1.69
frame['e'].map(format)     #区分map与applymap
Utah      -0.39
Ohio      -0.49
Texas      0.65
Oregon    -1.69
Name: e, dtype: object
# 排序和排名
obj = pd.Series(range(4), index=['d', 'a', 'b', 'c'])
obj.sort_index()
a    1
b    2
c    3
d    0
dtype: int64
frame = pd.DataFrame(np.arange(8).reshape((2, 4)),
                       index=['three', 'one'],
                       columns=['d', 'a', 'b', 'c'])
frame.sort_index()


d a b c
one 4 5 6 7
three 0 1 2 3
frame.sort_index(axis=1,ascending=False)
d c b a
three 0 3 2 1
one 4 7 6 5
obj = pd.Series([4,7,-3,2])
obj.sort_values()
2   -3
3    2
0    4
1    7
dtype: int64
obj = pd.Series([4, np.nan, 7, np.nan, -3, 2])
obj.sort_values()
4   -3.0
5    2.0
0    4.0
2    7.0
1    NaN
3    NaN
dtype: float64
frame = pd.DataFrame({'b': [4, 7, -3, 2], 'a': [0, 1, 0, 1]})
frame
b a
0 4 0
1 7 1
2 -3 0
3 2 1
frame.sort_values(by='b')
b a
2 -3 0
3 2 1
0 4 0
1 7 1
frame.sort_values(by=['a','b'])
b a
2 -3 0
0 4 0
3 2 1
1 7 1
obj=pd.Series([7, -5, 7, 4, 2, 0, 4])#rank是通过“为各组分配一个平均排名”的方式破坏平级关系的
obj.rank()
0    6.5
1    1.0
2    6.5
3    4.5
4    3.0
5    2.0
6    4.5
dtype: float64
obj.rank(method='first')
0    6.0
1    1.0
2    7.0
3    4.0
4    3.0
5    2.0
6    5.0
dtype: float64
obj.rank(ascending=False, method='max')
0    2.0
1    7.0
2    2.0
3    4.0
4    5.0
5    6.0
6    4.0
dtype: float64
frame = pd.DataFrame({'b': [4.3, 7, -3, 2], 'a': [0, 1, 0, 1],
                      'c': [-2, 5, 8, -2.5]})
frame
b a c
0 4.3 0 -2.0
1 7.0 1 5.0
2 -3.0 0 8.0
3 2.0 1 -2.5
frame.rank(axis='columns')
b a c
0 3.0 2.0 1.0
1 3.0 1.0 2.0
2 1.0 2.0 3.0
3 3.0 2.0 1.0

pandas_第6张图片

#带有重复标签的轴索引
obj = pd.Series(range(5),index=['a','a','b','b','c'])
obj
a    0
a    1
b    2
b    3
c    4
dtype: int64
obj.index.is_unique
False
obj['a']
a    0
a    1
dtype: int64
obj['b']
b    2
b    3
dtype: int64
df = pd.DataFrame(np.random.randn(4,3),index= ['a','a','b','b'])
df
0 1 2
a 1.265240 0.407293 -0.652129
a 0.268019 -1.423912 1.297783
b 0.797760 -0.353663 1.323543
b 0.961888 0.227132 1.843558
df.loc['b']
0 1 2
b 0.797760 -0.353663 1.323543
b 0.961888 0.227132 1.843558
#汇总和计算描述统计
df = pd.DataFrame([[1.4, np.nan], [7.1, -4.5],
                   [np.nan, np.nan], [0.75, -1.3]],
                   index=['a', 'b', 'c', 'd'],
                   columns=['one', 'two'])
df

one two
a 1.40 NaN
b 7.10 -4.5
c NaN NaN
d 0.75 -1.3
df.sum()   #列和
one    9.25
two   -5.80
dtype: float64
df.sum(axis=1)
a    1.40
b    2.60
c    0.00
d   -0.55
dtype: float64
df.mean(axis='columns',skipna=False)   #不忽略Nan
a      NaN
b    1.300
c      NaN
d   -0.275
dtype: float64
df.idxmax()  #达到最大值的索引
one    b
two    d
dtype: object
df.cumsum()
one two
a 1.40 NaN
b 8.50 -4.5
c NaN NaN
d 9.25 -5.8
df.describe()
one two
count 3.000000 2.000000
mean 3.083333 -2.900000
std 3.493685 2.262742
min 0.750000 -4.500000
25% 1.075000 -3.700000
50% 1.400000 -2.900000
75% 4.250000 -2.100000
max 7.100000 -1.300000
obj = pd.Series(['a', 'a', 'b', 'c'] * 4)#非数值型
obj.describe()
count     16
unique     3
top        a
freq       8
dtype: object

pandas_第7张图片
pandas_第8张图片

#相关系数与协方差
import pandas_datareader.data as web
# Fetch one dataset per ticker symbol through pandas_datareader
# (presumably Yahoo Finance price history; likely needs network access -- verify).
all_data = {ticker:web.get_data_yahoo(ticker)
           for ticker in ['AAPL', 'IBM', 'MSFT', 'GOOG']}
# One column per ticker, taken from each dataset's 'Adj Close' field.
price = pd.DataFrame({ticker: data['Adj Close']
                     for ticker, data in all_data.items()})
# Same layout as `price`, built from each dataset's 'Volume' field.
volume = pd.DataFrame({ticker: data['Volume']
                      for ticker, data in all_data.items()})
returns = price.pct_change()
returns.tail()
                          
AAPL IBM MSFT GOOG
Date
2018-10-18 -0.023374 -0.026110 -0.019962 -0.024846
2018-10-19 0.015230 -0.011107 0.001475 0.007804
2018-10-22 0.006110 0.007126 0.008927 0.004287
2018-10-23 0.009427 0.009152 -0.013956 0.002297
2018-10-24 -0.034302 -0.030486 -0.053469 -0.048003
returns['MSFT'].corr(returns['IBM']) #相关系数

0.4746674318628231
returns["MSFT"].cov(returns["IBM"])#协方差
8.150193655338736e-05
returns.MSFT.corr(returns.IBM)
0.4746674318628231
returns.corr()   #相关系数矩阵
AAPL IBM MSFT GOOG
AAPL 1.000000 0.364434 0.421984 0.438015
IBM 0.364434 1.000000 0.474667 0.398449
MSFT 0.421984 0.474667 1.000000 0.516364
GOOG 0.438015 0.398449 0.516364 1.000000
returns.cov()  #协方差矩阵
AAPL IBM MSFT GOOG
AAPL 0.000252 0.000070 0.000095 0.000106
IBM 0.000070 0.000146 0.000082 0.000073
MSFT 0.000095 0.000082 0.000202 0.000112
GOOG 0.000106 0.000073 0.000112 0.000232
returns.corrwith(returns.IBM)#与某一列或行的相关系数
AAPL    0.364434
IBM     1.000000
MSFT    0.474667
GOOG    0.398449
dtype: float64
returns.corrwith(volume)#传入一个DataFrame则会计算按列名配对的相关系数
AAPL   -0.065065
IBM    -0.173822
MSFT   -0.088563
GOOG   -0.016396
dtype: float64
#唯一值、值计数以及成员资格
obj = pd.Series(['c', 'a', 'd', 'a', 'a', 'b', 'b', 'c', 'c'])
uniques = obj.unique()
uniques
array(['c', 'a', 'd', 'b'], dtype=object)
obj.value_counts()#计算出现频率
c    3
a    3
b    2
d    1
dtype: int64
obj
0    c
1    a
2    d
3    a
4    a
5    b
6    b
7    c
8    c
dtype: object
mask = obj.isin(['b','c'])#用于判断矢量化集合的成员资格
mask
0     True
1    False
2    False
3    False
4    False
5     True
6     True
7     True
8     True
dtype: bool
obj[mask]
0    c
5    b
6    b
7    c
8    c
dtype: object
to_match = pd.Series(['c', 'a', 'b', 'b', 'c', 'a'])
unique_values = pd.Series(['c','b','a'])
pd.Index(unique_values).get_indexer(to_match)
#与isin类似的是Index.get_indexer方法,它可以给你一个索引数组,从可能包含重复值的数组到另一个不同值的数组
array([0, 2, 1, 1, 0, 2], dtype=int64)

pandas_第9张图片

你可能感兴趣的:(利用python进行数据分析)