利用Python进行数据分析的学习笔记——chap5

pandas的数据结构介绍

from pandas import Series,DataFrame
import pandas as pd
import numpy as np

Series
(索引在左边,值在右边。可看作是一个定长的有序字典)

obj = Series([4,7,-5,3])
obj
0    4
1    7
2   -5
3    3
dtype: int64
#通过Series的values和index属性获取其数组表示形式和索引对象。
obj.values
obj.index
RangeIndex(start=0, stop=4, step=1)
obj2 = Series([4,7,-5,3],index=['d','b','a','c'])
obj2.index
Index(['d', 'b', 'a', 'c'], dtype='object')
#一些基本操作
obj2[obj2 > 0]
obj2 * 2
np.exp(obj2)
'b' in obj2
'e' in obj2
False
sdata = {'Ohio':35000,'Texas':71000,'Oregon':16000,'Utah':5000}
#用字典来创建Series
obj3 = Series(sdata)
states = ['California','Ohio','Oregon','Texas']
obj4 = Series(sdata,index=states)
obj4
California        NaN
Ohio          35000.0
Oregon        16000.0
Texas         71000.0
dtype: float64
#检测缺失值
pd.isnull(obj4)
pd.notnull(obj4)
California    False
Ohio           True
Oregon         True
Texas          True
dtype: bool
#Series在算术运算中会自动对齐不同索引的数据
obj3 + obj4
California         NaN
Ohio           70000.0
Oregon         32000.0
Texas         142000.0
Utah               NaN
dtype: float64
#Series的name属性
obj4.name = 'population'
obj4.index.name = 'state'
obj4
state
California        NaN
Ohio          35000.0
Oregon        16000.0
Texas         71000.0
Name: population, dtype: float64
#Series的索引可以通过赋值的方式就地修改
obj.index = ['Bob','Steve','Jeff','Ryan']
obj
Bob      4
Steve    7
Jeff    -5
Ryan     3
dtype: int64

DataFrame

#构建DataFrame
data = {'state':['Ohio','Ohio','Ohio','Nevada','Nevada'],
       'year':[2000,2001,2002,2001,2002],
       'pop':[1.5,1.7,3.6,2.4,2.9]}
#DataFrame会自动加上索引;新版pandas中各列保持字典键的插入顺序(旧版pandas才会按列名排序)
frame = DataFrame(data)
frame
state year pop
0 Ohio 2000 1.5
1 Ohio 2001 1.7
2 Ohio 2002 3.6
3 Nevada 2001 2.4
4 Nevada 2002 2.9
#指定列序列,按照该顺序进行排列
DataFrame(data,columns=['year','state','pop'])
year state pop
0 2000 Ohio 1.5
1 2001 Ohio 1.7
2 2002 Ohio 3.6
3 2001 Nevada 2.4
4 2002 Nevada 2.9
#如果传入的列在数据中找不到,就会产生NA值
frame2 = DataFrame(data,columns=['year','state','pop','debt'],index=['one','two','three','four','five'])
frame2
year state pop debt
one 2000 Ohio 1.5 NaN
two 2001 Ohio 1.7 NaN
three 2002 Ohio 3.6 NaN
four 2001 Nevada 2.4 NaN
five 2002 Nevada 2.9 NaN
#将DataFrame的列获取为一个Series
frame2['state']
frame2.year
#通过位置或名称的方式获取行
frame2.loc['three']
year     2002
state    Ohio
pop       3.6
debt      NaN
Name: three, dtype: object
#列可以通过赋值的方式进行修改。标量或一组值
frame2['debt'] = 16.5
frame2['debt'] = np.arange(5.)
frame2
year state pop debt
one 2000 Ohio 1.5 0.0
two 2001 Ohio 1.7 1.0
three 2002 Ohio 3.6 2.0
four 2001 Nevada 2.4 3.0
five 2002 Nevada 2.9 4.0
#如果赋值的是一个Series,会精确匹配DataFrame的索引,所有的空位都将被填上缺失值
val = Series([-1.2,-1.5,-1.7],index=['two','four','five'])
frame2['debt'] = val
frame2
year state pop debt
one 2000 Ohio 1.5 NaN
two 2001 Ohio 1.7 -1.2
three 2002 Ohio 3.6 NaN
four 2001 Nevada 2.4 -1.5
five 2002 Nevada 2.9 -1.7
#为不存在的列赋值会创建出一个新列
frame2['eastern'] = frame2.state == 'Ohio'
frame2
year state pop debt eastern
one 2000 Ohio 1.5 NaN True
two 2001 Ohio 1.7 -1.2 True
three 2002 Ohio 3.6 NaN True
four 2001 Nevada 2.4 -1.5 False
five 2002 Nevada 2.9 -1.7 False
#删除列
del frame2['eastern']
frame2.columns
Index(['year', 'state', 'pop', 'debt'], dtype='object')
#嵌套字典
pop = {'Nevada':{2001:2.4,2002:2.9},'Ohio':{2000:1.5,2001:1.7,2002:3.6}}
#创建DataFrame。外层字典的键作为列,内层键作为行索引
frame3 = DataFrame(pop)
#转置
frame3.T
2001 2002 2000
Nevada 2.4 2.9 NaN
Ohio 1.7 3.6 1.5

#有个知识点
利用Python进行数据分析的学习笔记——chap5_第1张图片

#设置DataFrame的index和columns的name属性
frame3.index.name = 'year';frame3.columns.name = 'state'
frame3.values
#各列数据类型不同,则值数组的数据类型会选用能兼容所有列的数据类型
frame2.values
array([[2000, 'Ohio', 1.5, nan],
       [2001, 'Ohio', 1.7, -1.2],
       [2002, 'Ohio', 3.6, nan],
       [2001, 'Nevada', 2.4, -1.5],
       [2002, 'Nevada', 2.9, -1.7]], dtype=object)

索引对象

Index对象是不可修改的

obj =Series(range(3),index=['a','b','c'])
index = obj.index
index[1:]
Index(['b', 'c'], dtype='object')
index = pd.Index(np.arange(3))
obj2 = Series([1.5,-2.5,0],index=index)
obj2.index is index
True

#又有个知识点
利用Python进行数据分析的学习笔记——chap5_第2张图片

'Ohio' in frame3.columns
2003 in frame3.index
False

#有个知识点
利用Python进行数据分析的学习笔记——chap5_第3张图片

重新索引

obj = Series([4.5,7.2,-5.3,3.6],index=['d','b','a','c'])
obj2 = obj.reindex(['a','b','c','d','e'])
obj2
a   -5.3
b    7.2
c    3.6
d    4.5
e    NaN
dtype: float64
obj.reindex(['a','b','c','d','e'],fill_value=0)
a   -5.3
b    7.2
c    3.6
d    4.5
e    0.0
dtype: float64
obj3 = Series(['blue','purple','yellow'],index=[0,2,4])
#ffill实现前向值填充
obj3.reindex(range(6),method='ffill')
#ffill或pad   前向填充(或搬运)值
#bfill或backfill  后向填充(或搬运)值
0      blue
1      blue
2    purple
3    purple
4    yellow
5    yellow
dtype: object
#reindex对DataFrame的修改
frame = DataFrame(np.arange(9).reshape((3,3)),index=['a','c','d'],columns=['Ohio','Texas','California'])
frame2 = frame.reindex(['a','b','c','d'])
frame2
Ohio Texas California
a 0.0 1.0 2.0
b NaN NaN NaN
c 3.0 4.0 5.0
d 6.0 7.0 8.0
#使用columns关键字可重新索引列
states = ['Texas','Utah','California']
frame.reindex(columns=states)
Texas Utah California
a 1 NaN 2
c 4 NaN 5
d 7 NaN 8
#同时对行和列进行重新索引,而插值只能按行应用(即轴0)
#frame.reindex(index=['a','b','c','d'],method='ffill',columns=states)会报错
frame.reindex(index=['a','b','c','d'],columns=states).ffill()
Texas Utah California
a 1.0 NaN 2.0
b 1.0 NaN 2.0
c 4.0 NaN 5.0
d 7.0 NaN 8.0
#利用ix(被淘汰了)换成loc,继续重新索引
#frame.loc[['a','b','c','d'],states]
#KeyError: "['b'] not in index"
frame = frame.reindex(['a','b','c','d'])
frame = frame.reindex(columns=states)
frame.loc[['a','b','c','d'],states]
#注意:frame.reindex不会就地修改原来的frame,而是返回一个新对象,因此需要像上面那样把结果重新赋值给frame。
#reindex参数copy默认是T(无论如何都复制),改为F是新旧相等就不复制。
Texas Utah California
a 1.0 NaN 2.0
b NaN NaN NaN
c 4.0 NaN 5.0
d 7.0 NaN 8.0

#又又又有个知识点
利用Python进行数据分析的学习笔记——chap5_第4张图片

丢弃指定轴上的项

obj = Series(np.arange(5.),index=['a','b','c','d','e'])
#删除指定值的新对象
new_obj = obj.drop('c')
new_obj
a    0.0
b    1.0
d    3.0
e    4.0
dtype: float64
#对于DataFrame
data =DataFrame(np.arange(16).reshape((4,4)),index=['Ohio','Colorado','Utah','New York'],columns=['one','two','three','four'])
data.drop(['Colorado','Ohio'])
one two three four
Utah 8 9 10 11
New York 12 13 14 15
#删除指定列
data.drop('two',axis=1)
one three four
Ohio 0 2 3
Colorado 4 6 7
Utah 8 10 11
New York 12 14 15

索引、选取和过滤

obj = Series(np.arange(4.),index=['a','b','c','d'])
obj['b']
#等价于
obj[1]
1.0
obj[obj<2]
a    0.0
b    1.0
dtype: float64
#利用标签的切片运算与普通的python切片运算不同,其末端是包含的
obj['b':'c']
b    1.0
c    2.0
dtype: float64
data = DataFrame(np.arange(16).reshape((4,4)),index=['Ohio','Colorado','Utah','New York'],columns=['one','two','three','four'])
data[['three','one']]
three one
Ohio 2 0
Colorado 6 4
Utah 10 8
New York 14 12
data[:2]
one two three four
Ohio 0 1 2 3
Colorado 4 5 6 7
#选取three大于5的所有行所有列
data[data['three']>5]
one two three four
Colorado 4 5 6 7
Utah 8 9 10 11
New York 12 13 14 15
data[data<5] = 0
data
one two three four
Ohio 0 0 0 0
Colorado 0 5 6 7
Utah 8 9 10 11
New York 12 13 14 15
data.loc['Colorado',['two','three']]
two      5
three    6
Name: Colorado, dtype: int32
#选取three中>5的前三列
#data.loc[data.three>5,:3]
#TypeError: cannot do slice indexing on Index with these indexers [3] of type int
data.loc[data.three>5,:'three']
one two three
Colorado 0 5 6
Utah 8 9 10
New York 12 13 14

#又又又又有个知识点
利用Python进行数据分析的学习笔记——chap5_第5张图片

算术运算和数据对齐

s1 = Series([7.3,-2.5,3.4,1.5],index=['a','c','d','e'])
s2 = Series([-2.1,3.6,-1.5,4,3.1],index=['a','c','e','f','g'])
s1+s2
a    5.2
c    1.1
d    NaN
e    0.0
f    NaN
g    NaN
dtype: float64
df1 = DataFrame(np.arange(9.).reshape((3,3)),columns=list('bcd'),index=['Ohio','Texas','Colorado'])
df2 = DataFrame(np.arange(12.).reshape((4,3)),columns=list('bde'),index=['Utah','Ohio','Texas','Oregon'])
df1+df2
b c d e
Colorado NaN NaN NaN NaN
Ohio 3.0 NaN 6.0 NaN
Oregon NaN NaN NaN NaN
Texas 9.0 NaN 12.0 NaN
Utah NaN NaN NaN NaN

在算术方法中填充值

add 加法
sub 减法
div 除法
mul 乘法

df1 = DataFrame(np.arange(12.).reshape((3,4)),columns=list('abcd'))
df2 = DataFrame(np.arange(20.).reshape((4,5)),columns=list('abcde'))
#当一个对象中某个轴标签在另一个对象中找不到时填充一个特殊值(比如0)
df1.add(df2,fill_value=0)
#类似地,对重新索引也可以指定一个填充值
df1.reindex(columns=df2.columns,fill_value=0)
a b c d e
0 0.0 1.0 2.0 3.0 0
1 4.0 5.0 6.0 7.0 0
2 8.0 9.0 10.0 11.0 0

DataFrame和Series之间的运算

#计算一个二维数组与其某行之间的差
arr = np.arange(12.).reshape((3,4))
#广播
arr-arr[0]
array([[0., 0., 0., 0.],
       [4., 4., 4., 4.],
       [8., 8., 8., 8.]])
frame = DataFrame(np.arange(12.).reshape((4,3)),columns=list('bde'),index=['Utah','Ohio','Texas','Oregon'])
series = frame.loc['Utah']#series = frame.ix[0]
frame-series
b d e
Utah 0.0 0.0 0.0
Ohio 3.0 3.0 3.0
Texas 6.0 6.0 6.0
Oregon 9.0 9.0 9.0
#若某个索引值找不到,则参与运算的两个对象就会被重新索引形成并集
series2 = Series(range(3),index=['b','e','f'])
frame+series2
b d e f
Utah 0.0 NaN 3.0 NaN
Ohio 3.0 NaN 6.0 NaN
Texas 6.0 NaN 9.0 NaN
Oregon 9.0 NaN 12.0 NaN
#匹配行在列上广播
series3 = frame['d']
frame.sub(series3,axis=0)
b d e
Utah -1.0 0.0 1.0
Ohio -1.0 0.0 1.0
Texas -1.0 0.0 1.0
Oregon -1.0 0.0 1.0

函数应用和映射

frame = DataFrame(np.random.randn(4,3),columns=list('bde'),index=['Utah','Ohio','Texas','Oregon'])
np.abs(frame)
f = lambda x:x.max()-x.min()
frame.apply(f)
frame.apply(f,axis=1)
Utah      2.341364
Ohio      1.889609
Texas     1.479165
Oregon    1.122892
dtype: float64
def f(x):
    """Return the minimum and maximum of ``x`` as a Series labelled 'min' and 'max'."""
    lowest = x.min()
    highest = x.max()
    return Series(data=[lowest, highest], index=['min', 'max'])
frame.apply(f)
b d e
min -2.121313 -0.231704 -0.309807
max 1.255592 0.670467 1.786023
#得到各个浮点值的格式化字符串
format = lambda x: '%.2f' % x
frame.applymap(format)
frame['e'].map(format)
Utah       1.79
Ohio      -0.31
Texas      1.42
Oregon     0.13
Name: e, dtype: object

排序和排名

obj = Series(range(4),index=['d','a','b','c'])
obj.sort_index()
a    1
b    2
c    3
d    0
dtype: int64
#升序排序
frame = DataFrame(np.arange(8).reshape((2,4)),index=['three','one'],columns=['d','a','b','c'])
frame.sort_index()
d a b c
one 4 5 6 7
three 0 1 2 3
frame.sort_index(axis=1)
a b c d
three 1 2 3 0
one 5 6 7 4
#降序排序
frame.sort_index(axis=1,ascending=False)
d c b a
three 0 3 2 1
one 4 7 6 5
#按值对Series进行排序:书中使用的是order方法
#order是pandas的方法,与Python版本无关;它在新版pandas中已被移除,改用sort_values()方法即可。
obj = Series([4,7,-3,2])
obj.sort_values()
2   -3
3    2
0    4
1    7
dtype: int64
#排序时,任何缺失值默认都会被放到Series的末尾
obj = Series([4,np.nan,7,np.nan,-3,2])
obj.sort_values()
4   -3.0
5    2.0
0    4.0
2    7.0
1    NaN
3    NaN
dtype: float64
#在DataFrame上,要根据一个或多个列中的值进行排序
frame = DataFrame({'b':[4,7,-3,2],'a':[0,1,0,1]})
#frame.sort_index(by='b')
frame.sort_values(by='b')
b a
2 -3 0
3 2 1
0 4 0
1 7 1
frame.sort_values(by=['a','b'])
b a
2 -3 0
0 4 0
3 2 1
1 7 1
obj = Series([7,-5,7,4,2,0,4])
obj.rank()
0    6.5
1    1.0
2    6.5
3    4.5
4    3.0
5    2.0
6    4.5
dtype: float64
#根据值在原数据中出现的顺序给出排名
obj.rank(method='first')
0    6.0
1    1.0
2    7.0
3    4.0
4    3.0
5    2.0
6    5.0
dtype: float64
#按降序排名
obj.rank(ascending=False,method='max')
0    2.0
1    7.0
2    2.0
3    4.0
4    5.0
5    6.0
6    4.0
dtype: float64

#又又又又又有个知识点
利用Python进行数据分析的学习笔记——chap5_第6张图片

带有重复值的轴索引

obj = Series(range(5),index=['a','a','b','b','c'])
#判断索引值是否唯一
obj.index.is_unique
False
df = DataFrame(np.random.randn(4,3),index=['a','a','b','b'])
df.loc['b']
0 1 2
b -0.994941 0.304769 0.930754
b 0.918218 0.577393 2.664499

汇总和计算描述统计

df = DataFrame([[1.4,np.nan],[7.1,-4.5],[np.nan,np.nan],[0.75,-1.3]],index=['a','b','c','d'],columns=['one','two'])
#按列求和
#默认skipna=True,即NA值会自动被排除
df.sum()
#按行求和
df.sum(axis=1)
a    1.40
b    2.60
c    0.00
d   -0.55
dtype: float64
df.mean(axis=1,skipna=False)
a      NaN
b    1.300
c      NaN
d   -0.275
dtype: float64

#又又又又又又有个知识点
利用Python进行数据分析的学习笔记——chap5_第7张图片

#返回达到最大值的索引
df.idxmax()
one    b
two    d
dtype: object
#累计型
df.cumsum()
one two
a 1.40 NaN
b 8.50 -4.5
c NaN NaN
d 9.25 -5.8
#一次性产生多个汇总统计
df.describe()
one two
count 3.000000 2.000000
mean 3.083333 -2.900000
std 3.493685 2.262742
min 0.750000 -4.500000
25% 1.075000 -3.700000
50% 1.400000 -2.900000
75% 4.250000 -2.100000
max 7.100000 -1.300000
#对于非数值型数据
obj = Series(['a','a','b','c']*4)
obj.describe()
count     16
unique     3
top        a
freq       8
dtype: object

#又又又又又又又有个知识点
利用Python进行数据分析的学习笔记——chap5_第8张图片
利用Python进行数据分析的学习笔记——chap5_第9张图片

相关系数与协方差

#import pandas.io.data as web
import pandas_datareader.data as web
#px=web.DataReader('F-F_Research_Data_factors','famafrench')
# Fetch Yahoo data for each ticker over the given date range, keyed by symbol.
all_data = {}
for ticker in ['AAPL','IBM','MSFT','GOOG']:
    all_data[ticker] = web.get_data_yahoo(ticker,'1/1/2000','1/1/2010')
# dict.iteritems() exists only in Python 2; items() is the Python 3 spelling.
# One column per ticker, taken from each downloaded frame.
price = DataFrame({tic:data['Adj Close'] for tic,data in all_data.items()})
volume = DataFrame({tic:data['Volume'] for tic,data in all_data.items()})

上面的在线下载代码运行时报错,因此下面改为直接读取本地保存的 pickle 数据。

price = pd.read_pickle('E:/python_study_files/python/pydata-notebook-master/examples/yahoo_price.pkl')
volume = pd.read_pickle('E:/python_study_files/python/pydata-notebook-master/examples/yahoo_volume.pkl')
price.head()
AAPL GOOG IBM MSFT
Date
2010-01-04 27.990226 313.062468 113.304536 25.884104
2010-01-05 28.038618 311.683844 111.935822 25.892466
2010-01-06 27.592626 303.826685 111.208683 25.733566
2010-01-07 27.541619 296.753749 110.823732 25.465944
2010-01-08 27.724725 300.709808 111.935822 25.641571
volume.head()
AAPL GOOG IBM MSFT
Date
2010-01-04 123432400 3927000 6155300 38409100
2010-01-05 150476200 6031900 6841400 49749600
2010-01-06 138040000 7987100 5605300 58182400
2010-01-07 119282800 12876600 5840600 50559700
2010-01-08 111902700 9483900 4197200 51197400
#计算价格的百分比变化
returns = price.pct_change()
returns.tail()
AAPL GOOG IBM MSFT
Date
2016-10-17 -0.000680 0.001837 0.002072 -0.003483
2016-10-18 -0.000681 0.019616 -0.026168 0.007690
2016-10-19 -0.002979 0.007846 0.003583 -0.002255
2016-10-20 -0.000512 -0.005652 0.001719 -0.004867
2016-10-21 -0.003930 0.003011 -0.012474 0.042096
#Series的corr方法用于计算两个Series中重叠的、非NA的、按索引对齐的值的相关系数。cov用于计算协方差
returns.MSFT.corr(returns.IBM)
0.49976361144151155
returns.MSFT.cov(returns.IBM)
8.870655479703546e-05
#对于DataFrame数据
returns.corr()
AAPL GOOG IBM MSFT
AAPL 1.000000 0.407919 0.386817 0.389695
GOOG 0.407919 1.000000 0.405099 0.465919
IBM 0.386817 0.405099 1.000000 0.499764
MSFT 0.389695 0.465919 0.499764 1.000000
returns.cov()
AAPL GOOG IBM MSFT
AAPL 0.000277 0.000107 0.000078 0.000095
GOOG 0.000107 0.000251 0.000078 0.000108
IBM 0.000078 0.000078 0.000146 0.000089
MSFT 0.000095 0.000108 0.000089 0.000215
#DataFrame的列或行与Series数据或DataFrame之间的相关系数
returns.corrwith(returns.IBM)
AAPL    0.386817
GOOG    0.405099
IBM     1.000000
MSFT    0.499764
dtype: float64
#传入一个DataFrame则会计算按列名配对的相关系数。若axis=1则是按行。
returns.corrwith(volume)
AAPL   -0.075565
GOOG   -0.007067
IBM    -0.204849
MSFT   -0.092950
dtype: float64

唯一值、值计数以及成员资格

isin 计算一个表示“Series各值是否包含于传入的值序列中”的布尔型数组;
unique 计算Series中的唯一值数组,按发现的顺序返回;
value_counts 返回一个Series,其索引为唯一值,其值为频率,按计数值降序排列

obj = Series(['c','a','d','a','a','b','b','c','c'])
#得到Series中唯一值数组
uniques = obj.unique()
uniques
array(['c', 'a', 'd', 'b'], dtype=object)
#各值出现的频率
obj.value_counts()
c    3
a    3
b    2
d    1
dtype: int64
# value_counts also works on a plain array once it is wrapped in a Series
# (the top-level pd.value_counts helper is deprecated in recent pandas);
# sort=False skips ordering the result by count.
Series(obj.values).value_counts(sort=False)
c    3
a    3
d    1
b    2
dtype: int64
#判断矢量化集合的成员资格
mask = obj.isin(['b','c'])
obj[mask]
0    c
5    b
6    b
7    c
8    c
dtype: object
# One column per question; count how often each answer value occurs in each column.
data = DataFrame({'Qu1':[1,3,4,3,4],'Qu2':[2,3,1,2,3],'Qu3':[1,5,2,4,4]})
# Series.value_counts is applied column by column (the top-level pd.value_counts
# helper is deprecated in recent pandas). Answer values missing from a column
# come back as NaN after alignment and are replaced by 0.
result = data.apply(pd.Series.value_counts).fillna(0)
result
Qu1 Qu2 Qu3
1 1.0 1.0 1.0
2 0.0 2.0 1.0
3 2.0 2.0 0.0
4 2.0 0.0 2.0
5 0.0 0.0 1.0

处理缺失数据

pandas对象上的所有描述统计都排除了缺失数据

string_data = Series(['aardvark','artichoke',np.nan,'avocado'])
string_data.isnull()
0    False
1    False
2     True
3    False
dtype: bool
#python内置的None值也会被当作NA处理
string_data[0] = None
string_data.isnull()
0     True
1    False
2     True
3    False
dtype: bool

#又又又又又又又又有个知识点
利用Python进行数据分析的学习笔记——chap5_第10张图片

滤除缺失值

from numpy import nan as NA
data = Series([1,NA,3.5,NA,7])
data.dropna()
0    1.0
2    3.5
4    7.0
dtype: float64
data[data.notnull()]
0    1.0
2    3.5
4    7.0
dtype: float64
#dropna默认丢弃任何含有缺失值的行
data = DataFrame([[1.,6.5,3.],[1.,NA,NA],[NA,NA,NA],[NA,6.5,3.]])
cleaned = data.dropna()
cleaned
0 1 2
0 1.0 6.5 3.0
#只丢弃全为NA的行
data.dropna(how='all')
0 1 2
0 1.0 6.5 3.0
1 1.0 NaN NaN
3 NaN 6.5 3.0
#丢弃列的操作
data[4] = NA
data.dropna(axis=1,how='all')
0 1 2
0 1.0 6.5 3.0
1 1.0 NaN NaN
2 NaN NaN NaN
3 NaN 6.5 3.0
#只留下一部分观测数据的操作
df = DataFrame(np.random.randn(7,3))
df.loc[:4,1] = NA;df.loc[:2,2] = NA
#thresh:非空元素最低数量。int型,默认为None。如果该行/列中,非空元素数量小于这个值,就删除该行/列。
df.dropna(thresh=3)
0 1 2
5 -1.991372 -1.644575 0.675400
6 1.718451 0.312742 -1.484959

填充缺失数据

df.fillna(0)
0 1 2
0 -0.292230 0.000000 0.000000
1 0.129826 0.000000 0.000000
2 -0.075307 0.000000 0.000000
3 0.280476 0.000000 -1.259970
4 -1.171738 0.000000 0.206481
5 -1.991372 -1.644575 0.675400
6 1.718451 0.312742 -1.484959
#使用字典,实现对不同的列填充不同的值
df.fillna({1:0.5,3:-1})
0 1 2
0 -0.292230 0.500000 NaN
1 0.129826 0.500000 NaN
2 -0.075307 0.500000 NaN
3 0.280476 0.500000 -1.259970
4 -1.171738 0.500000 0.206481
5 -1.991372 -1.644575 0.675400
6 1.718451 0.312742 -1.484959
#fillna默认返回新对象,但也可以对现有对象进行就地修改
#总是返回被填充对象的引用
_ = df.fillna(0,inplace=True)
df
0 1 2
0 -0.292230 0.000000 0.000000
1 0.129826 0.000000 0.000000
2 -0.075307 0.000000 0.000000
3 0.280476 0.000000 -1.259970
4 -1.171738 0.000000 0.206481
5 -1.991372 -1.644575 0.675400
6 1.718451 0.312742 -1.484959

# Build a 6x3 frame of random values, then blank out the tail of columns 1 and 2
# (label-based .loc slices run from the start label through the last row).
df = DataFrame(np.random.randn(6,3))
df.loc[2:,1] = NA
df.loc[4:,2] = NA
# Forward-fill, carrying each value down across at most 2 consecutive gaps.
# DataFrame.ffill replaces fillna(method='ffill'), which newer pandas deprecates.
df.ffill(limit=2)
0 1 2
0 -0.814015 -1.672914 -0.437364
1 0.294209 0.038563 -0.141332
2 -0.337091 0.038563 -0.041438
3 0.698458 0.038563 -0.750640
4 -0.369432 NaN -0.750640
5 -0.437763 NaN -0.750640
data = Series([1.,NA,3.5,NA,7])
data.fillna(data.mean())
0    1.000000
1    3.833333
2    3.500000
3    3.833333
4    7.000000
dtype: float64

#又又又又又又又又又有个知识点
利用Python进行数据分析的学习笔记——chap5_第11张图片

层次化索引

能使一个轴上拥有多个索引级别,能以低维度形式处理高维度数据

data = Series(np.random.randn(10),index=[['a','a','a','b','b','b','c','c','d','d'],[1,2,3,1,2,3,1,2,2,3]])
data
a  1    0.861096
   2    0.613551
   3    1.130427
b  1   -0.210724
   2    0.962846
   3    0.393051
c  1   -0.774183
   2    0.456655
d  2   -0.824490
   3    0.908530
dtype: float64
data.index
MultiIndex([('a', 1),
            ('a', 2),
            ('a', 3),
            ('b', 1),
            ('b', 2),
            ('b', 3),
            ('c', 1),
            ('c', 2),
            ('d', 2),
            ('d', 3)],
           )
#选取子集
data['b']
1   -0.210724
2    0.962846
3    0.393051
dtype: float64
data.loc[['b','d']]#不是data.loc['b','d']
b  1   -0.210724
   2    0.962846
   3    0.393051
d  2   -0.824490
   3    0.908530
dtype: float64
#在内层中进行选取
data[:,2]
a    0.613551
b    0.962846
c    0.456655
d   -0.824490
dtype: float64
#通过unstack将Series数据变为DataFrame数据
data.unstack()
1 2 3
a 0.861096 0.613551 1.130427
b -0.210724 0.962846 0.393051
c -0.774183 0.456655 NaN
d NaN -0.824490 0.908530
data.unstack().stack()
a  1    0.861096
   2    0.613551
   3    1.130427
b  1   -0.210724
   2    0.962846
   3    0.393051
c  1   -0.774183
   2    0.456655
d  2   -0.824490
   3    0.908530
dtype: float64
#对于DataFrame数据,每条轴都可以有分层索引。注意不要把索引名称(names)与轴标签混为一谈。
frame = DataFrame(np.arange(12).reshape((4,3)),index=[['a','a','b','b'],[1,2,1,2]],columns=[['Ohio','Ohio','Colorado'],['Green','Red','Green']])
frame.index.names = ['key1','key2']
frame.columns.names = ['state','color']
frame
state Ohio Colorado
color Green Red Green
key1 key2
a 1 0 1 2
2 3 4 5
b 1 6 7 8
2 9 10 11
frame['Ohio']
color Green Red
key1 key2
a 1 0 1
2 3 4
b 1 6 7
2 9 10
# Alternative construction: build the two-level column index explicitly.
# MultiIndex is not among the names imported at the top of these notes
# (only Series, DataFrame, pd and np are), so it is reached through pd
# to avoid a NameError.
pd.MultiIndex.from_arrays([['Ohio','Ohio','Colorado'],['Green','Red','Green']],names=['state','color'])

大概这个意思

重排分级顺序

#swaplevel接受两个级别编号或名称,并返回一个互换了级别的新对象
frame.swaplevel('key1','key2')
state Ohio Colorado
color Green Red Green
key2 key1
1 a 0 1 2
2 a 3 4 5
1 b 6 7 8
2 b 9 10 11
#根据单个级别中的值对数据进行排序
#frame.sortlevel(1)
# 'DataFrame' object has no attribute 'sortlevel':新版pandas已移除sortlevel,其直接替代写法是frame.sort_index(level=1)
frame.sort_values(axis=0,by='key2')
state Ohio Colorado
color Green Red Green
key1 key2
a 1 0 1 2
b 1 6 7 8
a 2 3 4 5
b 2 9 10 11
frame.swaplevel(0,1).sort_index()
state Ohio Colorado
color Green Red Green
key2 key1
1 a 0 1 2
b 6 7 8
2 a 3 4 5
b 9 10 11

在层次化索引的对象上,如果索引是按字典方式从外到内排序(即调用sort_index),数据选取操作的性能要好很多。

根据级别汇总统计

#frame.sum(level='key2')这种写法自pandas 1.3起弃用,并在pandas 2.0中移除,改用groupby(level=...)再聚合
frame.groupby(level='key2').sum()
state Ohio Colorado
color Green Red Green
key2
1 6 8 10
2 12 14 16
frame.groupby(level='color',axis=1).sum()
color Green Red
key1 key2
a 1 2 1
2 8 4
b 1 14 7
2 20 10

使用DataFrame的列

frame = DataFrame({'a':range(7),'b':range(7,0,-1),'c':['one','one','one','two','two','two','two'],'d':[0,1,2,0,1,2,3]})
#set_index可将一个或多个列转换为行索引,并创建一个新的DataFrame
frame2 = frame.set_index(['c','d'])
frame2
a b
c d
one 0 0 7
1 1 6
2 2 5
two 0 3 4
1 4 3
2 5 2
3 6 1
# 也可以将那些列保留下来
frame.set_index(['c','d'],drop=False)
#将层次化索引的级别转移到列里面
frame2.reset_index()
c d a b
0 one 0 0 7
1 one 1 1 6
2 one 2 2 5
3 two 0 3 4
4 two 1 4 3
5 two 2 5 2
6 two 3 6 1

其他有关pandas的话题

整数索引

#整数索引
ser = Series(np.arange(3.))
#ser[-1]会报错
#非整数索引
ser2 = Series(np.arange(3.),index=['a','b','c'])
ser2[-1]
2.0
#面向轴标签的索引
# df.loc的第一个参数是行标签,第二个参数为列标签(可选参数,默认为所有列标签),
# 两个参数既可以是列表也可以是单个字符,
# 如果两个参数都为列表则返回的是DataFrame,否则,则为Series。
ser.loc[:1]
0    0.0
1    1.0
dtype: float64
#可靠的、不考虑索引类型的、基于位置的索引
ser3 = Series(range(3),index=[-5,1,3])
#ser3.iget_value(2)被取代了
ser3.iat[2]
2
frame = DataFrame(np.arange(6).reshape(3,2),index=[2,0,1])
#frame.irow(0)被舍弃
frame.iloc[0]
#等价
frame.iloc[0,:]
0    0
1    1
Name: 2, dtype: int32

面板数据

可以用一个由DataFrame对象组成的字典或一个三维ndarray来创建Panel对象

price = pd.read_pickle('E:/python_study_files/python/pydata-notebook-master/examples/yahoo_price.pkl')
type(price)
pandas.core.frame.DataFrame
price.loc['6/1/2012']
AAPL     73.371509
GOOG    285.205295
IBM     168.989059
MSFT     25.262972
Name: 2012-06-01 00:00:00, dtype: float64
#用堆积式的DataFrame方法呈现面板数据
stacked=price.loc['6/1/2012'].to_frame()
stacked
2012-06-01
AAPL 73.371509
GOOG 285.205295
IBM 168.989059
MSFT 25.262972
pd.Panel(stacked)
#Panel自pandas 0.25起已被移除;三维数据可改用带层次化索引(MultiIndex)的DataFrame来表示。
AttributeError: module 'pandas' has no attribute 'Panel'

你可能感兴趣的:(笔记,python,开发语言,后端)