# 注意:通过索引(切片)取得的是原数据的视图,而不是副本
1.导包
import pandas as pd
from pandas import Series,DataFrame
import numpy as np
2.Series(数据类型)
2.1 创建Series
obj = Series([4,7,-5,3])
obj.values
array([ 4, 7, -5, 3], dtype=int64)
obj.index
RangeIndex(start=0, stop=4, step=1)
obj2 = Series([4,7,-5,3],index=['d','b','a','c'])
sdata ={'Ohio':35000,'Texas':71000,'Oregon':16000,'Utah':5000}
obj3 = Series(sdata)
2.2 取值
obj2['a']
obj2[['a','b']]
'b' in obj2
pd.isnull(obj4)
pd.notnull(obj4)
3.DataFrame(数据类型)
3.1 定义
DataFrame 是一个表格型的数据结构,既有行索引也有列索引。各列的数据类型可以不相同,但所有列共用同一个行索引。
3.2 创建
data = {'state':['Ohio','Ohio','Ohio','Nevada','Nevada'],
'year':[2000,2001,2002,2001,2002],
'pop':[1.5,1.7,3.6,2.4,2.9]}
frame = DataFrame(data)
frame_test = DataFrame(data,index=['a','b','c','d','e'])
3.3 简单的增删改查
frame['state']
frame.state
frame['debt'] = np.arange(5)
frame['debt'] = 16.5
val = Series([2,3],index=[1,2])
frame.loc[1:2,'year'] = val
del frame['year']
3.4 索引对象(暂时不讲)
4.基本功能
4.1 重新索引
功能:pandas中的reindex方法可以为series和dataframe添加或者删除索引。
方法:Series.reindex()、DataFrame.reindex()
如果新添加的索引在原对象的索引中不存在,则对应的值默认为NaN。如果减少索引,就相当于只选取保留下来的那些索引对应的数据。
obj = Series([4.5,7.2,-5.3,3.6],index=['d','b','a','c'])
d 4.5
b 7.2
a -5.3
c 3.6
obj2 = obj.reindex(['a','b','c','d','e'],fill_value=0)
obj3 = Series(['blue','purple','yellow'],index=[0,2,4])
obj3.reindex(range(6),method='ffill')
0 blue
1 blue
2 purple
3 purple
4 yellow
5 yellow
dtype: object
frame = DataFrame(np.arange(9).reshape(3,3),index=['a','c','d'],
columns=['Ohio','Texas','California'])
frame.reindex(columns=['Texas','Utah','California'])
method参数选项 |
说明 |
ffill或pad |
前向填充(或搬运)值 |
bfill或backfill |
后向填充(或搬运)值 |
reindex函数的参数
参数 |
说明 |
index |
行索引 |
columns |
列索引 |
method |
插值(填充)方式 |
fill_value |
缺失值的替代值 |
limit |
前向或后向填充时的最大填充量 |
copy |
默认为True.无论如何都复制.如果为False,则新旧相等就不复制 |
frame.drop(['Ohio','Texas'],axis=1)
4.2 选取值
data = DataFrame(np.arange(16).reshape(4,4),
index=['Ohio','Colorado','Utah','New York'],
columns=['one','two','three','four'])
data
one two three four
Ohio 0 1 2 3
Colorado 4 5 6 7
Utah 8 9 10 11
New York 12 13 14 15
data = DataFrame(np.arange(16).reshape(4,4),
index=['Ohio','Colorado','Utah','New York'],
columns=['one','two','three','four'])
data
|
one |
two |
three |
four |
Ohio |
0 |
1 |
2 |
3 |
Colorado |
4 |
5 |
6 |
7 |
Utah |
8 |
9 |
10 |
11 |
New York |
12 |
13 |
14 |
15 |
data.loc[['Colorado','Utah'],['two','three']]
|
two |
three |
Colorado |
5 |
6 |
Utah |
9 |
10 |
data.ix[['Colorado','Utah'],['two','three']]
D:\anaconda\lib\site-packages\ipykernel_launcher.py:1: DeprecationWarning:
.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing
See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated
"""Entry point for launching an IPython kernel.
|
two |
three |
Colorado |
5 |
6 |
Utah |
9 |
10 |
DataFrame 索引选项
类型 |
说明 |
obj[val] |
选取DataFrame的一组列. |
obj.loc[val] |
按标签选取单行或一组行(.ix 已弃用,请改用 .loc / .iloc) |
obj.reindex(index=val1,columns=val2) |
按新的行、列索引同时选取(重排)多行或多列 |
obj.loc[val1,val2] |
同时选行和列 |
4.4 算术运算和数据对齐
df1 = DataFrame(np.arange(12).reshape(3,4),columns=list('abcd'))
df1
|
a |
b |
c |
d |
0 |
0 |
1 |
2 |
3 |
1 |
4 |
5 |
6 |
7 |
2 |
8 |
9 |
10 |
11 |
df2 = DataFrame(np.arange(20).reshape(4,5),columns=list('abcde'))
df2
|
a |
b |
c |
d |
e |
0 |
0 |
1 |
2 |
3 |
4 |
1 |
5 |
6 |
7 |
8 |
9 |
2 |
10 |
11 |
12 |
13 |
14 |
3 |
15 |
16 |
17 |
18 |
19 |
df1 + df2
|
a |
b |
c |
d |
e |
0 |
0.0 |
2.0 |
4.0 |
6.0 |
NaN |
1 |
9.0 |
11.0 |
13.0 |
15.0 |
NaN |
2 |
18.0 |
20.0 |
22.0 |
24.0 |
NaN |
3 |
NaN |
NaN |
NaN |
NaN |
NaN |
df1.add(df2,fill_value=0)
|
a |
b |
c |
d |
e |
0 |
0.0 |
2.0 |
4.0 |
6.0 |
4.0 |
1 |
9.0 |
11.0 |
13.0 |
15.0 |
9.0 |
2 |
18.0 |
20.0 |
22.0 |
24.0 |
14.0 |
3 |
15.0 |
16.0 |
17.0 |
18.0 |
19.0 |
4.5 DataFrame和Series之间的运算
frame = DataFrame(np.arange(12).reshape(4,3),columns=list('bde'),
index=['Utah','Ohio','Texas','Oregon'])
frame
|
b |
d |
e |
Utah |
0 |
1 |
2 |
Ohio |
3 |
4 |
5 |
Texas |
6 |
7 |
8 |
Oregon |
9 |
10 |
11 |
frame.loc['Utah']
b 0
d 1
e 2
Name: Utah, dtype: int32
frame.sub(frame.loc['Utah'],axis=1)
|
b |
d |
e |
Utah |
0 |
0 |
0 |
Ohio |
3 |
3 |
3 |
Texas |
6 |
6 |
6 |
Oregon |
9 |
9 |
9 |
frame.apply(lambda x:x.max()-x.min())
b 9
d 9
e 9
dtype: int64
frame.apply(lambda x:x.max()-x.min(),axis=1)
Utah 2
Ohio 2
Texas 2
Oregon 2
dtype: int64
frame.applymap(lambda x:'%.2f' %x)
|
b |
d |
e |
Utah |
0.00 |
1.00 |
2.00 |
Ohio |
3.00 |
4.00 |
5.00 |
Texas |
6.00 |
7.00 |
8.00 |
Oregon |
9.00 |
10.00 |
11.00 |
frame.sort_values(by='b',ascending=False)
|
b |
d |
e |
Oregon |
9 |
10 |
11 |
Texas |
6 |
7 |
8 |
Ohio |
3 |
4 |
5 |
Utah |
0 |
1 |
2 |
frame.sort_values(by='Utah',axis=1,ascending=False)
|
e |
d |
b |
Utah |
2 |
1 |
0 |
Ohio |
5 |
4 |
3 |
Texas |
8 |
7 |
6 |
Oregon |
11 |
10 |
9 |
frame=pd.DataFrame({'b':[5,7,-3,2],'a':[0,1,0,1],'c':[-2,5,8,-3]})
frame
|
b |
a |
c |
0 |
5 |
0 |
-2 |
1 |
7 |
1 |
5 |
2 |
-3 |
0 |
8 |
3 |
2 |
1 |
-3 |
frame.rank(axis=0)
|
b |
a |
c |
0 |
3.0 |
1.5 |
2.0 |
1 |
4.0 |
3.5 |
3.0 |
2 |
1.0 |
1.5 |
4.0 |
3 |
2.0 |
3.5 |
1.0 |
frame.rank(axis=1)
|
b |
a |
c |
0 |
3.0 |
2.0 |
1.0 |
1 |
3.0 |
1.0 |
2.0 |
2 |
1.0 |
2.0 |
3.0 |
3 |
3.0 |
2.0 |
1.0 |
排名时用于破坏平级关系的method选项(如果两个值相等,则它们属于同一个分组)
method |
说明 |
average |
默认:在同组中,为各个值分配平均排名 |
min |
使用整个分组的最小排名 |
max |
使用整个分组的最大排名 |
first |
按值出现的先后顺序 |
4.6 带着重复值的轴索引
df = DataFrame(np.random.randn(4,3),index=['a','a','b','b'])
df
|
0 |
1 |
2 |
a |
-0.328041 |
1.907503 |
-0.896179 |
a |
-0.049003 |
-0.597808 |
0.744535 |
b |
-1.142489 |
-0.846847 |
-0.140037 |
b |
-1.154353 |
-0.947474 |
-1.460308 |
df.index.is_unique
False
df.loc['a']
|
0 |
1 |
2 |
a |
-0.328041 |
1.907503 |
-0.896179 |
a |
-0.049003 |
-0.597808 |
0.744535 |
df.iloc[1]
0 -0.049003
1 -0.597808
2 0.744535
Name: a, dtype: float64
4.7汇总和计算描述统计
df = DataFrame([[1.4,np.nan],[7.1,-4.5],[np.nan,np.nan],[0.75,-1.3]],index=['a','b','c','d'],columns=['one','two'])
df
|
one |
two |
a |
1.40 |
NaN |
b |
7.10 |
-4.5 |
c |
NaN |
NaN |
d |
0.75 |
-1.3 |
df.sum()
one 9.25
two -5.80
dtype: float64
df.sum(skipna=False)
one NaN
two NaN
dtype: float64
参数 |
说明 |
axis |
轴 |
skipna |
是否忽略缺失值,默认为True |
描述和汇总统计
函数 |
说明 |
count |
非NA值的数量 |
describe |
列的计算汇总统计 |
min,max |
最小值,最大值 |
argmin,argmax |
计算能获取到最小值和最大值的索引位置(整数) |
idxmin,idxmax |
计算能获取到最小值和最大值的索引值(标签) |
quantile |
计算样本的分位数(0到1) |
sum |
和 |
mean |
平均数 |
median |
值的算术中位数(50%分位数) |
mad |
根据平均值计算平均绝对离差 |
var |
方差 |
std |
标准差 |
skew |
偏度(三阶矩) |
kurt |
峰度(四阶矩) |
cumsum |
累计和 |
cummin,cummax |
累计最小值和累计最大值 |
cumprod |
累计积 |
diff |
计算一阶差分(对时间序列有用) |
pct_change |
计算百分数变化 |
obj = Series(['c','a','d','a','b','b','c','c'])
obj
0 c
1 a
2 d
3 a
4 b
5 b
6 c
7 c
dtype: object
obj.unique()
array(['c', 'a', 'd', 'b'], dtype=object)
obj.value_counts()
c 3
a 2
b 2
d 1
dtype: int64
obj.isin(['b','c'])
0 True
1 False
2 False
3 False
4 True
5 True
6 True
7 True
dtype: bool
5.缺失值处理
NA的处理方法
方法 |
说明 |
dropna |
根据各标签的值中是否存在缺失数据对轴标签进行过滤,可通过阈值调节对缺失值的容忍度 |
fillna |
用指定值或插值方法(ffill或bfill)填充缺失数据 |
isnull |
返回一个含有布尔值的对象,这些布尔值表示哪些值是缺失值NA |
notnull |
isnull的否定式 |
5.1 滤除缺失数据
from numpy import nan as NA
data = DataFrame([[1,6.5,3],[1,NA,NA],[NA,NA,NA],[NA,6.5,3]])
data
|
0 |
1 |
2 |
0 |
1.0 |
6.5 |
3.0 |
1 |
1.0 |
NaN |
NaN |
2 |
NaN |
NaN |
NaN |
3 |
NaN |
6.5 |
3.0 |
data.dropna()
data.dropna(how='all')
|
0 |
1 |
2 |
0 |
1.0 |
6.5 |
3.0 |
1 |
1.0 |
NaN |
NaN |
3 |
NaN |
6.5 |
3.0 |
5.2 填充数据
df = DataFrame(np.random.randn(7,3))
df.iloc[:5,1] =NA;df.iloc[:3,2]=NA
df
|
0 |
1 |
2 |
0 |
1.734150 |
NaN |
NaN |
1 |
0.706558 |
NaN |
NaN |
2 |
-0.393785 |
NaN |
NaN |
3 |
1.317984 |
NaN |
-2.853658 |
4 |
0.415997 |
NaN |
-0.125980 |
5 |
-0.154539 |
0.557347 |
1.042196 |
6 |
0.385543 |
-0.960659 |
-0.946296 |
df.fillna(0)
|
0 |
1 |
2 |
0 |
0.168608 |
0.000000 |
0.000000 |
1 |
0.012919 |
0.000000 |
0.000000 |
2 |
1.221231 |
0.000000 |
0.000000 |
3 |
-1.515374 |
0.000000 |
0.774739 |
4 |
0.081386 |
0.000000 |
-0.327303 |
5 |
-0.193886 |
1.570973 |
1.057608 |
6 |
0.131576 |
0.623344 |
0.392884 |
df.fillna({1:0.5,2:-1})
|
0 |
1 |
2 |
0 |
0.210955 |
0.500000 |
-1.000000 |
1 |
0.666389 |
0.500000 |
-1.000000 |
2 |
0.488486 |
0.500000 |
-1.000000 |
3 |
2.166515 |
0.500000 |
0.736014 |
4 |
-0.702573 |
0.500000 |
-1.150182 |
5 |
0.310545 |
0.024061 |
-0.506138 |
6 |
0.729156 |
0.368345 |
1.721337 |
_ = df.fillna({1:0.5,2:-1},inplace=True)
df
|
0 |
1 |
2 |
0 |
1.734150 |
0.500000 |
-1.000000 |
1 |
0.706558 |
0.500000 |
-1.000000 |
2 |
-0.393785 |
0.500000 |
-1.000000 |
3 |
1.317984 |
0.500000 |
-2.853658 |
4 |
0.415997 |
0.500000 |
-0.125980 |
5 |
-0.154539 |
0.557347 |
1.042196 |
6 |
0.385543 |
-0.960659 |
-0.946296 |
6.层次化索引
data = Series(np.random.randn(10),index=[['a','a','a','b','b','b','c','c','d','d'],[1,2,3,1,2,3,1,2,2,3]])
data
a 1 -1.076458
2 1.126883
3 0.628407
b 1 1.704866
2 2.212012
3 -2.006192
c 1 0.153430
2 -1.172675
d 2 -2.626143
3 -2.760771
dtype: float64
data['b']
1 1.704866
2 2.212012
3 -2.006192
dtype: float64
data['b',2]
2.212012376812743
data.unstack()
|
1 |
2 |
3 |
a |
-1.076458 |
1.126883 |
0.628407 |
b |
1.704866 |
2.212012 |
-2.006192 |
c |
0.153430 |
-1.172675 |
NaN |
d |
NaN |
-2.626143 |
-2.760771 |
frame = DataFrame(np.arange(12).reshape(4,3),
index=[['a','a','b','b'],[1,2,1,2]],
columns=[['Ohio','Ohio','Colorado'],['Green','Red','Green']])
frame
|
Ohio |
Colorado |
|
Green |
Red |
Green |
a |
1 |
0 |
1 |
2 |
2 |
3 |
4 |
5 |
b |
1 |
6 |
7 |
8 |
2 |
9 |
10 |
11 |
frame = DataFrame({'a':range(7),'b':range(7,0,-1),
'c':['one','one','one','two','two','two','two'],
'd':[0,1,2,0,1,2,3]})
frame
|
a |
b |
c |
d |
0 |
0 |
7 |
one |
0 |
1 |
1 |
6 |
one |
1 |
2 |
2 |
5 |
one |
2 |
3 |
3 |
4 |
two |
0 |
4 |
4 |
3 |
two |
1 |
5 |
5 |
2 |
two |
2 |
6 |
6 |
1 |
two |
3 |
frame2 = frame.set_index(['c','d'])
frame2
|
a |
b |
c |
d |
|
one |
0 |
0 |
7 |
1 |
1 |
6 |
2 |
2 |
5 |
two |
0 |
3 |
4 |
1 |
4 |
3 |
2 |
5 |
2 |
3 |
6 |
1 |
frame2 = frame.set_index(['c','d'],drop=False)
frame2
|
|
a |
b |
c |
d |
c |
d |
|
one |
0 |
0 |
7 |
one |
0 |
1 |
1 |
6 |
one |
1 |
2 |
2 |
5 |
one |
2 |
two |
0 |
3 |
4 |
two |
0 |
1 |
4 |
3 |
two |
1 |
2 |
5 |
2 |
two |
2 |
3 |
6 |
1 |
two |
3 |