DataFrame 是一个表格型的数据结构,含有一组有序的列。DataFrame 可以被看作是由 Series 组成的字典,并且这些 Series 共用一个索引。
创建方式:
pd.DataFrame({'one':[1,2,3,4], 'two':[5,4,3,2]})
pd.DataFrame({'one':pd.Series([1,2,3], index=['a','b','c']),
'two':pd.Series([1,2,3,4],index=['a','b','c','d'])})
--- 读写文件 ---
pd.read_csv('filename.csv')
df.to_csv()
In [1]: import pandas as pd
In [2]: pd.DataFrame({'one':[1,2,3,4], 'two':[5,4,3,2]})
Out[2]:
one two
0 1 5
1 2 4
2 3 3
3 4 2
(以下四行是 test.csv 的文件内容,供后面的 pd.read_csv('test.csv') 读取)
a,b,c
1,2,3
2,4,6
3,6,9
In [3]: pd.DataFrame({'one':[1,2,3,4], 'two':[5,4,3,2]},
index=['a','b','c','d'])
Out[3]:
one two
a 1 5
b 2 4
c 3 3
d 4 2
In [4]: pd.DataFrame({'one':pd.Series([1,2,3], index=['a','b','c']),
'two':pd.Series([1,2,3,4],index=['a','b','c','d'])})
Out[4]:
one two
a 1.0 1
b 2.0 2
c 3.0 3
d NaN 4
In [5]: # 将两个列按标签自动对齐
In [6]: !vi test.csv
In [8]: pd.read_csv('test.csv')
Out[8]:
a b c
0 1 2 3
1 2 4 6
2 3 6 9
In [9]: df = _4
In [10]: df
Out[10]:
one two
a 1.0 1
b 2.0 2
c 3.0 3
d NaN 4
In [11]: df.to_csv('test_save.csv')
In [12]: !cat test_save.csv
,one,two
a,1.0,1
b,2.0,2
c,3.0,3
d,,4
In [1]: import pandas as pd
In [2]: df = pd.DataFrame({'one':pd.Series([1,2,3], index=['a','b','c']),
'two':pd.Series([1,2,3,4],index=['a','b','c','d'])})
In [3]: df.index
Out[3]: Index(['a', 'b', 'c', 'd'], dtype='object')
In [4]: df.values
Out[4]:
array([[ 1., 1.],
[ 2., 2.],
[ 3., 3.],
[ nan, 4.]])
In [5]: # 和 Series 不同,df.values 返回的是二维数组
In [6]: df
Out[6]:
one two
a 1.0 1
b 2.0 2
c 3.0 3
d NaN 4
In [7]: df.columns
Out[7]: Index(['one', 'two'], dtype='object')
In [8]: # 获取列索引
In [9]: df.T
Out[9]:
a b c d
one 1.0 2.0 3.0 NaN
two 1.0 2.0 3.0 4.0
In [10]: # 转置 行变成列, 列变成行
In [11]: # 不论在 numpy 中还是 pandas 中,同一列数据的类型必须全部相同
In [12]: # 因为 NaN 是一个特殊的浮点数,所以 同列数据都被自动转成浮点数
In [13]: # 同理,转置后同列数据类型也要相同,就造成了现在的情形:所有的列都变成了
...: 浮点数
In [14]: df.describe()
.../numpy/lib/function_base.py:3834: RuntimeWarning: Invalid value encountered in percentile
RuntimeWarning)
Out[14]:
one two
count 3.0 4.000000
mean 2.0 2.500000
std 1.0 1.290994
min 1.0 1.000000
25% NaN 1.750000
50% NaN 2.500000
75% NaN 3.250000
max 3.0 4.000000
In [15]:
In [15]: df['one']['a']
Out[15]: 1.0
In [16]: # 这个索引就要先选择 列 , 后选择 行。
In [17]: df.loc['a','one']
Out[17]: 1.0
In [18]: # loc 说明使用 标签索引 [ 行, 列] 这个和 numpy 就是一样的了
In [19]: df.loc['a',:]
Out[19]:
one 1.0
two 1.0
Name: a, dtype: float64
In [20]: df.loc['a',]
Out[20]:
one 1.0
two 1.0
Name: a, dtype: float64
In [21]: # 获取一行的值
In [22]: df.loc[['a','c'],]
Out[22]:
one two
a 1.0 1
c 3.0 3
In [23]: # 花式索引
In [24]: df.loc[['a','c'],'two']
Out[24]:
a 1
c 3
Name: two, dtype: int64
In [25]: # 只返回 two 的那一列
In [26]:
In [26]: df
Out[26]:
one two
a 1.0 1
b 2.0 2
c 3.0 3
d NaN 4
In [27]: df1 = pd.DataFrame({'two':[1,2,3,4], 'one':[4,5,6,7]},
index=['c','d','b','a'])
In [28]: df1
Out[28]:
one two
c 4 1
d 5 2
b 6 3
a 7 4
In [29]: df+df1
Out[29]:
one two
a 8.0 5
b 8.0 5
c 7.0 4
d NaN 6
In [30]: # 数据对齐之后再计算
In [32]: df.fillna(0)
Out[32]:
one two
a 1.0 1
b 2.0 2
c 3.0 3
d 0.0 4
In [33]: # 将 NaN 填充成0
In [34]: df2 = _26
In [35]: df2
Out[35]:
one two
a 1.0 1
b 2.0 2
c 3.0 3
d NaN 4
In [36]: df2.dropna()
Out[36]:
one two
a 1.0 1
b 2.0 2
c 3.0 3
In [37]: # dropna 若行中有一个缺失值就将一整行删除掉
In [38]: df2
Out[38]:
one two
a 1.0 1
b 2.0 2
c 3.0 3
d NaN 4
In [39]: import numpy as np
In [40]: df2.loc['d','two'] = np.nan
In [41]: df2
Out[41]:
one two
a 1.0 1.0
b 2.0 2.0
c 3.0 3.0
d NaN NaN
In [42]: df2.loc['c','two'] = np.nan
In [43]: df2
Out[43]:
one two
a 1.0 1.0
b 2.0 2.0
c 3.0 NaN
d NaN NaN
In [44]: df2.dropna(how='all')
Out[44]:
one two
a 1.0 1.0
b 2.0 2.0
c 3.0 NaN
In [45]: # dropna 的 how 参数设置为字符串 'all',意思是当一行全部都为 NaN 的时候才删除该行。默认的 how 参数值为 'any',意思是只要一行中含有 NaN 就将该行整行删除(axis 默认为 0,即按行处理)
In [46]: df2
Out[46]:
one two
a 1.0 1.0
b 2.0 2.0
c 3.0 NaN
d NaN NaN
In [47]: df1
Out[47]:
one two
c 4 1
d 5 2
b 6 3
a 7 4
In [48]: df3 = _
In [50]: df3
Out[50]:
one two
c 4 1
d 5 2
b 6 3
a 7 4
In [51]: df3.loc['c','one']=np.nan
In [52]: df3
Out[52]:
one two
c NaN 1
d 5.0 2
b 6.0 3
a 7.0 4
In [53]: df.dropna(axis=1)
Out[53]:
Empty DataFrame
Columns: []
Index: [a, b, c, d]
In [54]: df3.dropna(axis=1)
Out[54]:
two
c 1
d 2
b 3
a 4
In [55]: df
Out[55]:
one two
a 1.0 1.0
b 2.0 2.0
c 3.0 NaN
d NaN NaN
In [56]: # dropna:注意 df2 = _26 与 df 指向同一个对象,前面对 df2 的赋值同样修改了 df,所以 df 的两列都含有 NaN,df.dropna(axis=1) 的结果为空
In [57]: # dropna 的参数 axis 默认是 0,表示按行处理(删除含有 NaN 的行);设置为 1 表示按列处理(删除含有 NaN 的列)
In [58]: