>>> #--------------------2018/08/09--------------------
>>> #---------------- pandas教程[1]DataFrame入门--------
>>> import numpy as np
>>> import pandas as pd
d
>>> dates = pd.date_range('20140729',periods=6)
>>> dates
DatetimeIndex(['2014-07-29', '2014-07-30', '2014-07-31', '2014-08-01',
'2014-08-02', '2014-08-03'],
dtype='datetime64[ns]', freq='D')
>>> df=pd.DataFrame(np.random.randn(6,4),index=dates,columns=list('ABCD'))
>>> df
A B C D
2014-07-29 1.338334 0.519005 0.624861 0.842467
2014-07-30 0.601006 1.424606 0.162684 -0.130496
2014-07-31 1.222067 0.255025 0.942523 0.583058
2014-08-01 1.312611 0.271335 -0.015009 0.398183
2014-08-02 0.031093 -1.362878 -0.234253 -0.774762
2014-08-03 -0.649002 -1.055498 1.195584 0.066292
>>> #使用字典来创建数据框,例如创建一个列名为A的数据框,索引是自动创建的整数
>>> df2=pd.DataFrame({'A':np.random.randn(6),})
>>> df2
A
0 -1.286018
1 0.214674
2 0.234867
3 -0.367041
4 0.063977
5 0.366060
>>> #randn函数用于生成服从标准正态分布的随机数
>>> df3=pd.
SyntaxError: invalid syntax
>>> df3=pd.DataFrame({'A':pd.Timestamp('20140729'),'B':pd.Series(1),})
>>> df3
A B
0 2014-07-29 1
>>> #假如字典内的数据长度不同,标量(如A列的Timestamp)会被广播到与最长的数据等长,比如B列有4行,A列也会扩展为4行
>>> df4=pd.DataFrame({'A':pd.Timestamp('20140729'),'B':pd.Series(1,index=list(range(4))),})
>>> df4
A B
0 2014-07-29 1
1 2014-07-29 1
2 2014-07-29 1
3 2014-07-29 1
>>> df4.dtypes
A datetime64[ns]
B int64
dtype: object
>>> #使用dtypes来查看各列的数据类型
>>> #使用head查看前几行数据(默认是前5行),不过你可以指定前几行
>>> df.head()
A B C D
2014-07-29 1.338334 0.519005 0.624861 0.842467
2014-07-30 0.601006 1.424606 0.162684 -0.130496
2014-07-31 1.222067 0.255025 0.942523 0.583058
2014-08-01 1.312611 0.271335 -0.015009 0.398183
2014-08-02 0.031093 -1.362878 -0.234253 -0.774762
>>> df.head(3)
A B C D
2014-07-29 1.338334 0.519005 0.624861 0.842467
2014-07-30 0.601006 1.424606 0.162684 -0.130496
2014-07-31 1.222067 0.255025 0.942523 0.583058
>>> #使用tail查看后几行数据(默认是后5行),同样可以指定行数
>>> df.tail(2)
A B C D
2014-08-02 0.031093 -1.362878 -0.234253 -0.774762
2014-08-03 -0.649002 -1.055498 1.195584 0.066292
>>> df.tail(5)
A B C D
2014-07-30 0.601006 1.424606 0.162684 -0.130496
2014-07-31 1.222067 0.255025 0.942523 0.583058
2014-08-01 1.312611 0.271335 -0.015009 0.398183
2014-08-02 0.031093 -1.362878 -0.234253 -0.774762
2014-08-03 -0.649002 -1.055498 1.195584 0.066292
>>> #查看数据框的索引
>>> df.index
DatetimeIndex(['2014-07-29', '2014-07-30', '2014-07-31', '2014-08-01',
'2014-08-02', '2014-08-03'],
dtype='datetime64[ns]', freq='D')
>>> #查看列名用columns
>>> df.columns
Index(['A', 'B', 'C', 'D'], dtype='object')
>>> #查看数据值,用values
>>> df.values
array([[ 1.33833432, 0.51900504, 0.62486112, 0.84246746],
[ 0.60100588, 1.42460625, 0.16268358, -0.13049649],
[ 1.2220671 , 0.25502473, 0.94252298, 0.58305822],
[ 1.31261091, 0.27133471, -0.01500863, 0.39818312],
[ 0.03109322, -1.36287849, -0.23425266, -0.77476246],
[-0.64900233, -1.05549823, 1.19558433, 0.06629241]])
>>> #查看描述性统计,用describe
>>> df.describe()
A B C D
count 6.000000 6.000000 6.000000 6.000000
mean 0.642685 0.008599 0.446065 0.164124
std 0.813959 1.040058 0.564968 0.577636
min -0.649002 -1.362878 -0.234253 -0.774762
25% 0.173571 -0.727867 0.029414 -0.081299
50% 0.911536 0.263180 0.393772 0.232238
75% 1.289975 0.457087 0.863108 0.536839
max 1.338334 1.424606 1.195584 0.842467
>>> #使用type看一下输出的描述性统计是什么样的数据类型——DataFrame数据
>>> type(df.describe())
<class 'pandas.core.frame.DataFrame'>
>>> #使用T来转置数据,也就是行列转换
>>> df.T
2014-07-29 2014-07-30 ... 2014-08-02 2014-08-03
A 1.338334 0.601006 ... 0.031093 -0.649002
B 0.519005 1.424606 ... -1.362878 -1.055498
C 0.624861 0.162684 ... -0.234253 1.195584
D 0.842467 -0.130496 ... -0.774762 0.066292
[4 rows x 6 columns]
>>> df.T
2014-07-29 2014-07-30 ... 2014-08-02 2014-08-03
A 1.338334 0.601006 ... 0.031093 -0.649002
B 0.519005 1.424606 ... -1.362878 -1.055498
C 0.624861 0.162684 ... -0.234253 1.195584
D 0.842467 -0.130496 ... -0.774762 0.066292
[4 rows x 6 columns]
>>> #对数据进行排序,旧版pandas用sort,新版已移除该方法,应改用sort_values,参数by可以指定根据哪一列数据进行排序。
>>> df.sort(columns='C')
Traceback (most recent call last):
File "", line 1, in
df.sort(columns='C')
File "C:\Users\Administrator\AppData\Local\Programs\Python\Python36-32\lib\site-packages\pandas\core\generic.py", line 4376, in __getattr__
return object.__getattribute__(self, name)
AttributeError: 'DataFrame' object has no attribute 'sort'
>>> df.sort_values(by='C')
A B C D
2014-08-02 0.031093 -1.362878 -0.234253 -0.774762
2014-08-01 1.312611 0.271335 -0.015009 0.398183
2014-07-30 0.601006 1.424606 0.162684 -0.130496
2014-07-29 1.338334 0.519005 0.624861 0.842467
2014-07-31 1.222067 0.255025 0.942523 0.583058
2014-08-03 -0.649002 -1.055498 1.195584 0.066292
>>> #------------ pandas教程[2]DataFrame选择数据-1--------------------
>>> #上一篇文章介绍了如何创建和查看DataFrame数据,这篇文章讲一下如何选择DataFrame中的数据,还是用例子来说明问题。
>>> df
A B C D
2014-07-29 1.338334 0.519005 0.624861 0.842467
2014-07-30 0.601006 1.424606 0.162684 -0.130496
2014-07-31 1.222067 0.255025 0.942523 0.583058
2014-08-01 1.312611 0.271335 -0.015009 0.398183
2014-08-02 0.031093 -1.362878 -0.234253 -0.774762
2014-08-03 -0.649002 -1.055498 1.195584 0.066292
>>> #假如我们要选择A列的数据进行操作:df['A'](列名区分大小写)
>>> df['A']
2014-07-29 1.338334
2014-07-30 0.601006
2014-07-31 1.222067
2014-08-01 1.312611
2014-08-02 0.031093
2014-08-03 -0.649002
Freq: D, Name: A, dtype: float64
>>> #还可以使用数组的切片操作,但是注意了,切片得到的是行数据
>>> df[1:3]
A B C D
2014-07-30 0.601006 1.424606 0.162684 -0.130496
2014-07-31 1.222067 0.255025 0.942523 0.583058
>>> #如果你想使用这个方法得到列,那就会出现错误(注意:切片应使用冒号':',下面两条命令误写成了分号';',所以报的是SyntaxError)
>>> df['A';'A']
SyntaxError: invalid syntax
>>> df['A';'B']
SyntaxError: invalid syntax
>>> #我们还可以使用行标签来指定输出的行
>>> df['20140729':'20140730']
A B C D
2014-07-29 1.338334 0.519005 0.624861 0.842467
2014-07-30 0.601006 1.424606 0.162684 -0.130496
>>> #DataFrame的loc方法按标签选择数据,比如选择第一个索引标签dates[0]对应的一行数据(注意我们是用dates作为索引的,取元素要用方括号dates[0])
>>> df.loc[dates(0)]
Traceback (most recent call last):
File "", line 1, in
df.loc[dates(0)]
TypeError: 'DatetimeIndex' object is not callable
>>> df.loc[dates[0]]
A 1.338334
B 0.519005
C 0.624861
D 0.842467
Name: 2014-07-29 00:00:00, dtype: float64
>>>
>>> #选择多列数据的写法
>>> df.loc[:,['A','B']]
A B
2014-07-29 1.338334 0.519005
2014-07-30 0.601006 1.424606
2014-07-31 1.222067 0.255025
2014-08-01 1.312611 0.271335
2014-08-02 0.031093 -1.362878
2014-08-03 -0.649002 -1.055498
>>> #假如我们要选择的是一个局部数据,是行和列的交叉区域
>>> df.loc['20140729':'20140730',['A','B']]
A B
2014-07-29 1.338334 0.519005
2014-07-30 0.601006 1.424606
>>> #假如我们只选择某一个数据,可以指定行和列:
>>> df.loc[dates[0],'A']
1.338334322857124
>>> #当然,at方法是专门用于获取某个值的:
>>> df.at[dates[0],'A']
1.338334322857124
>>> #选择数据就是用到了切片和loc、at方法,下一篇文章介绍一下iloc方法选择数据,它使你像操作array一样操作DataFrame
练习: http://jingyan.baidu.com/season/43456?pn=0