pandas教程[1]DataFrame入门 | pandas教程[2]DataFrame选择数据-1

>>> #--------------------2018/08/09--------------------
>>> #---------------- pandas教程[1]DataFrame入门--------
>>> import numpy as np
>>> import pandas as pd
>>> dates = pd.date_range('20140729',periods=6)
>>> dates
DatetimeIndex(['2014-07-29', '2014-07-30', '2014-07-31', '2014-08-01',
               '2014-08-02', '2014-08-03'],
              dtype='datetime64[ns]', freq='D')
>>> df=pd.DataFrame(np.random.randn(6,4),index=dates,columns=list('ABCD'))
>>> df
                   A         B         C         D
2014-07-29  1.338334  0.519005  0.624861  0.842467
2014-07-30  0.601006  1.424606  0.162684 -0.130496
2014-07-31  1.222067  0.255025  0.942523  0.583058
2014-08-01  1.312611  0.271335 -0.015009  0.398183
2014-08-02  0.031093 -1.362878 -0.234253 -0.774762
2014-08-03 -0.649002 -1.055498  1.195584  0.066292
>>> #使用字典来创建数据框,例如创建一个列名为A的数据框,索引是自动创建的整数
>>> df2=pd.DataFrame({'A':np.random.randn(6),})
>>> df2
          A
0 -1.286018
1  0.214674
2  0.234867
3 -0.367041
4  0.063977
5  0.366060
>>> #randn函数用于生成服从标准正态分布的随机数
>>> df3=pd.
SyntaxError: invalid syntax
>>> df3=pd.DataFrame({'A':pd.Timestamp('20140729'),'B':pd.Series(1),})
>>> df3
           A  B
0 2014-07-29  1
>>> #假如字典内的数据长度不同,以最长的数据为准,比如B列有4行
>>> df4=pd.DataFrame({'A':pd.Timestamp('20140729'),'B':pd.Series(1,index=list(range(4))),})
>>> df4
           A  B
0 2014-07-29  1
1 2014-07-29  1
2 2014-07-29  1
3 2014-07-29  1
>>> df4.dtypes
A    datetime64[ns]
B             int64
dtype: object
>>> #使用dtypes来查看各列的数据类型
>>> #使用head查看前几行数据(默认是前5行),不过你可以指定前几行
>>> df.head()
                   A         B         C         D
2014-07-29  1.338334  0.519005  0.624861  0.842467
2014-07-30  0.601006  1.424606  0.162684 -0.130496
2014-07-31  1.222067  0.255025  0.942523  0.583058
2014-08-01  1.312611  0.271335 -0.015009  0.398183
2014-08-02  0.031093 -1.362878 -0.234253 -0.774762
>>> df.head(3)
                   A         B         C         D
2014-07-29  1.338334  0.519005  0.624861  0.842467
2014-07-30  0.601006  1.424606  0.162684 -0.130496
2014-07-31  1.222067  0.255025  0.942523  0.583058
>>> #使用tail查看后几行数据(默认是后5行),同样可以指定行数
>>> df.tail(2)
                   A         B         C         D
2014-08-02  0.031093 -1.362878 -0.234253 -0.774762
2014-08-03 -0.649002 -1.055498  1.195584  0.066292
>>> df.tail(5)
                   A         B         C         D
2014-07-30  0.601006  1.424606  0.162684 -0.130496
2014-07-31  1.222067  0.255025  0.942523  0.583058
2014-08-01  1.312611  0.271335 -0.015009  0.398183
2014-08-02  0.031093 -1.362878 -0.234253 -0.774762
2014-08-03 -0.649002 -1.055498  1.195584  0.066292
>>> #查看数据框的索引
>>> df.index
DatetimeIndex(['2014-07-29', '2014-07-30', '2014-07-31', '2014-08-01',
               '2014-08-02', '2014-08-03'],
              dtype='datetime64[ns]', freq='D')
>>> #查看列名用columns
>>> df.columns
Index(['A', 'B', 'C', 'D'], dtype='object')
>>> #查看数据值,用values
>>> df.values
array([[ 1.33833432,  0.51900504,  0.62486112,  0.84246746],
       [ 0.60100588,  1.42460625,  0.16268358, -0.13049649],
       [ 1.2220671 ,  0.25502473,  0.94252298,  0.58305822],
       [ 1.31261091,  0.27133471, -0.01500863,  0.39818312],
       [ 0.03109322, -1.36287849, -0.23425266, -0.77476246],
       [-0.64900233, -1.05549823,  1.19558433,  0.06629241]])
>>> #查看描述性统计,用describe

>>> df.describe()
              A         B         C         D
count  6.000000  6.000000  6.000000  6.000000
mean   0.642685  0.008599  0.446065  0.164124
std    0.813959  1.040058  0.564968  0.577636
min   -0.649002 -1.362878 -0.234253 -0.774762
25%    0.173571 -0.727867  0.029414 -0.081299
50%    0.911536  0.263180  0.393772  0.232238
75%    1.289975  0.457087  0.863108  0.536839
max    1.338334  1.424606  1.195584  0.842467
>>> #使用type看一下输出的描述性统计是什么样的数据类型——DataFrame数据
>>> type(df.describe())
<class 'pandas.core.frame.DataFrame'>
>>> #使用T来转置数据,也就是行列转换
>>> df.T
   2014-07-29  2014-07-30     ...      2014-08-02  2014-08-03
A    1.338334    0.601006     ...        0.031093   -0.649002
B    0.519005    1.424606     ...       -1.362878   -1.055498
C    0.624861    0.162684     ...       -0.234253    1.195584
D    0.842467   -0.130496     ...       -0.774762    0.066292

[4 rows x 6 columns]
>>> df.T
   2014-07-29  2014-07-30     ...      2014-08-02  2014-08-03
A    1.338334    0.601006     ...        0.031093   -0.649002
B    0.519005    1.424606     ...       -1.362878   -1.055498
C    0.624861    0.162684     ...       -0.234253    1.195584
D    0.842467   -0.130496     ...       -0.774762    0.066292

[4 rows x 6 columns]
>>> #对数据进行排序:旧版pandas使用sort(columns=...),新版已移除该方法(见下面的报错),应改用sort_values(by=...)指定根据哪一列数据进行排序。
>>> df.sort(columns='C')
Traceback (most recent call last):
  File "<pyshell>", line 1, in <module>
    df.sort(columns='C')
  File "C:\Users\Administrator\AppData\Local\Programs\Python\Python36-32\lib\site-packages\pandas\core\generic.py", line 4376, in __getattr__
    return object.__getattribute__(self, name)
AttributeError: 'DataFrame' object has no attribute 'sort'
>>> df.sort_values(by='C')
                   A         B         C         D
2014-08-02  0.031093 -1.362878 -0.234253 -0.774762
2014-08-01  1.312611  0.271335 -0.015009  0.398183
2014-07-30  0.601006  1.424606  0.162684 -0.130496
2014-07-29  1.338334  0.519005  0.624861  0.842467
2014-07-31  1.222067  0.255025  0.942523  0.583058
2014-08-03 -0.649002 -1.055498  1.195584  0.066292
>>> #------------ pandas教程[2]DataFrame选择数据-1--------------------
>>> #上一篇文章介绍了如何创建和查看DataFrame数据,这篇文章讲一下如何选择DataFrame中的数据,还是用例子来说明问题。
>>> df
                   A         B         C         D
2014-07-29  1.338334  0.519005  0.624861  0.842467
2014-07-30  0.601006  1.424606  0.162684 -0.130496
2014-07-31  1.222067  0.255025  0.942523  0.583058
2014-08-01  1.312611  0.271335 -0.015009  0.398183
2014-08-02  0.031093 -1.362878 -0.234253 -0.774762
2014-08-03 -0.649002 -1.055498  1.195584  0.066292
>>> #假如我们要选择A列的数据进行操作:df['A'](注意列名区分大小写)
>>> df['A']
2014-07-29    1.338334
2014-07-30    0.601006
2014-07-31    1.222067
2014-08-01    1.312611
2014-08-02    0.031093
2014-08-03   -0.649002
Freq: D, Name: A, dtype: float64
>>> #还可以使用数组的切片操作,但是注意了,切片得到的是行数据
>>> df[1:3]
                   A         B         C         D
2014-07-30  0.601006  1.424606  0.162684 -0.130496
2014-07-31  1.222067  0.255025  0.942523  0.583058
>>> #如果你想使用这个方法得到列,那就会出现错误(注意:下面两次输入把冒号":"误写成了分号";",所以报的是SyntaxError)
>>> df['A';'A']
SyntaxError: invalid syntax
>>> df['A';'B']
SyntaxError: invalid syntax
>>> #我们还可以使用行标签来指定输出的行
>>> df['20140729':'20140730']
                   A         B         C         D
2014-07-29  1.338334  0.519005  0.624861  0.842467
2014-07-30  0.601006  1.424606  0.162684 -0.130496
>>> #DataFrame的loc方法是按标签来选择数据的,比如选择索引中第一个日期dates[0]对应的一行数据(注意我们是用dates作为索引的,取元素要用方括号dates[0],而不是dates(0))
>>> df.loc[dates(0)]
Traceback (most recent call last):
  File "<pyshell>", line 1, in <module>
    df.loc[dates(0)]
TypeError: 'DatetimeIndex' object is not callable
>>> df.loc[dates[0]]
A    1.338334
B    0.519005
C    0.624861
D    0.842467
Name: 2014-07-29 00:00:00, dtype: float64
>>> 
>>> #选择多列数据的写法
>>> df.loc[:,['A','B']]
                   A         B
2014-07-29  1.338334  0.519005
2014-07-30  0.601006  1.424606
2014-07-31  1.222067  0.255025
2014-08-01  1.312611  0.271335
2014-08-02  0.031093 -1.362878
2014-08-03 -0.649002 -1.055498
>>> #假如我们要选择的是一个局部数据,是行和列的交叉区域
>>> df.loc['20140729':'20140730',['A','B']]
                   A         B
2014-07-29  1.338334  0.519005
2014-07-30  0.601006  1.424606
>>> #假如我们只选择某一个数据,可以指定行和列:
>>> df.loc[dates[0],'A']
1.338334322857124
>>> #当然,at方法是专门用于获取某个值的:
>>> df.at[dates[0],'A']
1.338334322857124
>>> #选择数据就是用到了切片和loc、at方法,下一篇文章介绍一下iloc方法选择数据,它使你像操作array一样操作DataFrame

练习: http://jingyan.baidu.com/season/43456?pn=0

 

你可能感兴趣的:(pandas教程[1]DataFrame入门 | pandas教程[2]DataFrame选择数据-1)