Python Pandas新人必备入门教程

**Python Pandas的使用简介**

import pandas as pd
import numpy as np

1.Series

# Build a Series from a plain list; the NaN entry makes the dtype float64.
values = [1, 3, 6, np.nan, 44, 1]
s = pd.Series(values)
print(s)         # index labels are shown on the left, values on the right
print(s.values)  # the underlying values
print(s.index)   # the index

# The default index counts up from 0; custom labels can be supplied instead.
s2 = pd.Series([1, 2, 3, 4], index=['a', 'b', 'c', 'd'])  # labels a, b, c, d
print(s2)

结果

0     1.0
1     3.0
2     6.0
3     NaN
4    44.0
5     1.0
dtype: float64
[ 1.  3.  6. nan 44.  1.]
RangeIndex(start=0, stop=6, step=1)
a    1
b    2
c    3
d    4
dtype: int64

举例:各个州的人口数量

# Population figures keyed by state name.
sdata = dict(Ohio=35000, Texas=71000, Oregon=16000, Utah=5000)
s3 = pd.Series(sdata)
print(s3)

# An explicit index picks and orders the entries by label.
states = ['California', 'Ohio', 'Oregon', 'Texas']
s4 = pd.Series(sdata, index=states)
print(s4)           # California is not a key of sdata, so its value is missing
print(s4.isnull())  # flag missing values (notnull() gives the inverse mask)

# Both the Series itself and its index have a name attribute.
s4.index.name = 'state'
s4.name = 'population'
print(s4)

结果

Ohio      35000
Texas     71000
Oregon    16000
Utah       5000
dtype: int64
California        NaN
Ohio          35000.0
Oregon        16000.0
Texas         71000.0
dtype: float64
California     True
Ohio          False
Oregon        False
Texas         False
dtype: bool
state
California        NaN
Ohio          35000.0
Oregon        16000.0
Texas         71000.0
Name: population, dtype: float64

2.DataFrame

# Column-oriented input: each key becomes a column holding the listed values.
data1 = {
    'state': ['Ohio'] * 3 + ['Nevada'] * 3,
    'year': [str(year) for year in range(2000, 2006)],
    'pop': [1.5, 1.7, 3.6, 2.4, 2.9, 3.2],
}
frame1 = pd.DataFrame(data1)
print(frame1)
print(frame1['state'])  # select the 'state' column

结果

    state  year  pop
0    Ohio  2000  1.5
1    Ohio  2001  1.7
2    Ohio  2002  3.6
3  Nevada  2003  2.4
4  Nevada  2004  2.9
5  Nevada  2005  3.2
0      Ohio
1      Ohio
2      Ohio
3    Nevada
4    Nevada
5    Nevada
Name: state, dtype: object
# Six consecutive daily timestamps starting on 2018-08-19.
data2 = pd.date_range(start='2018-08-19', periods=6)
# A start and an end date can be given instead of a period count:
#   pd.date_range('2018-08-19', '2018-08-24')
# np.random.randn(d0, d1, ..., dn) draws samples from the standard normal
# distribution; np.random.rand(d0, d1, ..., dn) draws from [0, 1).
# The arguments (6, 4) ask for 6 rows by 4 columns.
samples = np.random.randn(6, 4)
frame2 = pd.DataFrame(samples, index=data2, columns=list('abcd'))
print(frame2)
# A DataFrame can be indexed by row and by column; it can be thought of as a
# large dict of Series.
print(frame2['b'])

# Without explicit row or column labels.
frame3 = pd.DataFrame(np.arange(12).reshape(3, 4))
print(frame3)

# Another way to build a frame: a dict mixing lists, arrays, Series and
# scalars, one entry per column.
frame4_columns = {
    'A': [1, 2, 3, 4],
    'B': pd.Timestamp('20180819'),
    'C': pd.Series([1, 6, 9, 10], dtype='float32'),
    'D': np.full(4, 3, dtype='int32'),
    'E': pd.Categorical(['test', 'train', 'test', 'train']),
    'F': 'foo',
}
frame4 = pd.DataFrame(frame4_columns)
print(frame4)
print(frame4.index)
print(frame4.columns)
print(frame4.values)

# Summary statistics.
print(frame4.describe())

# Transpose (swap rows and columns), via pandas and via NumPy.
print(frame4.transpose())
print(np.transpose(frame4))

# sort_index orders by labels: axis=0 sorts the row index, axis=1 sorts the
# column labels. ascending defaults to True; pass False for descending order.
print(frame4.sort_index(axis=0, ascending=True))    # rows by index label, ascending
print(frame4.sort_index(axis=1, ascending=False))   # columns by name, descending
print(frame4.sort_values(by='C', ascending=False))  # rows by the values in column C, descending

结果

                   a         b         c         d
2018-08-19  0.354866  0.015070 -0.627576 -0.495415
2018-08-20  2.900697  0.721100 -1.177336  2.631225
2018-08-21  2.587944  0.169646 -0.733831 -0.341553
2018-08-22  0.054560 -0.677785  1.023810 -0.290743
2018-08-23 -2.276774 -0.953618 -1.275852  0.469861
2018-08-24  1.005236 -0.088767  1.049968  1.524960
2018-08-19    0.015070
2018-08-20    0.721100
2018-08-21    0.169646
2018-08-22   -0.677785
2018-08-23   -0.953618
2018-08-24   -0.088767
Freq: D, Name: b, dtype: float64
   0  1   2   3
0  0  1   2   3
1  4  5   6   7
2  8  9  10  11
   A          B     C  D      E    F
0  1 2018-08-19   1.0  3   test  foo
1  2 2018-08-19   6.0  3  train  foo
2  3 2018-08-19   9.0  3   test  foo
3  4 2018-08-19  10.0  3  train  foo
RangeIndex(start=0, stop=4, step=1)
Index(['A', 'B', 'C', 'D', 'E', 'F'], dtype='object')
[[1 Timestamp('2018-08-19 00:00:00') 1.0 3 'test' 'foo']
 [2 Timestamp('2018-08-19 00:00:00') 6.0 3 'train' 'foo']
 [3 Timestamp('2018-08-19 00:00:00') 9.0 3 'test' 'foo']
 [4 Timestamp('2018-08-19 00:00:00') 10.0 3 'train' 'foo']]
              A          C    D
count  4.000000   4.000000  4.0
mean   2.500000   6.500000  3.0
std    1.290994   4.041452  0.0
min    1.000000   1.000000  3.0
25%    1.750000   4.750000  3.0
50%    2.500000   7.500000  3.0
75%    3.250000   9.250000  3.0
max    4.000000  10.000000  3.0
                     0         ...                             3
A                    1         ...                             4
B  2018-08-19 00:00:00         ...           2018-08-19 00:00:00
C                    1         ...                            10
D                    3         ...                             3
E                 test         ...                         train
F                  foo         ...                           foo

[6 rows x 4 columns]
                     0         ...                             3
A                    1         ...                             4
B  2018-08-19 00:00:00         ...           2018-08-19 00:00:00
C                    1         ...                            10
D                    3         ...                             3
E                 test         ...                         train
F                  foo         ...                           foo

[6 rows x 4 columns]
   A          B     C  D      E    F
0  1 2018-08-19   1.0  3   test  foo
1  2 2018-08-19   6.0  3  train  foo
2  3 2018-08-19   9.0  3   test  foo
3  4 2018-08-19  10.0  3  train  foo
     F      E  D     C          B  A
0  foo   test  3   1.0 2018-08-19  1
1  foo  train  3   6.0 2018-08-19  2
2  foo   test  3   9.0 2018-08-19  3
3  foo  train  3  10.0 2018-08-19  4
   A          B     C  D      E    F
3  4 2018-08-19  10.0  3  train  foo
2  3 2018-08-19   9.0  3   test  foo
1  2 2018-08-19   6.0  3  train  foo
0  1 2018-08-19   1.0  3   test  foo

3.pandas选择数据

实战筛选数据

# A 6x4 frame holding 0..23, indexed by six consecutive days, columns A-D.
dates = pd.date_range(start='20180819', periods=6)
df = pd.DataFrame(
    np.arange(24).reshape(6, 4),
    index=dates,
    columns=list('ABCD'),
)
print(df)
             A   B   C   D
2018-08-19   0   1   2   3
2018-08-20   4   5   6   7
2018-08-21   8   9  10  11
2018-08-22  12  13  14  15
2018-08-23  16  17  18  19
2018-08-24  20  21  22  23
# Select column A (df['A'] is the equivalent bracket form).
print(df.A)
# Slicing the frame directly selects a span of rows.
# Take the first three rows.
print(df[:3])
2018-08-19     0
2018-08-20     4
2018-08-21     8
2018-08-22    12
2018-08-23    16
2018-08-24    20
Freq: D, Name: A, dtype: int32
            A  B   C   D
2018-08-19  0  1   2   3
2018-08-20  4  5   6   7
2018-08-21  8  9  10  11
# Label-based selection with .loc: pick specific rows and/or columns.
day = '20180819'
# A single row, addressed by its index label.
print(df.loc[day])
# Every row, columns A through C.
print(df.loc[:, 'A':'C'])
# One row and a list of columns at the same time.
print(df.loc[day, ['A', 'B']])
A    0
B    1
C    2
D    3
Name: 2018-08-19 00:00:00, dtype: int32
             A   B   C
2018-08-19   0   1   2
2018-08-20   4   5   6
2018-08-21   8   9  10
2018-08-22  12  13  14
2018-08-23  16  17  18
2018-08-24  20  21  22
A    0
B    1
Name: 2018-08-19 00:00:00, dtype: int32
# Position-based selection with .iloc: fetch values at specific positions.
print(df.iloc[3, 1])
row_span, col_span = slice(3, 5), slice(1, 3)
print(df.iloc[row_span, col_span])
# Non-contiguous rows picked with a list of positions.
print(df.iloc[[1, 3, 5], col_span])
# Mixing a slice for the rows with a list for the columns.
print(df.iloc[:3, [0, 2]])
# Filtering rows with a boolean condition.
print(df[df['A'] > 8])
13
             B   C
2018-08-22  13  14
2018-08-23  17  18
             B   C
2018-08-20   5   6
2018-08-22  13  14
2018-08-24  21  22
            A   C
2018-08-19  0   2
2018-08-20  4   6
2018-08-21  8  10
             A   B   C   D
2018-08-22  12  13  14  15
2018-08-23  16  17  18  19
2018-08-24  20  21  22  23

4.pandas设置值

# Build the sample frame: values 0..23 over six days, columns A-D.
dates2 = pd.date_range('20180819', periods=6)
df2 = pd.DataFrame(np.arange(24).reshape((6, 4)), index=dates2, columns=['A', 'B', 'C', 'D'])
print(df2)
# Assign single cells by position (iloc) or by label (loc).
df2.iloc[2, 2] = 111
df2.loc['20180820', 'B'] = 2222
print(df2)
# Conditional assignment: set B to 0 on every row where A > 4.
# A single .loc call is used instead of chained indexing (df2.B[mask] = 0),
# because chained indexing may write to a temporary object and leave df2
# unchanged.
df2.loc[df2.A > 4, 'B'] = 0
print(df2)
# Assigning a scalar to a new column name creates column F filled with NaN.
df2['F'] = np.nan
print(df2)
# Assigning a Series aligns on index labels, not on position or length:
# rows whose label is absent from the Series get NaN, and Series entries
# whose label is absent from the frame are dropped.
df2['E'] = pd.Series([1, 2, 3, 4, 5, 6], index=pd.date_range('20180820', periods=6))
print(df2)
# Set one specific cell by row and column label.
df2.loc['20180820', 'A'] = 67  # same cell as df2.iloc[1, 0]
print(df2)
# Overwrite a whole row with NaN. NaN cannot be stored in an integer column,
# so convert to float explicitly first rather than relying on implicit
# upcasting during assignment.
df2 = df2.astype('float64')
df2.iloc[1] = np.nan  # equivalent to df2.iloc[1, :] = np.nan
print(df2)

结果

             A   B   C   D
2018-08-19   0   1   2   3
2018-08-20   4   5   6   7
2018-08-21   8   9  10  11
2018-08-22  12  13  14  15
2018-08-23  16  17  18  19
2018-08-24  20  21  22  23
             A     B    C   D
2018-08-19   0     1    2   3
2018-08-20   4  2222    6   7
2018-08-21   8     9  111  11
2018-08-22  12    13   14  15
2018-08-23  16    17   18  19
2018-08-24  20    21   22  23
             A     B    C   D
2018-08-19   0     1    2   3
2018-08-20   4  2222    6   7
2018-08-21   8     0  111  11
2018-08-22  12     0   14  15
2018-08-23  16     0   18  19
2018-08-24  20     0   22  23
             A     B    C   D   F
2018-08-19   0     1    2   3 NaN
2018-08-20   4  2222    6   7 NaN
2018-08-21   8     0  111  11 NaN
2018-08-22  12     0   14  15 NaN
2018-08-23  16     0   18  19 NaN
2018-08-24  20     0   22  23 NaN
             A     B    C   D   F    E
2018-08-19   0     1    2   3 NaN  NaN
2018-08-20   4  2222    6   7 NaN  1.0
2018-08-21   8     0  111  11 NaN  2.0
2018-08-22  12     0   14  15 NaN  3.0
2018-08-23  16     0   18  19 NaN  4.0
2018-08-24  20     0   22  23 NaN  5.0
             A     B    C   D   F    E
2018-08-19   0     1    2   3 NaN  NaN
2018-08-20  67  2222    6   7 NaN  1.0
2018-08-21   8     0  111  11 NaN  2.0
2018-08-22  12     0   14  15 NaN  3.0
2018-08-23  16     0   18  19 NaN  4.0
2018-08-24  20     0   22  23 NaN  5.0
               A    B      C     D   F    E
2018-08-19   0.0  1.0    2.0   3.0 NaN  NaN
2018-08-20   NaN  NaN    NaN   NaN NaN  NaN
2018-08-21   8.0  0.0  111.0  11.0 NaN  2.0
2018-08-22  12.0  0.0   14.0  15.0 NaN  3.0
2018-08-23  16.0  0.0   18.0  19.0 NaN  4.0
2018-08-24  20.0  0.0   22.0  23.0 NaN  5.0

Process finished with exit code 0

5.pandas处理丢失数据

# Build a frame of 0..23 and then insert two NaN values into it.
dates3 = pd.date_range('20180819', periods=6)
df3 = pd.DataFrame(np.arange(24).reshape((6, 4)), index=dates3, columns=['A', 'B', 'C', 'D'])
print(df3)
# NaN is a float value and cannot be stored in an integer column, so the two
# target columns are converted to float explicitly before the assignment
# instead of relying on implicit upcasting.
df3 = df3.astype({'B': 'float64', 'C': 'float64'})
df3.iloc[0, 1] = np.nan
df3.iloc[1, 2] = np.nan
print(df3)
             A   B   C   D
2018-08-19   0   1   2   3
2018-08-20   4   5   6   7
2018-08-21   8   9  10  11
2018-08-22  12  13  14  15
2018-08-23  16  17  18  19
2018-08-24  20  21  22  23
             A     B     C   D
2018-08-19   0   NaN   2.0   3
2018-08-20   4   5.0   NaN   7
2018-08-21   8   9.0  10.0  11
2018-08-22  12  13.0  14.0  15
2018-08-23  16  17.0  18.0  19
2018-08-24  20  21.0  22.0  23
# Drop rows or columns that contain NaN.
print(df3.dropna())  # by default, rows containing NaN are dropped
# The same call with its options written out:
#   axis=0 operates on rows, axis=1 operates on columns
#   how='any' drops as soon as one NaN is present,
#   how='all' drops only when every value is NaN
print(df3.dropna(axis=0, how='any'))
             A     B     C   D
2018-08-21   8   9.0  10.0  11
2018-08-22  12  13.0  14.0  15
2018-08-23  16  17.0  18.0  19
2018-08-24  20  21.0  22.0  23
             A     B     C   D
2018-08-21   8   9.0  10.0  11
2018-08-22  12  13.0  14.0  15
2018-08-23  16  17.0  18.0  19
2018-08-24  20  21.0  22.0  23
# Drop the columns that contain NaN.
print(df3.dropna(axis='columns', how='any'))
             A   D
2018-08-19   0   3
2018-08-20   4   7
2018-08-21   8  11
2018-08-22  12  15
2018-08-23  16  19
2018-08-24  20  23
# Replace NaN with 0 (or any other fill value).
print(df3.fillna(0))
            A     B     C   D
2018-08-19   0   0.0   2.0   3
2018-08-20   4   5.0   0.0   7
2018-08-21   8   9.0  10.0  11
2018-08-22  12  13.0  14.0  15
2018-08-23  16  17.0  18.0  19
2018-08-24  20  21.0  22.0  23
# Element-wise mask: True where a value is missing.
print(pd.isnull(df3))
                A      B      C      D
2018-08-19  False   True  False  False
2018-08-20  False  False   True  False
2018-08-21  False  False  False  False
2018-08-22  False  False  False  False
2018-08-23  False  False  False  False
2018-08-24  False  False  False  False
# Element-wise mask: True where a value is NaN.
print(pd.isna(df3))
               A      B      C      D
2018-08-19  False   True  False  False
2018-08-20  False  False   True  False
2018-08-21  False  False  False  False
2018-08-22  False  False  False  False
2018-08-23  False  False  False  False
2018-08-24  False  False  False  False
# Per column: does it contain any missing (NaN) value?
missing_mask = df3.isnull()
print(missing_mask.any(axis=0))
A    False
B     True
C     True
D    False
dtype: bool
# Check whether the frame contains any NaN at all; prints True if it does.
# isnull() already yields booleans, so no "== True" comparison is needed, and
# reducing the underlying array gives a single scalar result.
print(df3.isnull().values.any())
True

后续再更新

你可能感兴趣的:(Python,pandas,人工智能,python)