案例1
import numpy as np
import pandas as pd
# Build a DatetimeIndex of 6 consecutive days starting at 2013-01-01.
# `periods` is the number of entries; the step is one day, not one month.
dates = pd.date_range('20130101', periods=6)
# 6x4 frame of standard-normal samples: rows are labelled by `dates`,
# columns are labelled A, B, C, D.
df = pd.DataFrame(
    np.random.randn(6, 4),
    index=dates,
    columns=['A', 'B', 'C', 'D'],
)
print(df)
print("------------------df['A']--------------------")
# Picking one column by label yields a Series.
print(df['A'])
print("-------------使用单列值选择数据---------------")
# Boolean row filter: keep only the rows whose 'A' value is positive.
print(df.loc[df['A'] > 0])
输出结果为:
A B C D
2013-01-01 -0.108862 -0.669454 0.755285 -0.257934
2013-01-02 -1.704769 1.207508 -0.379307 0.360502
2013-01-03 0.790387 0.644891 0.673837 -1.351891
2013-01-04 0.058280 0.759115 0.224668 -0.977974
2013-01-05 -1.616106 2.253661 0.426269 -0.266609
2013-01-06 -1.673112 0.425053 -0.395498 -1.099882
------------------df['A']--------------------
2013-01-01 -0.108862
2013-01-02 -1.704769
2013-01-03 0.790387
2013-01-04 0.058280
2013-01-05 -1.616106
2013-01-06 -1.673112
Freq: D, Name: A, dtype: float64
-------------使用单列值选择数据---------------
A B C D
2013-01-03 0.790387 0.644891 0.673837 -1.351891
2013-01-04 0.058280 0.759115 0.224668 -0.977974
案例2
从一个DataFrame中选择符合布尔条件的值
import numpy as np
import pandas as pd
# Six consecutive daily timestamps starting at 2013-01-01 (not months).
dates = pd.date_range('20130101', periods=6)
# Random 6x4 frame: `dates` is the row index, A-D are the column labels.
df = pd.DataFrame(
    np.random.randn(6, 4),
    index=dates,
    columns=['A', 'B', 'C', 'D'],
)
print(df)
# NOTE(review): the banner below talks about a single column, but the
# expression that follows masks the whole frame element-wise.
print("-----------Using a single column’s values to select data:-----------")
# Keep positive cells; every other cell becomes NaN, the shape is unchanged.
print(df.where(df > 0))
输出结果为:
A B C D
2013-01-01 -0.295841 -1.592536 -0.089252 -1.723649
2013-01-02 0.948812 -1.883173 -0.060775 0.448133
2013-01-03 0.291922 -0.182995 -0.831074 1.303921
2013-01-04 0.342257 0.667672 0.574162 -1.953030
2013-01-05 -0.742187 0.098960 -0.646819 0.960099
2013-01-06 -0.204221 -0.223569 -0.220845 -1.146068
-----------Using a single column’s values to select data:-----------
A B C D
2013-01-01 NaN NaN NaN NaN
2013-01-02 0.948812 NaN NaN 0.448133
2013-01-03 0.291922 NaN NaN 1.303921
2013-01-04 0.342257 0.667672 0.574162 NaN
2013-01-05 NaN 0.098960 NaN 0.960099
2013-01-06 NaN NaN NaN NaN
import numpy as np
import pandas as pd
# Six consecutive daily timestamps starting at 2013-01-01 (not months).
dates = pd.date_range('20130101', periods=6)
# Random 6x4 frame: `dates` is the row index, A-D are the column labels.
df = pd.DataFrame(
    np.random.randn(6, 4),
    index=dates,
    columns=['A', 'B', 'C', 'D'],
)
print(df)
# Work on a copy so the extra column below does not touch `df`.
df2 = df.copy()
print("---------df.copy()的结果为:--------------")
print(df2)
# Attach a string-valued column, one label per row.
df2['E'] = ['one', 'one', 'two', 'three', 'four', 'three']
print("---------增加一列之后的值:----------------")
print(df2)
print("---------使用isin()方法之后的值:----------")
# Keep the rows whose 'E' label is one of the listed values.
print(df2.loc[df2['E'].isin(['two', 'four'])])
输出结果为:
A B C D
2013-01-01 1.248590 -0.197050 1.147376 0.812086
2013-01-02 -0.283422 0.400051 1.307791 0.493767
2013-01-03 0.134394 0.250902 -0.004353 -1.563162
2013-01-04 -0.259653 -1.065737 -0.468399 1.258488
2013-01-05 1.124381 -1.116601 -0.302692 0.631385
2013-01-06 0.361942 0.227282 1.656400 -0.526585
---------df.copy()的结果为:--------------
A B C D
2013-01-01 1.248590 -0.197050 1.147376 0.812086
2013-01-02 -0.283422 0.400051 1.307791 0.493767
2013-01-03 0.134394 0.250902 -0.004353 -1.563162
2013-01-04 -0.259653 -1.065737 -0.468399 1.258488
2013-01-05 1.124381 -1.116601 -0.302692 0.631385
2013-01-06 0.361942 0.227282 1.656400 -0.526585
---------增加一列之后的值:----------------
A B C D E
2013-01-01 1.248590 -0.197050 1.147376 0.812086 one
2013-01-02 -0.283422 0.400051 1.307791 0.493767 one
2013-01-03 0.134394 0.250902 -0.004353 -1.563162 two
2013-01-04 -0.259653 -1.065737 -0.468399 1.258488 three
2013-01-05 1.124381 -1.116601 -0.302692 0.631385 four
2013-01-06 0.361942 0.227282 1.656400 -0.526585 three
---------使用isin()方法之后的值:----------
A B C D E
2013-01-03 0.134394 0.250902 -0.004353 -1.563162 two
2013-01-05 1.124381 -1.116601 -0.302692 0.631385 four
设置新列时,pandas会按时间索引(indexes)自动对齐数据
import numpy as np
import pandas as pd
# Six consecutive daily timestamps starting at 2013-01-01 (not months).
dates = pd.date_range('20130101', periods=6)
# Random 6x4 frame: `dates` is the row index, A-D are the column labels.
df = pd.DataFrame(
    np.random.randn(6, 4),
    index=dates,
    columns=['A', 'B', 'C', 'D'],
)
print(df)
print("--------------通过pd.Series来进行创建:--------------")
# Integer Series 1..6 carrying its own daily index over the same six dates.
s1 = pd.Series(
    list(range(1, 7)),
    index=pd.date_range('20130101', periods=6),
)
print(s1)
print("--------------设置一列新值:-------------------------")
# Assigning a Series as a column aligns its values on the index labels.
df['F'] = s1
print(df)
print("--------------- df.at[dates[0], 'A'] = 0后----------------------")
# Label-based scalar write: first date, column 'A'.
df.at[dates[0], 'A'] = 0
print(df)
print("-----------------df.iat[0,1] = 0-------------------")
# Position-based scalar write: row 0, column 1 (i.e. 'B').
df.iat[0, 1] = 0
print(df)
输出结果为:
A B C D
2013-01-01 -0.418346 -0.184602 -0.570495 -0.590981
2013-01-02 -0.582694 1.605383 -1.430993 -1.058385
2013-01-03 -0.324931 -0.720275 1.075161 0.417305
2013-01-04 0.152029 0.108510 0.011692 -1.424354
2013-01-05 -2.814298 0.041146 0.177329 1.358470
2013-01-06 -0.652250 0.068495 1.016900 0.864277
--------------通过pd.Series来进行创建:--------------
2013-01-01 1
2013-01-02 2
2013-01-03 3
2013-01-04 4
2013-01-05 5
2013-01-06 6
Freq: D, dtype: int64
--------------设置一列新值:-------------------------
A B C D F
2013-01-01 -0.418346 -0.184602 -0.570495 -0.590981 1
2013-01-02 -0.582694 1.605383 -1.430993 -1.058385 2
2013-01-03 -0.324931 -0.720275 1.075161 0.417305 3
2013-01-04 0.152029 0.108510 0.011692 -1.424354 4
2013-01-05 -2.814298 0.041146 0.177329 1.358470 5
2013-01-06 -0.652250 0.068495 1.016900 0.864277 6
--------------- df.at[dates[0], 'A'] = 0后----------------------
A B C D F
2013-01-01 0.000000 -0.184602 -0.570495 -0.590981 1
2013-01-02 -0.582694 1.605383 -1.430993 -1.058385 2
2013-01-03 -0.324931 -0.720275 1.075161 0.417305 3
2013-01-04 0.152029 0.108510 0.011692 -1.424354 4
2013-01-05 -2.814298 0.041146 0.177329 1.358470 5
2013-01-06 -0.652250 0.068495 1.016900 0.864277 6
-----------------df.iat[0,1] = 0-------------------
A B C D F
2013-01-01 0.000000 0.000000 -0.570495 -0.590981 1
2013-01-02 -0.582694 1.605383 -1.430993 -1.058385 2
2013-01-03 -0.324931 -0.720275 1.075161 0.417305 3
2013-01-04 0.152029 0.108510 0.011692 -1.424354 4
2013-01-05 -2.814298 0.041146 0.177329 1.358470 5
2013-01-06 -0.652250 0.068495 1.016900 0.864277 6
案例
import numpy as np
import pandas as pd
# Six consecutive daily timestamps starting at 2013-01-01 (not months).
dates = pd.date_range('20130101', periods=6)
# Random 6x4 frame: `dates` is the row index, A-D are the column labels.
df = pd.DataFrame(np.random.randn(6, 4), index=dates, columns=list('ABCD'))
# Add an integer column F (1..6) aligned on the same dates.
s1 = pd.Series([1, 2, 3, 4, 5, 6], index=pd.date_range('20130101', periods=6))
df['F'] = s1
# Zero two individual cells: by label (first date, 'A') and by position (0, 1).
df.at[dates[0], 'A'] = 0
df.iat[0, 1] = 0
print("--------插入一列值-------------------------------------------------------")
# Replace column D with an integer array of 5s.
# Plain column assignment swaps in the new array, so D becomes an integer
# column on every pandas version. `df.loc[:, 'D'] = ...` writes into the
# existing float column on pandas >= 2.0 and would print 5.0 instead of 5.
df['D'] = np.array([5] * len(df))
print(df)
print("---------带有where条件的设置值(将会将所有的正值变成它的相反数)--------------")
# On a copy, flip the sign of every strictly positive cell so that no
# positive value remains; `df` itself is left untouched.
df2 = df.copy()
df2[df2 > 0] = -df2
print(df2)
输出结果为:
---------设置前的值:----------
--------插入一列值-------------------------------------------------
A B C D F
2013-01-01 0.000000 0.000000 -0.539118 5 1
2013-01-02 -0.093590 -0.631473 0.986206 5 2
2013-01-03 -1.409640 -0.043280 -1.178707 5 3
2013-01-04 1.308376 1.979588 0.927735 5 4
2013-01-05 0.368429 1.722930 -0.075515 5 5
2013-01-06 1.176985 0.867951 0.198001 5 6
---------带有where条件的设置值(将会将所有的正值变成它的相反数)--------------
A B C D F
2013-01-01 0.000000 0.000000 -0.539118 -5 -1
2013-01-02 -0.093590 -0.631473 -0.986206 -5 -2
2013-01-03 -1.409640 -0.043280 -1.178707 -5 -3
2013-01-04 -1.308376 -1.979588 -0.927735 -5 -4
2013-01-05 -0.368429 -1.722930 -0.075515 -5 -5
2013-01-06 -1.176985 -0.867951 -0.198001 -5 -6
pandas主要使用np.nan来代表缺失值,默认情况下缺失值不参与计算。
Reindexing允许你去“改变”/“添加”/“删除”指定轴上的index,并返回一份数据的副本。
reindex
import numpy as np
import pandas as pd
# Six consecutive daily timestamps starting at 2013-01-01 (not months).
dates = pd.date_range('20130101', periods=6)
# Random 6x4 frame: `dates` is the row index, A-D are the column labels.
df = pd.DataFrame(
    np.random.randn(6, 4),
    index=dates,
    columns=['A', 'B', 'C', 'D'],
)
print(df)
print(df.columns)
print("---------行数据使用[0:4]之间的数据,列用df.columns,然后加个E的列:-------------------")
# Keep the first four dates and append a column 'E'. 'E' does not exist in
# `df`, so reindex fills it with NaN.
df1 = df.reindex(index=dates[:4], columns=[*df.columns, 'E'])
print(df1)
print("---------修改0、1列,将1赋值给0和1列,其它值为原来值:-------------------")
# Label slices include both ends, so this sets 'E' on the first two rows.
df1.loc[dates[0]:dates[1], 'E'] = 1
print(df1)
输出结果为:
A B C D
2013-01-01 -2.291562 -0.519261 -0.454256 0.645428
2013-01-02 0.279289 -0.282660 1.309209 -0.040633
2013-01-03 1.966930 -2.506430 1.069297 -0.267049
2013-01-04 0.411605 -1.179189 0.241829 -0.955059
2013-01-05 -1.079194 0.294480 0.225865 0.144425
2013-01-06 1.269804 -0.666968 -0.181081 -0.162801
Index(['A', 'B', 'C', 'D'], dtype='object')
---------行数据使用[0:4]之间的数据,列用df.columns,然后加个E的列:-------------------
A B C D E
2013-01-01 -2.291562 -0.519261 -0.454256 0.645428 NaN
2013-01-02 0.279289 -0.282660 1.309209 -0.040633 NaN
2013-01-03 1.966930 -2.506430 1.069297 -0.267049 NaN
2013-01-04 0.411605 -1.179189 0.241829 -0.955059 NaN
---------修改0、1列,将1赋值给0和1列,其它值为原来值:-------------------
A B C D E
2013-01-01 -2.291562 -0.519261 -0.454256 0.645428 1.0
2013-01-02 0.279289 -0.282660 1.309209 -0.040633 1.0
2013-01-03 1.966930 -2.506430 1.069297 -0.267049 NaN
2013-01-04 0.411605 -1.179189 0.241829 -0.955059 NaN
dropna
去除所有含有NaN的行
import numpy as np
import pandas as pd
# Six consecutive daily timestamps starting at 2013-01-01 (not months).
dates = pd.date_range('20130101', periods=6)
# Random 6x4 frame: `dates` is the row index, A-D are the column labels.
df = pd.DataFrame(
    np.random.randn(6, 4),
    index=dates,
    columns=['A', 'B', 'C', 'D'],
)
print(df)
print(df.columns)
print("---------行数据使用[0:4]之间的数据,列用df.columns,然后加个E的列:-------------------")
# First four dates plus a new all-NaN column 'E'.
df1 = df.reindex(index=dates[:4], columns=[*df.columns, 'E'])
print(df1)
print("---------修改0、1列,将1赋值给0和1列,其它值为原来值:-------------------")
# Fill 'E' on the first two rows only; the last two rows keep NaN.
df1.loc[dates[0]:dates[1], 'E'] = 1
print(df1)
print("---------删除所有的含有NaN的行:-------------------")
# Drop every row that holds at least one NaN. This returns a new frame
# and leaves `df1` unchanged.
print(df1.dropna(how='any'))
输出结果为:
A B C D
2013-01-01 0.866323 -1.852962 -0.283425 -0.914182
2013-01-02 1.104424 -1.174742 -0.185782 -0.771816
2013-01-03 -0.391907 -0.612250 0.512120 -0.470910
2013-01-04 0.435786 -0.556628 -0.962486 1.964813
2013-01-05 -0.455669 -0.428983 0.671128 -1.078615
2013-01-06 0.313603 0.495796 0.497048 0.176380
Index(['A', 'B', 'C', 'D'], dtype='object')
---------行数据使用[0:4]之间的数据,列用df.columns,然后加个E的列:-------------------
A B C D E
2013-01-01 0.866323 -1.852962 -0.283425 -0.914182 NaN
2013-01-02 1.104424 -1.174742 -0.185782 -0.771816 NaN
2013-01-03 -0.391907 -0.612250 0.512120 -0.470910 NaN
2013-01-04 0.435786 -0.556628 -0.962486 1.964813 NaN
---------修改0、1列,将1赋值给0和1列,其它值为原来值:-------------------
A B C D E
2013-01-01 0.866323 -1.852962 -0.283425 -0.914182 1.0
2013-01-02 1.104424 -1.174742 -0.185782 -0.771816 1.0
2013-01-03 -0.391907 -0.612250 0.512120 -0.470910 NaN
2013-01-04 0.435786 -0.556628 -0.962486 1.964813 NaN
---------删除所有的含有NaN的行:-------------------
A B C D E
2013-01-01 0.866323 -1.852962 -0.283425 -0.914182 1.0
2013-01-02 1.104424 -1.174742 -0.185782 -0.771816 1.0
fillna
使用指定值进行填充NaN的值。
import numpy as np
import pandas as pd
# Six consecutive daily timestamps starting at 2013-01-01 (not months).
dates = pd.date_range('20130101', periods=6)
# Random 6x4 frame: `dates` is the row index, A-D are the column labels.
df = pd.DataFrame(
    np.random.randn(6, 4),
    index=dates,
    columns=['A', 'B', 'C', 'D'],
)
print(df)
print(df.columns)
print("---------行数据使用[0:4]之间的数据,列用df.columns,然后加个E的列:-------------------")
# First four dates plus a new all-NaN column 'E'.
df1 = df.reindex(index=dates[:4], columns=[*df.columns, 'E'])
print(df1)
print("---------修改0、1列,将1赋值给0和1列,其它值为原来值:--------------------------------")
# Fill 'E' on the first two rows only; the last two rows keep NaN.
df1.loc[dates[0]:dates[1], 'E'] = 1
print(df1)
print("---------使用指定值填充NaN的值:----------------------------------------------------")
# Replace the remaining NaN cells with 5. This returns a new frame and
# leaves `df1` unchanged.
print(df1.fillna(value=5))
输出结果为:
A B C D
2013-01-01 0.205656 -1.020854 -0.969127 -1.625959
2013-01-02 0.075029 1.416725 0.275302 3.273706
2013-01-03 -0.543870 0.757292 0.063623 0.589556
2013-01-04 -0.403323 -0.486899 1.031774 -0.095886
2013-01-05 2.442459 -0.044233 -0.613714 -0.389008
2013-01-06 -0.068917 -1.598378 -0.262326 -0.455274
Index(['A', 'B', 'C', 'D'], dtype='object')
---------行数据使用[0:4]之间的数据,列用df.columns,然后加个E的列:-------------------
A B C D E
2013-01-01 0.205656 -1.020854 -0.969127 -1.625959 NaN
2013-01-02 0.075029 1.416725 0.275302 3.273706 NaN
2013-01-03 -0.543870 0.757292 0.063623 0.589556 NaN
2013-01-04 -0.403323 -0.486899 1.031774 -0.095886 NaN
---------修改0、1列,将1赋值给0和1列,其它值为原来值:--------------------------------
A B C D E
2013-01-01 0.205656 -1.020854 -0.969127 -1.625959 1.0
2013-01-02 0.075029 1.416725 0.275302 3.273706 1.0
2013-01-03 -0.543870 0.757292 0.063623 0.589556 NaN
2013-01-04 -0.403323 -0.486899 1.031774 -0.095886 NaN
---------使用指定值填充NaN的值:----------------------------------------------------
A B C D E
2013-01-01 0.205656 -1.020854 -0.969127 -1.625959 1.0
2013-01-02 0.075029 1.416725 0.275302 3.273706 1.0
2013-01-03 -0.543870 0.757292 0.063623 0.589556 5.0
2013-01-04 -0.403323 -0.486899 1.031774 -0.095886 5.0
isna
获取一个布尔掩码,标记哪些位置的值是NaN。如下:
import numpy as np
import pandas as pd
# Six consecutive daily timestamps starting at 2013-01-01 (not months).
dates = pd.date_range('20130101', periods=6)
# Random 6x4 frame: `dates` is the row index, A-D are the column labels.
df = pd.DataFrame(
    np.random.randn(6, 4),
    index=dates,
    columns=['A', 'B', 'C', 'D'],
)
print("---------行数据使用[0:4]之间的数据,列用df.columns,然后加个E的列:-------------------")
# First four dates plus a new all-NaN column 'E'.
df1 = df.reindex(index=dates[:4], columns=[*df.columns, 'E'])
print("---------修改0、1列,将1赋值给0和1列,其它值为原来值:--------------------------------")
# Fill 'E' on the first two rows only; the last two rows keep NaN.
df1.loc[dates[0]:dates[1], 'E'] = 1
print(df1)
print("---------得到所有是布尔值的值:------------------------------")
# Boolean frame of the same shape as `df1`: True exactly where a cell is NaN.
print(df1.isna())
输出结果为:
---------行数据使用[0:4]之间的数据,列用df.columns,然后加个E的列:-------------------
---------修改0、1列,将1赋值给0和1列,其它值为原来值:--------------------------------
A B C D E
2013-01-01 -0.242267 1.005525 -2.121381 -0.980096 1.0
2013-01-02 1.173845 3.088588 -0.102826 1.653409 1.0
2013-01-03 -0.451330 -1.287177 0.496383 -1.114614 NaN
2013-01-04 -1.378555 -1.119537 1.091021 -0.626668 NaN
---------得到所有是布尔值的值:------------------------------
A B C D E
2013-01-01 False False False False False
2013-01-02 False False False False False
2013-01-03 False False False False True
2013-01-04 False False False False True
求平均值mean
import numpy as np
import pandas as pd
# Six consecutive daily timestamps starting at 2013-01-01 (not months).
dates = pd.date_range('20130101', periods=6)
# Random 6x4 frame: `dates` is the row index, A-D are the column labels.
df = pd.DataFrame(
    np.random.randn(6, 4),
    index=dates,
    columns=['A', 'B', 'C', 'D'],
)
print(df)
print("------------df.mean()按列求平均值:--------------")
# Default axis: one mean per column, indexed by A-D.
print(df.mean())
print("------------df.mean(1)按行求平均值:-------------")
# axis=1: one mean per row, indexed by date.
print(df.mean(axis=1))
输出结果:
A B C D
2013-01-01 1.000250 0.584595 1.028464 -0.103284
2013-01-02 0.539096 1.928019 1.975352 0.421709
2013-01-03 0.009377 1.112230 -0.855621 0.942441
2013-01-04 1.610289 0.030575 0.707928 0.281609
2013-01-05 2.675944 0.286336 1.750777 -0.463347
2013-01-06 0.216954 -0.345792 0.296372 1.740097
------------df.mean()按列求平均值:--------------
A 1.008652
B 0.599327
C 0.817212
D 0.469871
dtype: float64
------------df.mean(1)按行求平均值:-------------
2013-01-01 0.627506
2013-01-02 1.216044
2013-01-03 0.302107
2013-01-04 0.657600
2013-01-05 1.062427
2013-01-06 0.476908
Freq: D, dtype: float64
Operating with objects that have different dimensionality and need alignment. In addition, pandas automatically broadcasts along the specified dimension.
import numpy as np
import pandas as pd
# Six consecutive daily timestamps starting at 2013-01-01 (not months).
dates = pd.date_range('20130101', periods=6)
# Random 6x4 frame: `dates` is the row index, A-D are the column labels.
df = pd.DataFrame(
    np.random.randn(6, 4),
    index=dates,
    columns=['A', 'B', 'C', 'D'],
)
print(df)
# Series on the same dates, moved down two positions. The first two
# slots become NaN and the last two input values fall off the end.
s = pd.Series([1, 3, 5, np.nan, 6, 8], index=dates).shift(periods=2)
print("----------------------------------------------------")
print(s)
print("--------------df.sub(s,axis='index')----------------")
# Subtract `s` from every column, matching on the row index.
# Rows where `s` is NaN come out as NaN.
print(df.sub(s, axis=0))
将函数应用于数据:
import numpy as np
import pandas as pd
# Six consecutive daily timestamps starting at 2013-01-01 (not months).
dates = pd.date_range('20130101', periods=6)
# Random 6x4 frame: `dates` is the row index, A-D are the column labels.
df = pd.DataFrame(
    np.random.randn(6, 4),
    index=dates,
    columns=['A', 'B', 'C', 'D'],
)
print(df)
print("----------列方向上,下一个元素的值是上面所有值的和-------------")
# Running total down each column.
print(df.apply(np.cumsum))
# NOTE(review): the banner below says "row direction", but apply() is
# called without an axis and the result is indexed by column label, so
# this is the max-min spread of each column.
print("----------行方向上,最大值 - 最小值---------------------------")
print(df.apply(lambda col: col.max() - col.min()))
输出结果:
A B C D
2013-01-01 0.638475 -0.213429 0.738899 -0.439204
2013-01-02 -0.100059 -0.911364 -0.761939 0.815867
2013-01-03 1.241903 -0.086499 -2.127611 0.695337
2013-01-04 -0.334616 -0.333722 1.458706 -0.865675
2013-01-05 1.135460 -0.913456 -0.253340 -0.784230
2013-01-06 -0.447679 0.162478 1.423114 0.908347
----------列方向上,下一个元素的值是上面所有值的和-------------
A B C D
2013-01-01 0.638475 -0.213429 0.738899 -0.439204
2013-01-02 0.538416 -1.124794 -0.023041 0.376664
2013-01-03 1.780319 -1.211293 -2.150652 1.072001
2013-01-04 1.445703 -1.545015 -0.691945 0.206326
2013-01-05 2.581163 -2.458471 -0.945286 -0.577904
2013-01-06 2.133485 -2.295993 0.477828 0.330443
----------行方向上,最大值 - 最小值---------------------------
A 1.689582
B 1.075934
C 3.586318
D 1.774022
dtype: float64
统计值出现的次数
import numpy as np
import pandas as pd
# Ten random integers drawn from 0..6 (the upper bound 7 is exclusive).
s = pd.Series(np.random.randint(low=0, high=7, size=10))
print(s)
print("--------------统计值出现的次数---------------")
# Frequency table: each distinct value with the number of times it occurs.
print(s.value_counts())
输出结果:
0 5
1 5
2 6
3 4
4 3
5 0
6 4
7 6
8 6
9 5
dtype: int32
--------------统计值出现的次数---------------
6 3
5 3
4 2
3 1
0 1
dtype: int64