DataFrame既有行索引也有列索引,可以被看作由Series组成的字典(这些Series共用同一个行索引)
选择列 / 选择行 / 切片 / 布尔判断
# Build a 3x4 frame of random values scaled by 100, with string row labels
# and named columns, to demonstrate selecting columns and selecting rows.
df = pd.DataFrame(np.random.rand(12).reshape(3,4)*100,
                  index = ['one','two','three'],
                  columns = ['a','b','c','d'])
print(df)

# Select by column name: a single name yields a Series,
# a list of names yields a DataFrame.
data1 = df['a']
data2 = df[['a','c']]
print(data1,type(data1))
print(data2,type(data2))
print('-----')

# Select rows by index label with .loc: a single label yields a Series,
# a list of labels yields a DataFrame.
data3 = df.loc['one']
data4 = df.loc[['one','two']]
# Print each row selection alongside its own type.
print(data3,type(data3))
print(data4,type(data4))
-----------------------------------------------------------------------
# df[...] is normally used to select columns, but a slice selects rows.
df = pd.DataFrame(
    np.random.rand(3, 4) * 100,
    index=['one', 'two', 'three'],
    columns=['a', 'b', 'c', 'd'],
)
print(df)
print('-----')

# df[...] selects columns by default, so column names go inside the brackets.
# (This is why columns are usually given explicit names instead of the default
# integer names: it avoids confusion with the index.)
# One name -> Series; a list of names -> DataFrame.
data1 = df['a']
data2 = df[['b', 'c']]  # experiment: also try df[['b', 'c', 'e']]
print(data1)
print(data2)

# When the brackets hold integers, rows are selected, and only as a slice:
# a single position (df[0]) is not allowed, and neither is an index label
# (df['one']). The result is a DataFrame even when only one row is selected.
data3 = df[:1]
print(data3, type(data3))

# Key takeaway: df[col] is for selecting columns; put column names in [].
-----------------------------------------------------------------------
a b c d
one 88.490183 93.588825 1.605172 74.610087
two 45.905361 49.257001 87.852426 97.490521
three 95.801001 97.991028 74.451954 64.290587
-----
one 88.490183
two 45.905361
three 95.801001
Name: a, dtype: float64
b c
one 93.588825 1.605172
two 49.257001 87.852426
three 97.991028 74.451954
a b c d
one 88.490183 93.588825 1.605172 74.610087 <class 'pandas.core.frame.DataFrame'>
# df1 carries explicit string row labels; df2 keeps the default integer index.
df1 = pd.DataFrame(np.random.rand(16).reshape(4,4)*100,
                   index = ['one','two','three','four'],
                   columns = ['a','b','c','d'])
df2 = pd.DataFrame(np.random.rand(16).reshape(4,4)*100,
                   columns = ['a','b','c','d'])
print(df1)
print(df2)
print('-----')

# A single label returns that row as a Series.
data1 = df1.loc['one']
data2 = df2.loc[1]
print(data1)
print(data2)
print('单标签索引\n-----')

# A list of labels returns a DataFrame whose rows follow the requested order.
# 'five' is not in df1's index: passing a list with a missing label to .loc
# raises KeyError on pandas >= 1.0, so reindex is used here to obtain the
# all-NaN row for the missing label instead.
data3 = df1.reindex(['two','three','five'])
data4 = df2.loc[[3,2,1]]
print(data3)
print(data4)
print('多标签索引\n-----')

# .loc also accepts label slices; unlike positional slices,
# the end label is included.
data5 = df1.loc['one':'three']
data6 = df2.loc[1:3]
print(data5)
print(data6)
print('切片索引')

# Key takeaway: df.loc[label] selects rows by index label and works with both
# an explicitly specified index and the default integer index.
-----------------------------------------------------------------------
a b c d
one 73.070679 7.169884 80.820532 62.299367
two 34.025462 77.849955 96.160170 55.159017
three 27.897582 39.595687 69.280955 49.477429
four 76.723039 44.995970 22.408450 23.273089
a b c d
0 93.871055 28.031989 57.093181 34.695293
1 22.882809 47.499852 86.466393 86.140909
2 80.840336 98.120735 84.495414 8.413039
3 59.695834 1.478707 15.069485 48.775008
-----
a 73.070679
b 7.169884
c 80.820532
d 62.299367
Name: one, dtype: float64
a 22.882809
b 47.499852
c 86.466393
d 86.140909
Name: 1, dtype: float64
单标签索引
-----
a b c d
two 34.025462 77.849955 96.160170 55.159017
three 27.897582 39.595687 69.280955 49.477429
five NaN NaN NaN NaN
a b c d
3 59.695834 1.478707 15.069485 48.775008
2 80.840336 98.120735 84.495414 8.413039
1 22.882809 47.499852 86.466393 86.140909
多标签索引
-----
a b c d
one 73.070679 7.169884 80.820532 62.299367
two 34.025462 77.849955 96.160170 55.159017
three 27.897582 39.595687 69.280955 49.477429
a b c d
1 22.882809 47.499852 86.466393 86.140909
2 80.840336 98.120735 84.495414 8.413039
3 59.695834 1.478707 15.069485 48.775008
切片索引
# .iloc indexes like a list: by integer position in the frame, counting from 0.
df = pd.DataFrame(
    np.random.rand(4, 4) * 100,
    index=['one', 'two', 'three', 'four'],
    columns=['a', 'b', 'c', 'd'],
)
print(df)
print('------')

# Single position: first row, then last row.
# Unlike label lookups, a position beyond the number of rows (for example
# df.iloc[4] on this 4-row frame) cannot be indexed.
for position in (0, -1):
    print(df.iloc[position])
print('单位置索引\n-----')

# List of positions: rows come back in the order requested.
for positions in ([0, 2], [3, 2, 1]):
    print(df.iloc[positions])
print('多位置索引\n-----')

# Positional slices: the end position is excluded.
for window in (slice(1, 3), slice(None, None, 2)):
    print(df.iloc[window])
print('切片索引')
-----------------------------------------------------------------------
a b c d
one 21.848926 2.482328 17.338355 73.014166
two 99.092794 0.601173 18.598736 61.166478
three 87.183015 85.973426 48.839267 99.930097
four 75.007726 84.208576 69.445779 75.546038
------
a 21.848926
b 2.482328
c 17.338355
d 73.014166
Name: one, dtype: float64
a 75.007726
b 84.208576
c 69.445779
d 75.546038
Name: four, dtype: float64
单位置索引
-----
a b c d
one 21.848926 2.482328 17.338355 73.014166
three 87.183015 85.973426 48.839267 99.930097
a b c d
four 75.007726 84.208576 69.445779 75.546038
three 87.183015 85.973426 48.839267 99.930097
two 99.092794 0.601173 18.598736 61.166478
多位置索引
-----
a b c d
two 99.092794 0.601173 18.598736 61.166478
three 87.183015 85.973426 48.839267 99.930097
a b c d
one 21.848926 2.482328 17.338355 73.014166
three 87.183015 85.973426 48.839267 99.930097
切片索引
# Boolean masks: compare values, then index the frame with the mask.
df = pd.DataFrame(
    np.random.rand(4, 4) * 100,
    index=['one', 'two', 'three', 'four'],
    columns=['a', 'b', 'c', 'd'],
)
print(df)
print('------')

# Comparing the whole frame tests every value.
# Indexing with that mask keeps the full shape: True cells keep their value,
# False cells become NaN. Equivalent to df[df < 20].
b1 = df.lt(20)
print(b1, type(b1))
print(df[b1])
print('------')

# Comparing one column gives a boolean Series.
# Indexing with it keeps the rows where it is True, with all their columns.
# Equivalent to df[df['a'] > 50].
b2 = df['a'].gt(50)
print(b2, type(b2))
print(df[b2])
print('------')

# Comparing several columns gives a boolean DataFrame over those columns.
# Indexing keeps the full shape: True cells keep their value, the rest is NaN.
# Equivalent to df[df[['a','b']] > 50].
b3 = df[['a', 'b']].gt(50)
print(b3, type(b3))
print(df[b3])
print('------')

# Comparing several rows gives a boolean DataFrame over those rows.
# Indexing keeps the full shape: True cells keep their value, the rest is NaN.
# Equivalent to df[df.loc[['one','three']] < 50].
b4 = df.loc[['one', 'three']].lt(50)
print(b4, type(b4))
print(df[b4])
print('------')
-----------------------------------------------------------------------
a b c d
one 19.185849 20.303217 21.800384 45.189534
two 50.105112 28.478878 93.669529 90.029489
three 35.496053 19.248457 74.811841 20.711431
four 24.604478 57.731456 49.682717 82.132866
------
a b c d
one True False False False
two False False False False
three False True False False
four False False False False <class 'pandas.core.frame.DataFrame'>
a b c d
one 19.185849 NaN NaN NaN
two NaN NaN NaN NaN
three NaN 19.248457 NaN NaN
four NaN NaN NaN NaN
------
one False
two True
three False
four False
Name: a, dtype: bool <class 'pandas.core.series.Series'>
a b c d
two 50.105112 28.478878 93.669529 90.029489
------
a b
one False False
two True False
three False False
four False True <class 'pandas.core.frame.DataFrame'>
a b c d
one NaN NaN NaN NaN
two 50.105112 NaN NaN NaN
three NaN NaN NaN NaN
four NaN 57.731456 NaN NaN
------
a b c d
one True True True True
three True True False True <class 'pandas.core.frame.DataFrame'>
a b c d
one 19.185849 20.303217 21.800384 45.189534
two NaN NaN NaN NaN
three 35.496053 19.248457 NaN 20.711431
four NaN NaN NaN NaN
------
# Chained selection: pick columns first, then rows — i.e. filter the fields
# of a dataset first, then decide how many records to keep.
df = pd.DataFrame(
    np.random.rand(4, 4) * 100,
    index=['one', 'two', 'three', 'four'],
    columns=['a', 'b', 'c', 'd'],
)
print(df)
print('------')

# Column a, rows one and three.
column_a = df['a']
print(column_a.loc[['one', 'three']])

# Columns b, c, d, every second row (rows one and three).
columns_bcd = df[['b', 'c', 'd']]
print(columns_bcd.iloc[::2])

# First two rows among those where column a is below 50.
rows_a_below_50 = df[df['a'] < 50]
print(rows_a_below_50.iloc[:2])
-----------------------------------------------------------------------
a b c d
one 50.660904 89.827374 51.096827 3.844736
two 70.699721 78.750014 52.988276 48.833037
three 33.653032 27.225202 24.864712 29.662736
four 21.792339 26.450939 6.122134 52.323963
------
one 50.660904
three 33.653032
Name: a, dtype: float64
b c d
one 89.827374 51.096827 3.844736
three 27.225202 24.864712 29.662736
a b c d
three 33.653032 27.225202 24.864712 29.662736
four 21.792339 26.450939 6.122134 52.323963
作业1:如图创建Dataframe(4*4,值为0-100的随机数),通过索引得到以下值
① 索引得到b,c列的所有值
② 索引得到第三第四行的数据
③ 按顺序索引得到two,one行的值
④ 索引得到大于50的值
# Exercise solution: build a 4x4 frame of random values scaled by 100 and
# pull out the requested selections. df and df1 are two names for one frame.
df1 = pd.DataFrame(
    np.random.rand(4, 4) * 100,
    index=['one', 'two', 'three', 'four'],
    columns=['a', 'b', 'c', 'd'],
)
df = df1
print('创建Dataframe为:\n', df, '\n-------')

# (1) All values of columns b and c.
print('b,c列的所有值为:\n', df.loc[:, ['b', 'c']], '\n------')

# (2) The third and fourth rows, by position.
print('第三第四行的值为:\n', df.iloc[2:4], '\n------')

# (3) Rows two and one, in that order.
print('按顺序索引得到two,one行的值为:\n', df.loc[['two', 'one']], '\n------')

# (4) Values greater than 50; every other cell shows as NaN.
print('大于50的值为:\n', df[df > 50], '\n------')