DataFrame是二维数组,是Series容器。DataFrame既有行索引,也有列索引。
行索引,即横向索引,index,0轴,axis=0
列索引,即纵向索引,columns,1轴,axis=1
DataFrame创建
#方式1,使用numpy的方法创建
t = pd.DataFrame(np.arange(12).reshape(3,4))
print(t)
>>>
0 1 2 3
0 0 1 2 3
1 4 5 6 7
2 8 9 10 11
#指定行索引和列索引
t = pd.DataFrame(np.arange(12).reshape(3,4),index=['a','b','c'],columns=['d','e','f','g'])
print(t)
>>>
d e f g
a 0 1 2 3
b 4 5 6 7
c 8 9 10 11
#方式2,使用字典创建,键变成列索引
tmp_dict = {"name":["xiaobai","xiaohei"],"age":[10,20],"sex":["male","female"]}
df = pd.DataFrame(tmp_dict)
print(df)
>>>
name age sex
0 xiaobai 10 male
1 xiaohei 20 female
#方式3,使用list创建
tmp_list = [{"name":"xiaobai","age":10,"sex":"male"},{"name":"xiaohei","age":20,"sex":"female"},{"name":"xiaolan","age":15}]
df = pd.DataFrame(tmp_list)
print(df)
>>>
name age sex
0 xiaobai 10 male
1 xiaohei 20 female
2 xiaolan 15 NaN
DataFrame属性
tmp_list = [{"name":"xiaobai","age":10,"sex":"male"},{"name":"xiaohei","age":20,"sex":"female"},{"name":"xiaolan","age":15}]
df = pd.DataFrame(tmp_list)
print(df)
>>>
name age sex
0 xiaobai 10 male
1 xiaohei 20 female
2 xiaolan 15 NaN
print(df.index) #行索引
>>>RangeIndex(start=0, stop=3, step=1)
print(df.columns)#列索引
>>>Index(['name', 'age', 'sex'], dtype='object')
print(df.values,type(df.values))#值和值类型
>>>
[['xiaobai' 10 'male']
['xiaohei' 20 'female']
['xiaolan' 15 nan]] <class 'numpy.ndarray'>
print(df.shape)#形状
>>>(3, 3)
print(df.dtypes)#每列的数据类型
>>>
name object
age int64
sex object
dtype: object
print(df.ndim)#维度
>>>2
print(df.head(2))#不指定默认显示前5行
print(df.tail(2))#不指定默认显示后5行
print(df.info())#信息概览:行数量、列数量、列非空值个数、列数据类型,内存占用
>>>
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3 entries, 0 to 2
Data columns (total 3 columns):
name 3 non-null object
age 3 non-null int64
sex 2 non-null object
dtypes: int64(1), object(2)
memory usage: 200.0+ bytes
print(df.describe())#只对数值列统计
>>>
age
count 3.0
mean 15.0
std 5.0
min 10.0
25% 12.5
50% 15.0
75% 17.5
max 20.0
DataFrame按照某行或某列排序
tmp_list = pd.DataFrame(np.array([1,3,5,0,9,7,2,19,11,33,99,52]).reshape(3,4),columns=["A","B","C","D"],index=['a','b','c'])
df = pd.DataFrame(tmp_list)
print(df)
>>>
A B C D
a 1 3 5 0
b 9 7 2 19
c 11 33 99 52
#按照C列倒序
df = df.sort_values(by="C",ascending=False)#默认axis=0,即按指定列(C列)的值对行进行排序
print(df)
>>>
A B C D
c 11 33 99 52
a 1 3 5 0
b 9 7 2 19
#按照b行正序
df = df.sort_values(by="b",ascending=True,axis=1)
print(df)
>>>
C B A D
c 99 33 11 52
a 5 3 1 0
b 2 7 9 19
DataFrame取行或列,示例1
tmp_list = [{"name":"xiaobai","age":10,"sex":"male"},{"name":"xiaohei","age":20,"sex":"female"},
{"name":"xiaolan","age":15},{"name":"xiaohui","age":18,"sex":"female"},{"name":"xiaohong","age":19,"sex":"female"}]
df = pd.DataFrame(tmp_list)
print(df)
>>>
name age sex
0 xiaobai 10 male
1 xiaohei 20 female
2 xiaolan 15 NaN
3 xiaohui 18 female
4 xiaohong 19 female
#取name列中行索引为3的值(即第4行)
print(df['name'][3])
>>>
xiaohui
#取前两行数据
print(df[:2])
>>>
name age sex
0 xiaobai 10 male
1 xiaohei 20 female
#取name这一列数据,得到的是Series类型
print(df["name"],type(df["name"]))
>>>
0 xiaobai
1 xiaohei
2 xiaolan
3 xiaohui
4 xiaohong
Name: name, dtype: object <class 'pandas.core.series.Series'>
#取[0,2)行的name列
print(df[:2]["name"])
>>>
0 xiaobai
1 xiaohei
Name: name, dtype: object
DataFrame取行或列,示例2
t = pd.DataFrame(np.arange(12).reshape(3,4),index=["a","b","c"],columns=["A","B","C","D"])
print(t)
>>>
A B C D
a 0 1 2 3
b 4 5 6 7
c 8 9 10 11
#取a行B列的值
print(t.loc["a","B"])
print(type(t.loc["a","B"]))
>>>1
>>><class 'numpy.int32'>
#取a行,列取a行的全部列,方式1
print(t.loc["a"])
print(type(t.loc["a"]))
>>>
A 0
B 1
C 2
D 3
Name: a, dtype: int32
>>><class 'pandas.core.series.Series'>
#取a行,列取a行的全部列,方式2
print(t.loc["a",:])
print(type(t.loc["a",:]))
>>>
A 0
B 1
C 2
D 3
Name: a, dtype: int32
>>><class 'pandas.core.series.Series'>
#取B列,行取所有行
print(t.loc[:,"B"])
print(type(t.loc[:,"B"]))
>>>
a 1
b 5
c 9
Name: B, dtype: int32
>>><class 'pandas.core.series.Series'>
#取a行和b行,方式1
print(t.loc[["a","b"]])
>>>
A B C D
a 0 1 2 3
b 4 5 6 7
#取a行和b行,方式2
print(t.loc[["a","b"],:])
>>>
A B C D
a 0 1 2 3
b 4 5 6 7
#取A列和B列
print(t.loc[:,["A","B"]])
>>>
A B
a 0 1
b 4 5
c 8 9
#取a和c行 A和B列
print(t.loc[["a","c"],["A","B"]])
>>>
A B
a 0 1
c 8 9
#使用切片方式取a~c行,A~B列。注意:包含a、c,A、B
print(t.loc["a":"c","A":"B"])
>>>
A B
a 0 1
b 4 5
c 8 9
#iloc方法取所有行,第4列和第2列数据
print(t.iloc[:,[3,1]])
>>>
D B
a 3 1
b 7 5
c 11 9
#iloc取第1行和第3行,第4列和第2列数据
print(t.iloc[[0,2],[3,1]])
>>>
D B
a 3 1
c 11 9
#iloc取第2行之后,包含第2行。第3列之前,不包含第3列
print(t.iloc[1:,:2])
>>>
A B
b 4 5
c 8 9
#iloc取第2行之后,包含第2行。第3列之前,不包含第3列。赋值
t.iloc[1:,:2] = 100
print(t)
>>>
A B C D
a 0 1 2 3
b 100 100 6 7
c 100 100 10 11
DataFrame按条件筛选
tmp_list = [{"name":"xiaobai","age":10,"sex":"male"},{"name":"xiaohei","age":20,"sex":"female"},
{"name":"xiaolan","age":15},{"name":"xiaohui","age":18,"sex":"female"},{"name":"xiaohong","age":19,"sex":"female"}]
df = pd.DataFrame(tmp_list)
print(df)
>>>
name age sex
0 xiaobai 10 male
1 xiaohei 20 female
2 xiaolan 15 NaN
3 xiaohui 18 female
4 xiaohong 19 female
#查找age>15
df1 = df[df["age"] > 15]
print(df1)
>>>
name age sex
1 xiaohei 20 female
3 xiaohui 18 female
4 xiaohong 19 female
#查找age>15并且age<20
df1 = df[(df["age"] > 15)&(df["age"] < 20)]
print(df1)
>>>
name age sex
3 xiaohui 18 female
4 xiaohong 19 female
#查找age>15或age<13
df1 = df[(df["age"] > 15) | (df["age"] < 13)]
#查找name字符长度>7的数据
df1 = df[df["name"].str.len() >7]
print(df1)
>>>
name age sex
4 xiaohong 19 female