pandas DataFrame

DataFrame是二维数组,是Series容器。DataFrame既有行索引,也有列索引。
行索引,即横向索引,index,0轴,axis=0
列索引,即纵向索引,columns,1轴,axis=1

DataFrame创建

#以下示例均假设已执行 import numpy as np 和 import pandas as pd
#方式1,使用numpy的方法创建
t = pd.DataFrame(np.arange(12).reshape(3,4))
print(t)
>>>
   0  1   2   3
0  0  1   2   3
1  4  5   6   7
2  8  9  10  11

#指定行索引和列索引
t = pd.DataFrame(np.arange(12).reshape(3,4),index=['a','b','c'],columns=['d','e','f','g'])
print(t)
>>>
   d  e   f   g
a  0  1   2   3
b  4  5   6   7
c  8  9  10  11

#方式2,使用字典创建,键变成列索引
tmp_dict = {"name":["xiaobai","xiaohei"],"age":[10,20],"sex":["male","female"]}
df = pd.DataFrame(tmp_dict)
print(df)
>>>
      name  age     sex
0  xiaobai   10    male
1  xiaohei   20  female

#方式3,使用list创建
tmp_list = [{"name":"xiaobai","age":10,"sex":"male"},{"name":"xiaohei","age":20,"sex":"female"},{"name":"xiaolan","age":"15"}]
df = pd.DataFrame(tmp_list)
print(df)
>>>
      name age     sex
0  xiaobai  10    male
1  xiaohei  20  female
2  xiaolan  15     NaN

DataFrame属性

tmp_list = [{"name":"xiaobai","age":10,"sex":"male"},{"name":"xiaohei","age":20,"sex":"female"},{"name":"xiaolan","age":15}]
df = pd.DataFrame(tmp_list)
print(df)
>>>
      name  age     sex
0  xiaobai   10    male
1  xiaohei   20  female
2  xiaolan   15     NaN

print(df.index) #行索引
>>>RangeIndex(start=0, stop=3, step=1)

print(df.columns)#列索引
>>>Index(['name', 'age', 'sex'], dtype='object')

print(df.values,type(df.values))#值和值类型
>>>
[['xiaobai' 10 'male']
 ['xiaohei' 20 'female']
 ['xiaolan' 15 nan]] <class 'numpy.ndarray'>

print(df.shape)#形状
>>>(3, 3)

print(df.dtypes)#每列的数据类型
>>>
name    object
age      int64
sex     object
dtype: object

print(df.ndim)#维度
>>>2

print(df.head(2))#不指定默认显示前5行
print(df.tail(2))#不指定默认显示后5行

print(df.info())#信息概览:行数量、列数量、列非空值个数、列数据类型,内存占用
>>>
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3 entries, 0 to 2
Data columns (total 3 columns):
name    3 non-null object
age     3 non-null int64
sex     2 non-null object
dtypes: int64(1), object(2)
memory usage: 200.0+ bytes

print(df.describe())#只对数值列统计
>>>
        age
count   3.0
mean   15.0
std     5.0
min    10.0
25%    12.5
50%    15.0
75%    17.5
max    20.0

DataFrame按照某行或某列排序

tmp_list = pd.DataFrame(np.array([1,3,5,0,9,7,2,19,11,33,99,52]).reshape(3,4),columns=["A","B","C","D"],index=['a','b','c'])
df = pd.DataFrame(tmp_list)
print(df)
>>>
    A   B   C   D
a   1   3   5   0
b   9   7   2  19
c  11  33  99  52

#按照C列倒序
df = df.sort_values(by="C",ascending=False)#默认axis=0,即按某一列的值对行进行排序
print(df)
>>>
    A   B   C   D
c  11  33  99  52
a   1   3   5   0
b   9   7   2  19

#按照b行正序
df = df.sort_values(by="b",ascending=True,axis=1)
print(df)
>>>
    C   B   A   D
c  99  33  11  52
a   5   3   1   0
b   2   7   9  19

DataFrame取行或列,示例1

tmp_list = [{"name":"xiaobai","age":10,"sex":"male"},{"name":"xiaohei","age":20,"sex":"female"},
{"name":"xiaolan","age":15},{"name":"xiaohui","age":18,"sex":"female"},{"name":"xiaohong","age":19,"sex":"female"}]
df = pd.DataFrame(tmp_list)
print(df)
>>>
       name  age     sex
0   xiaobai   10    male
1   xiaohei   20  female
2   xiaolan   15     NaN
3   xiaohui   18  female
4  xiaohong   19  female

#取name列中行索引为3的值(即第4行)
print(df['name'][3])
>>>
xiaohui

#取两行数据
print(df[:2])
>>>
      name  age     sex
0  xiaobai   10    male
1  xiaohei   20  female

#取name这一列数据,得到的是Series类型
print(df["name"],type(df["name"]))
>>>
0     xiaobai
1     xiaohei
2     xiaolan
3     xiaohui
4    xiaohong
Name: name, dtype: object <class 'pandas.core.series.Series'>

#取[0,2)行的name列
print(df[:2]["name"])
>>>
0    xiaobai
1    xiaohei
Name: name, dtype: object

DataFrame取行或列,示例2

t = pd.DataFrame(np.arange(12).reshape(3,4),index=["a","b","c"],columns=["A","B","C","D"])
print(t)
>>>
   A  B   C   D
a  0  1   2   3
b  4  5   6   7
c  8  9  10  11

#取a行B列的值
print(t.loc["a","B"])
print(type(t.loc["a","B"]))
>>>1
>>><class 'numpy.int32'>

#取a行(包含该行的全部列),方式1
print(t.loc["a"])
print(type(t.loc["a"]))
>>>
A    0
B    1
C    2
D    3
Name: a, dtype: int32
>>><class 'pandas.core.series.Series'>

#取a行(包含该行的全部列),方式2
print(t.loc["a",:])
print(type(t.loc["a",:]))
>>>
A    0
B    1
C    2
D    3
Name: a, dtype: int32
>>><class 'pandas.core.series.Series'>

#取B列,行取所有行
print(t.loc[:,"B"])
print(type(t.loc[:,"B"]))
>>>
a    1
b    5
c    9
Name: B, dtype: int32
>>><class 'pandas.core.series.Series'>

#取a行和b行,方式1
print(t.loc[["a","b"]])
>>>
   A  B  C  D
a  0  1  2  3
b  4  5  6  7

#取a行和b行,方式2
print(t.loc[["a","b"],:])
>>>
   A  B  C  D
a  0  1  2  3
b  4  5  6  7

#取A列和B列
print(t.loc[:,["A","B"]])
>>>
   A  B
a  0  1
b  4  5
c  8  9

#取a和c行  A和B列
print(t.loc[["a","c"],["A","B"]])
>>>
   A  B
a  0  1
c  8  9

#使用切片方式取a~c行,A~B列。注意:loc按标签切片时首尾都包含,即结果包含a、c行和A、B列
print(t.loc["a":"c","A":"B"])
>>>
   A  B
a  0  1
b  4  5
c  8  9

#iloc方法取所有行,第4列和第2列数据
print(t.iloc[:,[3,1]])
>>>
    D  B
a   3  1
b   7  5
c  11  9

#iloc取第1行和第3行,第4列和第2列数据
print(t.iloc[[0,2],[3,1]])
>>>
    D  B
a   3  1
c  11  9

#iloc取第2行之后,包含第2行。第3列之前,不包含第3列
print(t.iloc[1:,:2])
>>>
   A  B
b  4  5
c  8  9

#iloc取第2行之后,包含第2行。第3列之前,不包含第3列。赋值
t.iloc[1:,:2] = 100
print(t)
>>>
     A    B   C   D
a    0    1   2   3
b  100  100   6   7
c  100  100  10  11

DataFrame按条件筛选

tmp_list = [{"name":"xiaobai","age":10,"sex":"male"},{"name":"xiaohei","age":20,"sex":"female"},
{"name":"xiaolan","age":15},{"name":"xiaohui","age":18,"sex":"female"},{"name":"xiaohong","age":19,"sex":"female"}]
df = pd.DataFrame(tmp_list)
print(df)
>>>
       name  age     sex
0   xiaobai   10    male
1   xiaohei   20  female
2   xiaolan   15     NaN
3   xiaohui   18  female
4  xiaohong   19  female

#查找age>15
df1 = df[df["age"] > 15]
print(df1)
>>>
       name  age     sex
1   xiaohei   20  female
3   xiaohui   18  female
4  xiaohong   19  female

#查找age>15并且age<20
df1 = df[(df["age"] > 15)&(df["age"] < 20)]
print(df1)
>>>
       name  age     sex
3   xiaohui   18  female
4  xiaohong   19  female

#查找age>15或age<13
df1 = df[(df["age"] > 15) | (df["age"] < 13)]

#查找name字符长度>7的数据
df1 = df[df["name"].str.len() >7]
print(df1)
>>>
       name  age     sex
4  xiaohong   19  female

你可能感兴趣的:(pandas DataFrame)