'''
【课程2.2】 Pandas数据结构Series:基本概念及创建
"一维数组"Series
'''
'\n【课程2.2】 Pandas数据结构Series:基本概念及创建\n\n"一维数组"Serise\n\n'
import numpy as np
import pandas as pd
# A Series built from a 1-D array with no explicit index; the captured output
# below shows pandas labelling the rows 0..4.
s = pd.Series(data=np.random.rand(5))
# One print with a newline separator emits the same three lines as three prints.
print(s, '------', type(s), sep='\n')
0 0.830450
1 0.674102
2 0.528299
3 0.150878
4 0.952043
dtype: float64
------
# Series from a dict: the keys become the index labels (str and int keys are
# mixed here) and the values become the data.
dic = {
    'a': 1,
    'b': 'hello',
    'c': 3,
    4: 4,
    5: 5,
}
s = pd.Series(data=dic)
print(s)
a 1
b hello
c 3
4 4
5 5
dtype: object
# Series from a 1-D ndarray, first with the default index, then with explicit
# labels and an explicit dtype.
arr = np.random.randn(5)
s = pd.Series(arr)
print(arr)
print(s)

# Constructor signature as noted by the original author:
#   pd.Series(data=None, index=None, dtype=None, name=None,
#             copy=False, fastpath=False)

# The `np.str` alias was deprecated in NumPy 1.20 and removed in 1.24 (it now
# raises AttributeError); the builtin `str` is the exact replacement.
s = pd.Series(arr, index=list('abcde'), dtype=str)
print(s)
[ 0.26383154 0.97382125 0.13994526 -0.60732141 1.32883897]
0 0.263832
1 0.973821
2 0.139945
3 -0.607321
4 1.328839
dtype: float64
a 0.26383153983884505
b 0.9738212455558085
c 0.1399452564766354
d -0.6073214101102407
e 1.3288389721491793
dtype: object
# Series from a scalar: the value is repeated once per index label.
s = pd.Series(data=10, index=range(4))
print(s)
0 10
1 10
2 10
3 10
dtype: int64
# The `name` attribute: unset on s1, set at construction on s2.
s1 = pd.Series(data=np.random.rand(5))
print(s1, '-----', sep='\n')
s2 = pd.Series(data=np.random.rand(5), name="test")
print(s2)
print(f"{s1.name} {s2.name} {type(s2.name)}")
# rename() returns a new Series; s2 keeps its original name.
s3 = s2.rename('xjxj')
print(s3, s3 is s2, sep='\n')
print(f"{s3.name} {s2.name}")
0 0.603552
1 0.007823
2 0.581088
3 0.262479
4 0.366710
dtype: float64
-----
0 0.638098
1 0.012841
2 0.659852
3 0.009916
4 0.444856
Name: test, dtype: float64
None test
0 0.638098
1 0.012841
2 0.659852
3 0.009916
4 0.444856
Name: xjxj, dtype: float64
False
xjxj test
# The same named Series built two ways: from a dict, and from an ndarray plus
# an explicit tuple of labels.
dic = dict(Jack=90.0, Marry=92, Tom=89, Zack=65)
s1 = pd.Series(data=dic, name="作业1")
print(s1)
ar = np.array((90.0, 92, 89, 65))
s2 = pd.Series(
    data=ar,
    index=('Jack', 'Marry', "Tom", 'Zack'),
    name="作业1",
)
print(s2)
Jack 90.0
Marry 92.0
Tom 89.0
Zack 65.0
Name: 作业1, dtype: float64
Jack 90.0
Marry 92.0
Tom 89.0
Zack 65.0
Name: 作业1, dtype: float64
'''
【课程2.3】 Pandas数据结构Series:索引
位置下标 / 标签索引 / 切片索引 / 布尔型索引
'''
'\n【课程2.3】 Pandas数据结构Series:索引\n\n位置下标 / 标签索引 / 切片索引 / 布尔型索引\n\n'
# Fetching a single element from a default-indexed Series yields a NumPy
# scalar; float() converts it to a plain Python float.
s = pd.Series(data=np.random.rand(5))
_first = s[0]
print(_first, type(_first), _first.dtype)
_as_float = float(_first)
print(_as_float, type(_as_float))
0.6358412386028008 float64
0.6358412386028008
# Label indexing: a single label gives a scalar, a list of labels gives a
# new Series restricted to those labels.
s = pd.Series(data=np.random.rand(5), index=['a', 'b', 'c', 'd', 'e'])
print(s)
_a = s['a']
print(_a, type(_a), _a.dtype)
sci = s[['a', 'b', 'c']]
print(f"{sci} {type(sci)}")
a 0.541327
b 0.810801
c 0.296037
d 0.794296
e 0.899370
dtype: float64
0.5413267940720663 float64
a 0.541327
b 0.810801
c 0.296037
dtype: float64
# Slicing an integer-indexed Series (s1) and a string-labelled Series (s2).
s1 = pd.Series(np.random.randint(10, size=5))
s2 = pd.Series(np.random.randint(10, size=5), index=list("abcde"))
# s1 carries the default integer index: .iloc slices by position and
# .loc[2] fetches the row labelled 2.
print(s1.iloc[1:4], '\n', s1.iloc[1:-1], '\n', s1.loc[2])
print('---')
# A label slice includes both end points ('a' and 'b').
print(s2.loc["a":"b"], '\n', s2['a'])
print('---')
# Positional access on a string-labelled Series goes through .iloc: plain
# s2[2] relied on an integer-key fallback that pandas has deprecated.
print(s2.iloc[1:-1], '\n', s2.iloc[2])
1 3
2 5
3 0
dtype: int32
1 3
2 5
3 0
dtype: int32
5
---
a 5
b 9
dtype: int32
5
---
b 9
c 5
d 2
dtype: int32
5
# Boolean indexing: comparisons and null checks give bool Series that can be
# used as row filters.
s = pd.Series(np.random.rand(3) * 100)
s[4] = None  # adds a missing value under the new label 4
print(s)
bs1 = s > 50
bs2 = s.isnull()
bs3 = s.notnull()
for _mask in (bs1, bs2, bs3):
    print(_mask, type(_mask), _mask.dtype)
print('-----', s > 50, '-----', s[s > 50], '-----', s[bs3], sep='\n')
0 12.5679
1 73.037
2 69.7116
4 None
dtype: object
0 False
1 True
2 True
4 False
dtype: bool bool
0 False
1 False
2 False
4 True
dtype: bool bool
0 True
1 True
2 True
4 False
dtype: bool bool
-----
0 False
1 True
2 True
4 False
dtype: bool
-----
1 73.037
2 69.7116
dtype: object
-----
0 12.5679
1 73.037
2 69.7116
dtype: object
# Exercise: label lookup, positional selection and a boolean filter on a
# string-labelled Series.
s = pd.Series(np.random.rand(10) * 100, index=list('abcdefghij'))
print(s)
print('-------')
print(s['b'], s['c'])
print('-------')
# Positions 4..6 are selected through .iloc. Integer keys passed to plain []
# on a string index (s[[4, 5, 6]]) used a positional fallback that pandas has
# deprecated.
print(s.iloc[4:7])
print(s.iloc[[4, 5, 6]])
print('-------')
print(s[s > 50])
a 47.610866
b 32.879041
c 60.843136
d 25.798653
e 16.734771
f 72.011496
g 13.186102
h 67.730150
i 28.785863
j 82.482446
dtype: float64
-------
32.87904131859861 60.84313579685892
-------
e 16.734771
f 72.011496
g 13.186102
dtype: float64
e 16.734771
f 72.011496
g 13.186102
dtype: float64
-------
c 60.843136
f 72.011496
h 67.730150
j 82.482446
dtype: float64
'''
【课程2.4】 Pandas数据结构Series:基本技巧
数据查看 / 重新索引 / 对齐 / 添加、修改、删除值
'''
'\n【课程2.4】 Pandas数据结构Series:基本技巧\n\n数据查看 / 重新索引 / 对齐 / 添加、修改、删除值\n\n'
# Peek at the first 10 rows and at the last rows (tail's default count).
s = pd.Series(data=np.random.rand(50))
print(s.head(n=10), s.tail(), sep='\n')
0 0.583793
1 0.340821
2 0.153140
3 0.726648
4 0.482695
5 0.652023
6 0.328461
7 0.177034
8 0.217062
9 0.341393
dtype: float64
45 0.425366
46 0.712421
47 0.423743
48 0.980984
49 0.146227
dtype: float64
# reindex() returns a new Series arranged by the requested labels; labels the
# original does not have are filled with NaN or with `fill_value`.
s = pd.Series(np.random.rand(3), index=list('abc'))
print(s)
s1 = s.reindex(list('cbad'))
print(s1)
# A numeric fill value keeps the result float64. The previous string '666'
# silently turned the whole Series into object dtype.
s2 = s.reindex(list('cbad'), fill_value=666)
print(s2)
a 0.015252
b 0.631593
c 0.795722
dtype: float64
c 0.795722
b 0.631593
a 0.015252
d NaN
dtype: float64
c 0.795722
b 0.631593
a 0.0152518
d 666
dtype: object
# Arithmetic aligns on labels, not on position; a label present on only one
# side ('d') comes out as NaN, as the captured output shows.
s1 = pd.Series(data=np.random.rand(3), index=['a', 'b', 'c'])
s2 = pd.Series(data=np.random.rand(4), index=['a', 'c', 'b', 'd'])
print(s1, s2, s1 + s2, sep='\n')
a 0.277334
b 0.088585
c 0.551871
dtype: float64
a 0.222483
c 0.413765
b 0.457875
d 0.072471
dtype: float64
a 0.499817
b 0.546460
c 0.965637
d NaN
dtype: float64
# drop(): returns a new Series by default, or mutates in place with inplace=True.
s = pd.Series(np.random.rand(5), index=list("abcde"))
print(s)
s1 = s.drop('a')            # new Series, s is untouched
s2 = s.drop(['a', 'b'])     # new Series, s is untouched
# With inplace=True, drop() mutates s and returns None, so its result must not
# be bound to a name (the previous code left s2 as None).
s.drop(['a', 'b'], inplace=True)
print(s1)
print(s2)
print(s)
a 0.073533
b 0.141870
c 0.129170
d 0.714398
e 0.778901
dtype: float64
b 0.141870
c 0.129170
d 0.714398
e 0.778901
dtype: float64
None
c 0.129170
d 0.714398
e 0.778901
dtype: float64
# Adding values: assigning to a new label enlarges the Series in place;
# concatenation builds a new, longer Series.
s = pd.Series(np.random.rand(5), index=list('ngjur'))
s['h'] = 2
print(s)
# Series.append was removed in pandas 2.0; pd.concat is the replacement and,
# like append, keeps the (now duplicated) labels of both inputs.
s3 = pd.concat([s, s])
print(s3)
# NOTE(review): s1 is not assigned in this cell; this prints whatever an
# earlier cell bound to s1. Confirm whether s or s3 was intended here.
print(s1)
n 0.351741
g 0.209274
j 0.333868
u 0.515066
r 0.481014
h 2.000000
dtype: float64
n 0.351741
g 0.209274
j 0.333868
u 0.515066
r 0.481014
h 2.000000
n 0.351741
g 0.209274
j 0.333868
u 0.515066
r 0.481014
h 2.000000
dtype: float64
b 0.141870
c 0.129170
d 0.714398
e 0.778901
dtype: float64
# Modifying values by position and by label.
s = pd.Series(np.random.rand(3), index=['a', 'b', 'c'])
print(s)
# Positional writes go through .iloc. On a string-labelled Series, s[0] = 1
# relied on a deprecated integer-key fallback, and treated as a label it would
# append a new row 0 instead of overwriting 'a'.
s.iloc[0] = 1
s['b'] = 2
print(s)
a 0.703114
b 0.998844
c 0.777766
dtype: float64
a 1.000000
b 2.000000
c 0.777766
dtype: float64
import pandas as pd
import numpy as np
# Exercise part 1: drop label 'b', then overwrite three labels with 100.
s1 = pd.Series(data=range(10), index=list('abcdefghij'))
s1.drop(labels='b', inplace=True)
s1[['a', 'e', 'f']] = [100, 100, 100]
print(s1)
# Exercise part 2: add two Series whose labels only partly overlap.
s1 = pd.Series(data=np.random.rand(5) * 10, index=['a', 'b', 'c', 'd', 'e'])
s2 = pd.Series(data=np.random.rand(5) * 10, index=['c', 'd', 'e', 'f', 'g'])
_total = s1 + s2
print(_total)
a 100
c 2
d 3
e 100
f 100
g 6
h 7
i 8
j 9
dtype: int64
a NaN
b NaN
c 2.104558
d 13.033831
e 17.870648
f NaN
g NaN
dtype: float64
'''
【课程2.5】 Pandas数据结构Dataframe:基本概念及创建
"二维数组"Dataframe:是一个表格型的数据结构,包含一组有序的列,其列的值类型可以是数值、字符串、布尔值等。
Dataframe中的数据以一个或多个二维块存放,不是列表、字典或一维数组结构。
'''
'\n【课程2.5】 Pandas数据结构Dataframe:基本概念及创建\n\n"二维数组"Dataframe:是一个表格型的数据结构,包含一组有序的列,其列的值类型可以是数值、字符串、布尔值等。\n\nDataframe中的数据以一个或多个二维块存放,不是列表、字典或一维数组结构。\n\n'
# A DataFrame from a dict of equal-length lists, followed by a look at its
# three main parts: row index, column index and the underlying values.
data = dict(
    name=['Jack', 'Tom', 'Mary'],
    age=[18, 19, 20],
    gender=['m', 'm', 'w'],
)
frame = pd.DataFrame(data=data)
print(frame, type(frame), sep='\n')
for _attr in ('index', 'columns', 'values'):
    _part = getattr(frame, _attr)
    print(_part, type(_part))
name age gender
0 Jack 18 m
1 Tom 19 m
2 Mary 20 w
RangeIndex(start=0, stop=3, step=1)
Index(['name', 'age', 'gender'], dtype='object')
[['Jack' 18 'm']
['Tom' 19 'm']
['Mary' 20 'w']]
# DataFrame from dicts of lists / arrays, plus the `columns=` and `index=`
# constructor arguments.
data1 = {'a': [1, 2, 3], 'b': [3, 4, 5], 'c': [5, 6, 7]}
data2 = {'one': np.random.rand(3), 'two': np.random.rand(3)}
df1 = pd.DataFrame(data=data1)
df2 = pd.DataFrame(data=data2)
print(df1, df2, sep='\n')
# `columns=` picks and orders the dict keys; a name missing from the dict
# ('d', 'one') shows up as an all-NaN column in the captured output.
df1 = pd.DataFrame(data=data1, columns=list('abcd'))
print(df1)
df1 = pd.DataFrame(data=data1, columns=['one', 'c'])
print(df1)
# `index=` supplies the row labels.
df2 = pd.DataFrame(data=data2, index=['f1', 'f2', 'f3'])
print(df2)
a b c
0 1 3 5
1 2 4 6
2 3 5 7
one two
0 0.994078 0.172028
1 0.935417 0.363636
2 0.956896 0.686517
a b c d
0 1 3 5 NaN
1 2 4 6 NaN
2 3 5 7 NaN
one c
0 NaN 5
1 NaN 6
2 NaN 7
one two
f1 0.994078 0.172028
f2 0.935417 0.363636
f3 0.956896 0.686517
# DataFrame from a dict of Series of different lengths: rows are the union of
# the Series' labels and the shorter column is padded with NaN.
data1 = dict(
    one=pd.Series(np.random.rand(2)),
    two=pd.Series(np.random.rand(3)),
)
data2 = dict(
    one=pd.Series(np.random.rand(2), index=['a', 'b']),
    two=pd.Series(np.random.rand(3), index=['a', 'b', 'c']),
)
df1 = pd.DataFrame(data=data1)
df2 = pd.DataFrame(data=data2)
print(df1, df2, sep='\n')
one two
0 0.348506 0.746226
1 0.614500 0.982886
2 NaN 0.296487
one two
a 0.801257 0.886317
b 0.263036 0.620650
c NaN 0.464100
# DataFrame from a 2-D ndarray: default integer labels on both axes unless
# index / columns are supplied.
ar = np.random.rand(9).reshape((3, 3))
print(ar)
df1 = pd.DataFrame(data=ar)
df2 = pd.DataFrame(
    data=ar,
    index=list('abc'),
    columns=['one', 'two', 'three'],
)
print(df1, df2, sep='\n')
[[0.38480599 0.50341925 0.4640469 ]
[0.51968901 0.83803468 0.80929611]
[0.77316926 0.38704004 0.6013333 ]]
0 1 2
0 0.384806 0.503419 0.464047
1 0.519689 0.838035 0.809296
2 0.773169 0.387040 0.601333
one two three
a 0.384806 0.503419 0.464047
b 0.519689 0.838035 0.809296
c 0.773169 0.387040 0.601333
# DataFrame from a list of row dicts: each dict is one row, keys become the
# columns, and a key missing from a row becomes NaN.
data = [dict(a=1, b=2), dict(a=1, b=2, c=3)]
df1 = pd.DataFrame(data=data)
df2 = pd.DataFrame(data=data, index=['1st', '2nd'])
# Column names that match no dict key give an all-NaN frame.
df3 = pd.DataFrame(data=data, columns=list('123'))
print(df1, df2, df3, sep='\n')
a b c
0 1 2 NaN
1 1 2 3.0
a b c
1st 1 2 NaN
2nd 1 2 3.0
1 2 3
0 NaN NaN NaN
1 NaN NaN NaN
# DataFrame from a dict of dicts: outer keys become columns, inner keys become
# row labels.
data = {
    'Jack': dict(math=90, english=89, art=78),
    'Marry': dict(math=82, english=95, art=92),
    'Tom': dict(math=78, english=67),
}
df1 = pd.DataFrame(data=data)
print(df1)
# `columns=` selects outer keys ('Bob' is absent, so that column is NaN).
df2 = pd.DataFrame(data=data, columns=['Jack', 'Tom', 'Bob'])
# `index=` selects inner keys; none of 'a', 'b', 'c' exist, so all is NaN.
df3 = pd.DataFrame(data=data, index=list('abc'))
print(df2, df3, sep='\n')
Jack Marry Tom
art 78 92 NaN
english 89 95 67.0
math 90 82 78.0
Jack Tom Bob
art 78 NaN NaN
english 89 67.0 NaN
math 90 78.0 NaN
Jack Marry Tom
a NaN NaN NaN
b NaN NaN NaN
c NaN NaN NaN
import pandas as pd
import numpy as np
# Exercise: build a 5x4 frame (rows a-e, columns four/one/three/two) with each
# of the five construction methods.
lst1 = np.random.randint(1, 10, size=5)
lst2 = np.random.randint(1, 10, size=5)
lst3 = np.random.randint(1, 10, size=5)
lst4 = np.random.randint(1, 10, size=5)

# 1) dict of arrays + explicit row labels
dic = {'four': lst1, 'one': lst2, 'three': lst3, 'two': lst4}
df1 = pd.DataFrame(dic, index=list('abcde'))
print(df1)

# 2) dict of Series (the row labels come from the Series themselves)
s1 = pd.Series(data=lst1, index=list('abcde'))
s2 = pd.Series(data=lst2, index=list('abcde'))
s3 = pd.Series(data=lst3, index=list('abcde'))
s4 = pd.Series(data=lst4, index=list('abcde'))
dic = {'four': s1, 'one': s2, 'three': s3, 'two': s4}
df2 = pd.DataFrame(dic)
print(df2)

# 3) list of row dicts (five identical rows)
lst = [{'four': 4, 'one': 1, "three": 3, 'two': 2} for _ in range(5)]
df3 = pd.DataFrame(lst, index=list('abcde'))
print(df3)

# 4) 2-D ndarray. The last column is labelled 'two'; the previous code repeated
# 'one', producing a duplicate column label unlike the other four frames.
ar = np.random.randint(0, 10, size=(5, 4))
df4 = pd.DataFrame(ar, index=list('abcde'),
                   columns=['four', 'one', 'three', 'two'])
print(df4)

# 5) dict of dicts (every column holds the same five labelled values)
data = {
    col: {'a': 90, 'b': 89, 'c': 78, 'd': 5, 'e': 6}
    for col in ('four', 'one', 'three', 'two')
}
df5 = pd.DataFrame(data)
print(df5)
four one three two
a 9 7 4 3
b 9 8 9 4
c 8 9 5 8
d 6 5 5 4
e 2 6 9 7
four one three two
a 9 7 4 3
b 9 8 9 4
c 8 9 5 8
d 6 5 5 4
e 2 6 9 7
four one three two
a 4 1 3 2
b 4 1 3 2
c 4 1 3 2
d 4 1 3 2
e 4 1 3 2
four one three one
a 6 1 9 7
b 1 2 9 8
c 4 8 0 1
d 5 4 9 8
e 0 7 2 0
four one three two
a 90 90 90 90
b 89 89 89 89
c 78 78 78 78
d 5 5 5 5
e 6 6 6 6
'''
【课程2.6】 Pandas数据结构Dataframe:索引
Dataframe既有行索引也有列索引,可以被看做由Series组成的字典(共用一个索引)
选择列 / 选择行 / 切片 / 布尔判断
'''
'\n【课程2.6】 Pandas数据结构Dataframe:索引\n\nDataframe既有行索引也有列索引,可以被看做由Series组成的字典(共用一个索引)\n\n选择列 / 选择行 / 切片 / 布尔判断\n\n'
# Selecting columns with df[...] and rows with df.loc[...]: a single key
# returns a Series, a list of keys returns a DataFrame.
df = pd.DataFrame(
    data=np.random.rand(12).reshape(3, 4) * 100,
    index=['one', 'two', 'three'],
    columns=list('abcd'),
)
print(df, '-----------', sep='\n')
data1 = df['a']
data2 = df[['a', 'c']]
print(f"{data1} {type(data1)}")
print(f"{data2} {type(data2)}")
print('-----------')
data3 = df.loc['one']
data4 = df.loc[['one', 'two']]
print(f"{data3} {type(data3)}")
print(f"{data4} {type(data4)}")
a b c d
one 60.066766 20.982593 70.080973 90.991952
two 98.299697 24.508627 47.991541 5.859387
three 61.839048 1.857317 36.815257 42.187342
-----------
one 60.066766
two 98.299697
three 61.839048
Name: a, dtype: float64
a c
one 60.066766 70.080973
two 98.299697 47.991541
three 61.839048 36.815257
-----------
a 60.066766
b 20.982593
c 70.080973
d 90.991952
Name: one, dtype: float64
a b c d
one 60.066766 20.982593 70.080973 90.991952
two 98.299697 24.508627 47.991541 5.859387
# df[...] selects columns by name, but a slice inside df[...] selects rows.
df = pd.DataFrame(
    data=np.random.rand(12).reshape(3, 4) * 100,
    index=['one', 'two', 'three'],
    columns=list('abcd'),
)
print(df, '-----', sep='\n')
data1 = df['a']
data2 = df[['b', 'c']]
print(data1, data2, sep='\n')
# Row slice: the first row only, still returned as a DataFrame.
data3 = df[0:1]
print(f"{data3} {type(data3)}")
a b c d
one 66.689951 70.917802 43.959189 39.889119
two 47.674709 56.190896 5.577672 46.224369
three 3.090913 76.460138 77.731091 80.235150
-----
one 66.689951
two 47.674709
three 3.090913
Name: a, dtype: float64
b c
one 70.917802 43.959189
two 56.190896 5.577672
three 76.460138 77.731091
a b c d
one 66.689951 70.917802 43.959189 39.889119
# df.loc: label-based row selection by single label, list of labels, or slice.
df1 = pd.DataFrame(np.random.rand(16).reshape(4, 4) * 100,
                   index=['one', 'two', 'three', 'four'],
                   columns=['a', 'b', 'c', 'd'])
df2 = pd.DataFrame(np.random.rand(16).reshape(4, 4) * 100,
                   columns=['a', 'b', 'c', 'd'])
print(df1)
print(df2)
print('-----')

# Single label -> one row returned as a Series.
data1 = df1.loc['one']
data2 = df2.loc[1]
print(data1)
print(data2)
print('单标签索引\n-----')

# List of labels -> DataFrame in the requested order.
# 'five' is not in df1's index. Passing a missing label to .loc raises
# KeyError in current pandas (older versions only warned), so reindex() is
# used to get the NaN-filled row the example wants to show.
data3 = df1.reindex(['two', 'three', 'five'])
data4 = df2.loc[[3, 2, 1]]
print(data3)
print(data4)
print('多标签索引\n-----')

# Label slices include both end points.
data5 = df1.loc['one':'three']
data6 = df2.loc[1:3]
print(data5)
print(data6)
print('切片索引')
a b c d
one 8.090184 19.719044 0.671359 17.083233
two 52.571118 65.232782 26.267374 17.837930
three 57.849170 84.111119 72.354688 80.931790
four 65.700928 4.628123 69.963978 18.142123
a b c d
0 88.288394 46.433168 70.374551 58.077134
1 29.609704 93.034972 58.228832 29.418921
2 80.638641 51.854000 10.016064 54.724819
3 31.633185 90.085637 35.456804 79.061184
-----
a 8.090184
b 19.719044
c 0.671359
d 17.083233
Name: one, dtype: float64
a 29.609704
b 93.034972
c 58.228832
d 29.418921
Name: 1, dtype: float64
单标签索引
-----
a b c d
two 52.571118 65.232782 26.267374 17.83793
three 57.849170 84.111119 72.354688 80.93179
five NaN NaN NaN NaN
a b c d
3 31.633185 90.085637 35.456804 79.061184
2 80.638641 51.854000 10.016064 54.724819
1 29.609704 93.034972 58.228832 29.418921
多标签索引
-----
a b c d
one 8.090184 19.719044 0.671359 17.083233
two 52.571118 65.232782 26.267374 17.837930
three 57.849170 84.111119 72.354688 80.931790
a b c d
1 29.609704 93.034972 58.228832 29.418921
2 80.638641 51.854000 10.016064 54.724819
3 31.633185 90.085637 35.456804 79.061184
切片索引
C:\ProgramData\Anaconda3\lib\site-packages\ipykernel_launcher.py:22: FutureWarning:
Passing list-likes to .loc or [] with any missing label will raise
KeyError in the future, you can use .reindex() as an alternative.
See the documentation here:
https://pandas.pydata.org/pandas-docs/stable/indexing.html#deprecate-loc-reindex-listlike
# df.iloc: purely positional row selection (single position, list of
# positions, slice), independent of the row labels.
df = pd.DataFrame(
    data=np.random.rand(16).reshape(4, 4) * 100,
    index=['one', 'two', 'three', 'four'],
    columns=list('abcd'),
)
print(df, '------', sep='\n')
print(df.iloc[0], df.iloc[-1], '单位置索引\n-----', sep='\n')
print(df.iloc[[0, 2]], df.iloc[[3, 2, 1]], '多位置索引\n-----', sep='\n')
print(df.iloc[1:3], df.iloc[::2], '切片索引', sep='\n')
a b c d
one 63.865808 67.918670 79.147570 96.612447
two 44.578604 52.973829 93.495585 57.698461
three 89.960372 75.657540 85.973381 85.055974
four 77.390780 75.258005 47.897852 11.303472
------
a 63.865808
b 67.918670
c 79.147570
d 96.612447
Name: one, dtype: float64
a 77.390780
b 75.258005
c 47.897852
d 11.303472
Name: four, dtype: float64
单位置索引
-----
a b c d
one 63.865808 67.91867 79.147570 96.612447
three 89.960372 75.65754 85.973381 85.055974
a b c d
four 77.390780 75.258005 47.897852 11.303472
three 89.960372 75.657540 85.973381 85.055974
two 44.578604 52.973829 93.495585 57.698461
多位置索引
-----
a b c d
two 44.578604 52.973829 93.495585 57.698461
three 89.960372 75.657540 85.973381 85.055974
a b c d
one 63.865808 67.91867 79.147570 96.612447
three 89.960372 75.65754 85.973381 85.055974
切片索引
# Boolean indexing on a DataFrame. A boolean Series keeps whole rows; a boolean
# DataFrame keeps individual cells and blanks everything else to NaN.
df = pd.DataFrame(
    data=np.random.rand(16).reshape(4, 4) * 100,
    index=['one', 'two', 'three', 'four'],
    columns=list('abcd'),
)
print(df, '------', sep='\n')
b1 = df < 20                          # mask over every cell
b2 = df['a'] > 50                     # mask over rows, judged on column 'a'
b3 = df[['a', 'b']] > 50              # mask over two columns only
b4 = df.loc[['one', 'three']] < 50    # mask over two rows only
for _mask in (b1, b2, b3, b4):
    print(_mask, type(_mask))
    print(df[_mask])
    print('------')
a b c d
one 53.165072 72.137349 49.335356 94.371570
two 31.833981 18.591801 46.263265 97.524672
three 43.453369 89.621918 39.201420 18.235790
four 92.772641 12.172163 52.007352 42.280553
------
a b c d
one False False False False
two False True False False
three False False False True
four False True False False
a b c d
one NaN NaN NaN NaN
two NaN 18.591801 NaN NaN
three NaN NaN NaN 18.23579
four NaN 12.172163 NaN NaN
------
one True
two False
three False
four True
Name: a, dtype: bool
a b c d
one 53.165072 72.137349 49.335356 94.371570
four 92.772641 12.172163 52.007352 42.280553
------
a b
one True True
two False False
three False True
four True False
a b c d
one 53.165072 72.137349 NaN NaN
two NaN NaN NaN NaN
three NaN 89.621918 NaN NaN
four 92.772641 NaN NaN NaN
------
a b c d
one False False True False
three True False True True
a b c d
one NaN NaN 49.335356 NaN
two NaN NaN NaN NaN
three 43.453369 NaN 39.201420 18.23579
four NaN NaN NaN NaN
------
# Combining selections: pick a column and then rows, or rows and then a column.
df = pd.DataFrame(
    data=np.random.rand(16).reshape(4, 4) * 100,
    index=['one', 'two', 'three', 'four'],
    columns=list('abcd'),
)
print(df, '------', sep='\n')
# Prints the .loc indexer object itself, not any data.
print(df['a'].loc)
print('------')
_rows = ['one', 'three']
print(df['a'].loc[_rows])
print(df.loc[_rows]['a'])
print('------')
print(df[['b', 'c', 'd']].loc[_rows])
# Filter rows on column 'a', then keep at most the first two matches.
print(df[df['a'] < 50].iloc[:2])
a b c d
one 38.359537 40.536516 50.215082 24.690263
two 18.713505 74.621177 20.050490 47.546497
three 11.369268 65.753554 54.547829 7.002664
four 5.435595 48.839504 98.276157 21.352956
------
------
one 38.359537
three 11.369268
Name: a, dtype: float64
one 38.359537
three 11.369268
Name: a, dtype: float64
------
b c d
one 40.536516 50.215082 24.690263
three 65.753554 54.547829 7.002664
a b c d
one 38.359537 40.536516 50.215082 24.690263
two 18.713505 74.621177 20.050490 47.546497
# Exercise: column selection, label rows, positional rows, and a cell filter.
# `columns` takes the flat list of names. The previous `[list('abcd')]` (a list
# containing a list) built a one-level MultiIndex whose labels are tuples.
df = pd.DataFrame(np.random.rand(4, 4) * 100,
                  index=['one', 'two', 'three', 'four'],
                  columns=list('abcd'))
print(df)
print('------')
print(df[['b', 'c']])
print('------')
print(df.loc[['three', 'four']])
print('------')
print(df.iloc[[1, 0]])
print('------')
# Cells that are not > 50 are shown as NaN.
print(df[df > 50])
a b c d
one 11.775827 98.052886 2.618347 53.166143
two 48.787931 31.037028 14.595130 72.661248
three 24.311761 4.930614 60.633819 69.031862
four 33.716489 37.270337 55.445232 92.749330
------
b c
one 98.052886 2.618347
two 31.037028 14.595130
three 4.930614 60.633819
four 37.270337 55.445232
------
a b c d
three 24.311761 4.930614 60.633819 69.031862
four 33.716489 37.270337 55.445232 92.749330
------
a b c d
two 48.787931 31.037028 14.595130 72.661248
one 11.775827 98.052886 2.618347 53.166143
------
a b c d
one NaN 98.052886 NaN 53.166143
two NaN NaN NaN 72.661248
three NaN NaN 60.633819 69.031862
four NaN NaN 55.445232 92.749330
'''
【课程2.7】 Pandas数据结构Dataframe:基本技巧
数据查看、转置 / 添加、修改、删除值 / 对齐 / 排序
'''
'\n【课程2.7】 Pandas数据结构Dataframe:基本技巧\n\n数据查看、转置 / 添加、修改、删除值 / 对齐 / 排序\n\n'
# Quick inspection: first two rows, the last rows (tail's default count), and
# the transpose.
df = pd.DataFrame(
    data=np.random.rand(16).reshape(8, 2) * 100,
    columns=list('ab'),
)
print(df.head(n=2), df.tail(), df.T, sep='\n')
a b
0 67.123579 78.540599
1 16.668997 11.536215
a b
3 19.796982 7.295293
4 26.793664 95.924559
5 22.213018 60.779001
6 3.675648 38.955255
7 60.925783 64.103140
0 1 2 3 4 5 \
a 67.123579 16.668997 41.758860 19.796982 26.793664 22.213018
b 78.540599 11.536215 59.150321 7.295293 95.924559 60.779001
6 7
a 3.675648 60.925783
b 38.955255 64.103140
# Adding and modifying values: assigning to a new column name or a new row
# label enlarges the frame; assigning to existing labels overwrites.
df = pd.DataFrame(np.random.rand(16).reshape(4, 4) * 100,
                  columns=['a', 'b', 'c', 'd'])
print(df)
df['e'] = 10            # new column
df.loc[4] = 20          # new row
df[['a', 'c']] = 100    # overwrite two whole columns
# Write a single cell with one .loc call. The chained form df['b'].loc[0] = 666
# assigns into an intermediate Series and does not update df under pandas'
# copy-on-write behaviour.
df.loc[0, 'b'] = 666
print(df)
a b c d
0 85.971229 27.821810 77.466322 87.766945
1 41.422117 85.382627 37.848980 22.375692
2 18.105688 33.636029 10.040581 30.562783
3 50.128665 10.891343 56.333581 33.924682
a b c d e
0 100 666.000000 100 87.766945 10
1 100 85.382627 100 22.375692 10
2 100 33.636029 100 30.562783 10
3 100 10.891343 100 33.924682 10
4 100 20.000000 100 20.000000 20
# Deleting: `del` removes a column in place, while drop() returns a new frame
# and leaves df unchanged (df is re-printed after each drop to show that).
df = pd.DataFrame(
    data=np.random.rand(16).reshape(4, 4) * 100,
    columns=list('abcd'),
)
print(df, '-----', sep='\n')
del df['a']
print(df, '-----', sep='\n')
print(df.drop(labels=0), df.drop(labels=[1, 2]), df, '-----', sep='\n')
print(df.drop(columns=['d']), df, sep='\n')
a b c d
0 52.376431 98.951314 3.665786 90.532788
1 72.710362 60.761235 10.310778 20.850619
2 41.186250 48.993426 68.769189 72.845749
3 2.091445 77.853150 1.332935 1.946453
-----
b c d
0 98.951314 3.665786 90.532788
1 60.761235 10.310778 20.850619
2 48.993426 68.769189 72.845749
3 77.853150 1.332935 1.946453
-----
b c d
1 60.761235 10.310778 20.850619
2 48.993426 68.769189 72.845749
3 77.853150 1.332935 1.946453
b c d
0 98.951314 3.665786 90.532788
3 77.853150 1.332935 1.946453
b c d
0 98.951314 3.665786 90.532788
1 60.761235 10.310778 20.850619
2 48.993426 68.769189 72.845749
3 77.853150 1.332935 1.946453
-----
b c
0 98.951314 3.665786
1 60.761235 10.310778
2 48.993426 68.769189
3 77.853150 1.332935
b c d
0 98.951314 3.665786 90.532788
1 60.761235 10.310778 20.850619
2 48.993426 68.769189 72.845749
3 77.853150 1.332935 1.946453
# Adding two frames aligns on both rows and columns; cells present in only one
# operand (column D, rows 7-9) come out as NaN in the captured output.
df1 = pd.DataFrame(data=np.random.rand(10, 4), columns=list('ABCD'))
df2 = pd.DataFrame(data=np.random.randn(7, 3), columns=list('ABC'))
print(df1, df2, df1 + df2, sep='\n')
A B C D
0 0.116265 0.286147 0.509417 0.123862
1 0.591067 0.220565 0.893078 0.995741
2 0.160296 0.327076 0.372967 0.740136
3 0.444086 0.254079 0.056255 0.721923
4 0.059144 0.665892 0.504307 0.746240
5 0.909397 0.426231 0.685564 0.056044
6 0.442812 0.477964 0.208796 0.830122
7 0.573963 0.196914 0.294245 0.073122
8 0.484215 0.885544 0.493887 0.768655
9 0.450795 0.552202 0.750659 0.006042
A B C
0 1.862325 0.285321 0.505303
1 -0.664007 1.091699 -1.074902
2 -0.932387 -0.062655 0.307506
3 -2.796463 -1.310375 -1.439840
4 -0.364978 -0.376336 0.711008
5 0.705022 -1.029143 1.118433
6 -0.255299 2.005879 -0.419431
A B C D
0 1.978590 0.571468 1.014720 NaN
1 -0.072939 1.312263 -0.181824 NaN
2 -0.772091 0.264421 0.680473 NaN
3 -2.352377 -1.056296 -1.383586 NaN
4 -0.305834 0.289556 1.215315 NaN
5 1.614419 -0.602912 1.803996 NaN
6 0.187513 2.483843 -0.210635 NaN
7 NaN NaN NaN NaN
8 NaN NaN NaN NaN
9 NaN NaN NaN NaN
# sort_values(): order rows by one column, or by several columns in priority
# order.
df1 = pd.DataFrame(
    data=np.random.rand(16).reshape(4, 4) * 100,
    columns=list('abcd'),
)
print(df1)
print(df1.sort_values(by='a'))
print(df1.sort_values(by='a', ascending=False))
print('------')
df2 = pd.DataFrame({
    'a': [1] * 4 + [2] * 4,
    'b': list(range(8)),
    'c': list(range(8, 0, -1)),
})
print(df2)
# Sort by 'a' first; ties on 'a' are ordered by 'c'.
print(df2.sort_values(by=['a', 'c']))
a b c d
0 9.198370 45.091469 40.676043 74.669533
1 85.051633 98.125841 73.817642 72.571482
2 75.304951 55.935442 70.350217 4.555149
3 38.234080 62.093133 32.545653 0.724567
a b c d
0 9.198370 45.091469 40.676043 74.669533
3 38.234080 62.093133 32.545653 0.724567
2 75.304951 55.935442 70.350217 4.555149
1 85.051633 98.125841 73.817642 72.571482
a b c d
1 85.051633 98.125841 73.817642 72.571482
2 75.304951 55.935442 70.350217 4.555149
3 38.234080 62.093133 32.545653 0.724567
0 9.198370 45.091469 40.676043 74.669533
------
a b c
0 1 0 8
1 1 1 7
2 1 2 6
3 1 3 5
4 2 4 4
5 2 5 3
6 2 6 2
7 2 7 1
a b c
3 1 3 5
2 1 2 6
1 1 1 7
0 1 0 8
7 2 7 1
6 2 6 2
5 2 5 3
4 2 4 4
# sort_index(): order rows by their labels (numeric and string labels shown).
df1 = pd.DataFrame(
    data=np.random.rand(16).reshape(4, 4) * 100,
    index=[5, 4, 3, 2],
    columns=list('abcd'),
)
df2 = pd.DataFrame(
    data=np.random.rand(16).reshape(4, 4) * 100,
    index=list('hsxg'),
    columns=list('abcd'),
)
for _frame in (df1, df2):
    print(_frame, _frame.sort_index(), sep='\n')
a b c d
5 84.095499 13.313575 99.483003 52.785404
4 31.363606 82.146044 48.645785 61.034812
3 42.635269 70.476056 65.823774 52.421584
2 42.028122 46.937515 92.741171 90.049750
a b c d
2 42.028122 46.937515 92.741171 90.049750
3 42.635269 70.476056 65.823774 52.421584
4 31.363606 82.146044 48.645785 61.034812
5 84.095499 13.313575 99.483003 52.785404
a b c d
h 69.897356 83.189551 67.665089 43.221810
s 32.360179 1.464245 54.805584 48.618503
x 59.188794 72.810460 51.521598 27.956133
g 21.787669 17.587755 65.334653 66.057178
a b c d
g 21.787669 17.587755 65.334653 66.057178
h 69.897356 83.189551 67.665089 43.221810
s 32.360179 1.464245 54.805584 48.618503
x 59.188794 72.810460 51.521598 27.956133
# Exercise part 1: sort a 3x3 frame by index (descending) and by column 'v2'
# (descending).
df1 = pd.DataFrame(np.random.rand(3, 3) * 100,
                   index=list('abc'), columns=['v1', 'v2', 'v3'])
print(df1.sort_index(ascending=False))
print('------')
print(df1.sort_values(by=['v2'], ascending=False))
print('------')

# Exercise part 2: transpose a 5x2 frame, then remove column 'e'.
df1 = pd.DataFrame(np.random.rand(5, 2) * 100,
                   index=list('abcde'), columns=['v1', 'v2'])
df2 = df1.T
# drop() returns a new frame. The previous code discarded that result, so in a
# script 'e' was never removed and nothing was displayed.
df2 = df2.drop(['e'], axis=1)
print(df2)
v1 v2 v3
c 71.187723 52.861998 59.219050
b 54.618639 38.720337 94.535586
a 77.624794 53.408849 69.616503
------
v1 v2 v3
a 77.624794 53.408849 69.616503
c 71.187723 52.861998 59.219050
b 54.618639 38.720337 94.535586
------
|
a |
b |
c |
d |
v1 |
11.507745 |
14.183001 |
51.572311 |
62.421454 |
v2 |
33.023802 |
62.028549 |
12.476100 |
1.098652 |