import pandas as pd
import numpy as np
# Method 1: build directly from a list
s=pd.Series([1,2,3,4])
# Method 2: from a NumPy array
s=pd.Series(np.arange(1,5))
# Method 3: from a dict (the keys become the index)
s=pd.Series({'a':1,'b':2,'c':3})
# Method 4: from a scalar (repeated for every index label)
s=pd.Series(5,index=[0,1,2,3])
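A minimal sketch of what the dict and scalar forms above produce: the dict keys become the index, and the scalar is broadcast over every index label.
pd.Series({'a':1,'b':2,'c':3})
a 1
b 2
c 3
dtype: int64
pd.Series(5,index=[0,1,2,3]) # the value 5 repeated at positions 0, 1, 2, 3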
s=pd.Series(np.arange(1,5),index=list('abcd'))
a 1
b 2
c 3
d 4
dtype: int64
----------------------------------------------------------
# Inspect the index
s.index
Index(['a', 'b', 'c', 'd'], dtype='object') # dtype='object' means the labels are strings
# Inspect the values
s.values
array([1, 2, 3, 4], dtype=int64)
# Access specific values by index
s['b'] # returns 2
s[1] # returns 2. Note: a Series carries both the custom label index and the default positional index; either works, but the two cannot be mixed in one lookup (see the .loc/.iloc sketch below)
s[['a','b']]
a 1
b 2
dtype: int32 # note: to select multiple labels, pass them as a list
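To keep label-based and position-based access separate, a minimal sketch using the explicit accessors on the same s:
s.loc['b']  # label-based access, returns 2
s.iloc[1]   # position-based access, returns 2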
s=pd.Series(np.arange(9,5,-1),index=list('abcd'))
s.name='Series object'
s.index.name='index'
index
a 9
b 8
c 7
d 6
Name: Series object, dtype: int64
s=pd.Series(np.arange(1,5),index=list('abcd'))
s>2
a False
b False
c True
d True
dtype: bool # the boolean result of the comparison > 2 for each element
s[s>2]
c 3
d 4
dtype: int64 # only the elements that satisfy the condition are returned
s=pd.Series(np.arange(1,5),index=list('abcd'))
s1=pd.Series(np.arange(5,8),index=list('afc'))
s+s1
a 6.0
b NaN
c 10.0
d NaN
f NaN
dtype: float64 # s has labels a, b, c, d and s1 has a, f, c; values are added where labels match (a and c), every other label becomes NaN
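If NaN is not wanted for the non-overlapping labels, the arithmetic methods accept a fill_value; a minimal sketch with the same s and s1, where the missing operand is treated as 0:
s.add(s1,fill_value=0)
a 6.0
b 2.0
c 10.0
d 4.0
f 6.0
dtype: float64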
import numpy as np
import pandas as pd
d=pd.DataFrame(np.arange(10).reshape(2,5))
0 1 2 3 4 <---------- automatic column index
0 0 1 2 3 4 (the 0, 1 on the left are the automatic row index)
1 5 6 7 8 9
-----------------------------------------------------------
dt={'one':pd.Series([1,2,3],index=list('abc')),
'two':pd.Series([9,8,7,6],index=list('abcd'))}
d=pd.DataFrame(dt)
one two
a 1.0 9
b 2.0 8
c 3.0 7
d NaN 6
------------------------------------------------------------
d=pd.DataFrame(dt,index=list('bcd'),columns=['two','three'])
two three
b 8 NaN
c 7 NaN
d 6 NaN
-------------------------------------------------------------
d1={'one':[1,2,3,4],'two':[9,8,7,6]}
d=pd.DataFrame(d1,index=list('abcd'))
one two
a 1 9
b 2 8
c 3 7
d 4 6
A single column of a DataFrame is a Series. From the definition, a DataFrame is a two-dimensional labeled array, and each column label is that column's name.
Reindexing
.reindex(index=None,columns=None…) changes or rearranges the index of a Series or DataFrame
d1={"one":[1,2,3,4],"two":[9,8,7,6]}
d=pd.DataFrame(d1,index=["a","b","c","d"])
d=d.reindex(index=["d","c","b","a"])
one two
d 4 6
c 3 7
b 2 8
a 1 9
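reindex can also introduce labels that were not present before; a minimal sketch where the extra label "e" and the fill_value are illustrative assumptions:
d.reindex(index=["a","b","c","d","e"],fill_value=0) # the new row "e" is filled with 0 instead of NaN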
d={'one':pd.Series([1,2,3],index=list('abc')),
'two':pd.Series([1,2,3,4],index=list('abcd'))}
df=pd.DataFrame(d)
df['one'] # column access by label
df.one    # attribute access, equivalent to df['one']
df[['one','two']]
Rows of a DataFrame can be accessed with list-style slicing,
or with the .loc accessor: .loc['label'],
or with the .iloc accessor (.iloc = integer location), which uses the default positional index [0,1,2,3,4…].
Note:
when slicing rows with .loc[], e.g. df.loc['a':'c'], the row for the end label 'c' is included.
df[0:2]
one two
a 1.0 1
b 2.0 2
--------------------------------------
df.loc['b'] # select one row
df.loc[['b','c']] # select multiple rows (pass a list of labels)
one 2.0
two 2.0
Name: b, dtype: float64
--------------------------------------
df.iloc[1]
one 2.0
two 2.0
Name: b, dtype: float64
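A small sketch contrasting the two slicing behaviours on the same df: label slices with .loc include the end label, positional slices with .iloc do not.
df.loc['a':'c'] # rows a, b and c (end label included)
df.iloc[0:2]    # rows a and b only (end position excluded)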
data={'name':['tom','jack','steve','ricky'],
'age':[23,25,27,28]}
index=['s1','s2','s3','s4']
df=pd.DataFrame(data,index=index)
df['score']=pd.Series([90,80,70,60],index=index)
name age score
s1 tom 23 90
s2 jack 25 80
s3 steve 27 70
s4 ricky 28 60
d={'one':pd.Series([1,2,3],index=list('abc')),
'two':pd.Series([1,2,3,4],index=list('abcd')),
'three':pd.Series([10,20,30],index=list('abc'))}
df=pd.DataFrame(d)
# Delete a column with del
del df['one']
# Or remove a column with pop (it returns the removed column); use one approach or the other, since once 'one' has been deleted, pop('one') raises a KeyError
df.pop('one')
df=pd.DataFrame([['zs',12],['ls',4]],columns=['name','age'])
df2=pd.DataFrame([['ww',16],['zl',8]],columns=['name','age'])
df=df.append(df2) # append rows (deprecated in recent pandas; see the pd.concat sketch below)
df=df.drop(0) # drop the rows labeled 0 under the default integer index (after append, both frames contribute an index 0)
df=df.drop(['zs','ww']) # drop multiple rows by label; this only works when 'zs' and 'ww' are actual index labels, e.g. after df.set_index('name')
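DataFrame.append is deprecated in recent pandas releases and removed in 2.0; a minimal sketch of the equivalent pd.concat call, assuming df and df2 as constructed above:
df=pd.concat([df,df2],ignore_index=True) # stack the rows and rebuild the index as 0..n-1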
import pandas as pd
df = pd.DataFrame([['zs', 12], ['ls', 4]], columns = ['Name','Age'])
df2 = pd.DataFrame([['ww', 16], ['zl', 8]], columns = ['Name','Age'])
df = df.append(df2)
df['Name'][0] = 'Tom' # reassign element 0 of the 'Name' column to 'Tom'; chained indexing like this can raise SettingWithCopyWarning, df.loc[0,'Name']='Tom' is the more idiomatic form
print(df)
unsorted_df=pd.DataFrame(np.arange(1,21).reshape(10,2),
index=[1,4,6,2,3,5,9,8,0,7],
columns=['col2','col1'])
col2 col1
1 1 2
4 3 4
6 5 6
2 7 8
3 9 10
5 11 12
9 13 14
8 15 16
0 17 18
7 19 20
sorted_df=unsorted_df.sort_index()
col2 col1
0 17 18
1 1 2
2 7 8
3 9 10
4 3 4
5 11 12
6 5 6
7 19 20
8 15 16
9 13 14
sorted_df=unsorted_df.sort_index(axis=1)
col1 col2
1 2 1
4 4 3
6 6 5
2 8 7
3 10 9
5 12 11
9 14 13
8 16 15
0 18 17
7 20 19
sort_values() sorts by value. It takes a by argument naming the DataFrame column(s) whose values are used for sorting:
sorted_df=unsorted_df.sort_values(by='col1',ascending=False)
col2 col1
7 19 20
0 17 18
8 15 16
9 13 14
5 11 12
3 9 10
2 7 8
6 5 6
4 3 4
1 1 2
------------------------------------------------------------------
d = {'Name':pd.Series(['Tom','James','Ricky','Vin','Steve','Minsu','Jack',
'Lee','David','Gasper','Betina','Andres']),
'Age':pd.Series([25,26,25,23,30,29,23,34,40,30,51,46]),
'Rating':pd.Series([4.23,3.24,3.98,2.56,3.20,4.6,3.8,3.78,2.98,4.80,4.10,3.65])}
unsorted_df = pd.DataFrame(d)
# Sort by age
sorted_df = unsorted_df.sort_values(by='Age')
print (sorted_df)
# Sort ascending by Age first, then descending by Rating
sorted_df = unsorted_df.sort_values(by=['Age', 'Rating'], ascending=[True, False])
Besides sort_index()/sort_values(), the rank() method shows each value's ranking under the current index. A fractional rank means there are ties, which by default share the average rank; rank(method='first') instead ranks earlier-appearing values lower (see the sketch below).
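A minimal sketch of rank() on a small Series with a tie (the values are illustrative):
r=pd.Series([7,3,7,1])
r.rank()                # ties share the average rank: 3.5, 2.0, 3.5, 1.0
r.rank(method='first')  # ties broken by order of appearance: 3.0, 2.0, 4.0, 1.0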
Arithmetic rules
Arithmetic operations align on row and column indexes and compute after alignment; the result defaults to floating point.
During alignment, missing entries are filled with NaN.
Operations between a 2-D and a 1-D object, or a 1-D object and a scalar, are broadcast.
b=pd.DataFrame(np.arange(20).reshape(4,5))
c=pd.Series(np.arange(4))
0 1 2 3 4
0 0 1 2 3 4
1 5 6 7 8 9
2 10 11 12 13 14
3 15 16 17 18 19
-------------------------------------------
0 0
1 1
2 2
3 3
dtype: int32
-------------------------------------------
b-c
0 1 2 3 4
0 0.0 0.0 0.0 0.0 NaN
1 5.0 5.0 5.0 5.0 NaN
2 10.0 10.0 10.0 10.0 NaN
3 15.0 15.0 15.0 15.0 NaN
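In b-c above the Series index is matched against b's columns, so column 4 has no counterpart and becomes NaN. To broadcast down the rows instead, the arithmetic methods take an axis argument; a minimal sketch with the same b and c:
b.sub(c,axis=0) # subtract c[i] from every value in row i, aligning on the row index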
import pandas as pd
ipl_data = {'Team': ['Riders', 'Riders', 'Devils', 'Devils', 'Kings',
'kings', 'Kings', 'Kings', 'Riders', 'Royals', 'Royals', 'Riders'],
'Rank': [1, 2, 2, 3, 3,4 ,1 ,1,2 , 4,1,2],
'Year': [2014,2015,2014,2015,2014,2015,2016,2017,2016,2014,2015,2017],
'Points':[876,789,863,673,741,812,756,788,694,701,804,690]}
df = pd.DataFrame(ipl_data)
Team Rank Year Points
0 Riders 1 2014 876
1 Riders 2 2015 789
2 Devils 2 2014 863
3 Devils 3 2015 673
4 Kings 3 2014 741
5 kings 4 2015 812
6 Kings 1 2016 756
7 Kings 1 2017 788
8 Riders 2 2016 694
9 Royals 4 2014 701
10 Royals 1 2015 804
11 Riders 2 2017 690
df.groupby('Year')
# Note: this returns a DataFrameGroupBy object, which is iterable and can be traversed with a for loop
grouped=df.groupby('Year')
for year,group in grouped:
print(year)
print(group)
2014
Team Rank Year Points
0 Riders 1 2014 876
2 Devils 2 2014 863
4 Kings 3 2014 741
9 Royals 4 2014 701
2015
Team Rank Year Points
1 Riders 2 2015 789
3 Devils 3 2015 673
5 kings 4 2015 812
10 Royals 1 2015 804
2016
Team Rank Year Points
6 Kings 1 2016 756
8 Riders 2 2016 694
2017
Team Rank Year Points
7 Kings 1 2017 788
11 Riders 2 2017 690
-----------------------------------------------------------------
df.groupby('Year').groups # returns a dict mapping each year to the row indexes in that group
{2014: Int64Index([0, 2, 4, 9], dtype='int64'), 2015: Int64Index([1, 3, 5, 10], dtype='int64'), 2016: Int64Index([6, 8], dtype='int64'), 2017: Int64Index([7, 11], dtype='int64')}
With a groupby object you can run aggregations such as sums and standard deviations on each group:
grouped=df.groupby('Year')
grouped['Points'].agg(np.mean)
agg=grouped['Points'].agg([np.sum,np.mean,np.std]) # pass several functions as a list
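A minimal sketch of a few other common operations on the same grouping (the column and function choices are illustrative):
grouped.get_group(2014)                       # the sub-DataFrame for one group
grouped['Points'].sum()                       # one aggregate value per group
grouped.agg({'Points':'sum','Rank':'mean'})   # different functions for different columns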
The merge() function is the entry point for all standard database-style join operations between DataFrame objects.

Merge method | SQL equivalent | Description
---|---|---
left | LEFT OUTER JOIN | Use the keys of the left object
right | RIGHT OUTER JOIN | Use the keys of the right object
outer | FULL OUTER JOIN | Use the union of the keys
inner | INNER JOIN | Use the intersection of the keys
data1=pd.DataFrame({'one':['a','b','a','a','c','b'],'two1':range(6)})
data2=pd.DataFrame({'one':['a','b','c','d'],'two2':range(10,14)})
one two1
0 a 0
1 b 1
2 a 2
3 a 3
4 c 4
5 b 5
-----------------------------------------------------------
one two2
0 a 10
1 b 11
2 c 12
3 d 13
By default merge() performs an inner join, i.e. pd.merge(data1,data2) is the same as merge(data1,data2,how='inner'):
pd.merge(data1,data2)
one two1 two2
0 a 0 10
1 a 2 10
2 a 3 10
3 b 1 11
4 b 5 11
5 c 4 12
With how='outer' the union of the keys is used. The keys the two frames share are a, b and c, so for the key d the two1 column is NaN:
pd.merge(data1,data2,how='outer')
one two1 two2
0 a 0.0 10
1 a 2.0 10
2 a 3.0 10
3 b 1.0 11
4 b 5.0 11
5 c 4.0 12
6 d NaN 13
how='left' uses the keys of the left object:
pd.merge(data1,data2,how='left')
one two1 two2
0 a 0 10
1 b 1 11
2 a 2 10
3 a 3 10
4 c 4 12
5 b 5 11
how='right' uses the keys of the right object:
pd.merge(data1,data2,how='right')
one two1 two2
0 a 0.0 10
1 a 2.0 10
2 a 3.0 10
3 b 1.0 11
4 b 5.0 11
5 c 4.0 12
6 d NaN 13
With left_index=True and right_index=True the merge joins on both objects' row indexes; the overlapping column name one gets _x/_y suffixes:
pd.merge(data1,data2,left_index=True,right_index=True)
one_x two1 one_y two2
0 a 0 a 10
1 b 1 b 11
2 a 2 c 12
3 a 3 d 13
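merge also accepts on= to name the join column explicitly and suffixes= to control how overlapping column names are renamed; a minimal sketch (the _left/_right suffixes are illustrative):
pd.merge(data1,data2,on='one',how='inner') # same as the default inner join above
pd.merge(data1,data2,left_index=True,right_index=True,suffixes=('_left','_right')) # the overlapping column becomes one_left / one_right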
import pandas as pd
left = pd.DataFrame({
'student_id':[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20],
'student_name': ['Alex', 'Amy', 'Allen', 'Alice', 'Ayoung', 'Billy', 'Brian', 'Bran', 'Bryce', 'Betty', 'Emma', 'Marry', 'Allen', 'Jean', 'Rose', 'David', 'Tom', 'Jack', 'Daniel', 'Andrew'],
'class_id':[1,1,1,2,2,2,3,3,3,4,1,1,1,2,2,2,3,3,3,2],
'gender':['M', 'M', 'F', 'F', 'M', 'M', 'F', 'F', 'M', 'M', 'F', 'F', 'M', 'M', 'F', 'F', 'M', 'M', 'F', 'F'],
'age':[20,21,22,20,21,22,23,20,21,22,20,21,22,23,20,21,22,20,21,22],
'score':[98,74,67,38,65,29,32,34,85,64,52,38,26,89,68,46,32,78,79,87]})
right = pd.DataFrame(
{'class_id':[1,2,3,5],
'class_name': ['ClassA', 'ClassB', 'ClassC', 'ClassE']})
# 合并两个DataFrame
data = pd.merge(left,right)
student_id student_name class_id gender age score class_name
0 1 Alex 1 M 20 98 ClassA
1 2 Amy 1 M 21 74 ClassA
2 3 Allen 1 F 22 67 ClassA
3 11 Emma 1 F 20 52 ClassA
4 12 Marry 1 F 21 38 ClassA
5 13 Allen 1 M 22 26 ClassA
6 4 Alice 2 F 20 38 ClassB
7 5 Ayoung 2 M 21 65 ClassB
8 6 Billy 2 M 22 29 ClassB
9 14 Jean 2 M 23 89 ClassB
10 15 Rose 2 F 20 68 ClassB
11 16 David 2 F 21 46 ClassB
12 20 Andrew 2 F 22 87 ClassB
13 7 Brian 3 F 23 32 ClassC
14 8 Bran 3 F 20 34 ClassC
15 9 Bryce 3 M 21 85 ClassC
16 17 Tom 3 M 22 32 ClassC
17 18 Jack 3 M 20 78 ClassC
18 19 Daniel 3 F 21 79 ClassC
Group and summarize by class_id and gender; by default pivot_table aggregates every numeric column with the mean:
data.pivot_table(index=['class_id','gender'])
age score student_id
class_id gender
1 F 21.000000 52.333333 8.666667
M 21.000000 66.000000 5.333333
2 F 20.750000 59.750000 13.750000
M 22.000000 61.000000 8.333333
3 F 21.333333 48.333333 11.333333
M 21.000000 65.000000 14.666667
Group by class_id and gender, aggregating only the score column:
data.pivot_table(index=['class_id','gender'],values=['score'])
score
class_id gender
1 F 52.333333
M 66.000000
2 F 59.750000
M 61.000000
3 F 48.333333
M 65.000000
Group by class_id and gender, aggregate the score column, and additionally split the columns by each value of age:
data.pivot_table(index=['class_id','gender'],values=['score'],columns=['age'])
score
age 20 21 22 23
class_id gender
1 F 52.0 38.0 67.0 NaN
M 98.0 74.0 26.0 NaN
2 F 53.0 46.0 87.0 NaN
M NaN 65.0 29.0 89.0
3 F 34.0 79.0 NaN 32.0
M 78.0 85.0 32.0 NaN
Same grouping, with margins=True adding row and column subtotals (the All row and column):
data.pivot_table(index=['class_id','gender'],values=['score'],columns=['age'],margins=True)
score
age 20 21 22 23 All
class_id gender
1 F 52.000000 38.0 67.0 NaN 52.333333
M 98.000000 74.0 26.0 NaN 66.000000
2 F 53.000000 46.0 87.0 NaN 59.750000
M NaN 65.0 29.0 89.0 61.000000
3 F 34.000000 79.0 NaN 32.0 48.333333
M 78.000000 85.0 32.0 NaN 65.000000
All 61.333333 64.5 48.2 60.5 58.789474
Same grouping with margins, but aggregating with max instead of the default mean:
data.pivot_table(index=['class_id','gender'],values=['score'],columns=['age'],margins=True,aggfunc='max')
score
age 20 21 22 23 All
class_id gender
1 F 52.0 38.0 67.0 NaN 67
M 98.0 74.0 26.0 NaN 98
2 F 68.0 46.0 87.0 NaN 87
M NaN 65.0 29.0 89.0 89
3 F 34.0 79.0 NaN 32.0 79
M 78.0 85.0 32.0 NaN 85
All 98.0 85.0 87.0 89.0 98
pd.crosstab(data.class_id,data.gender,margins=True)
gender F M All
class_id
1 3 3 6
2 4 3 7
3 3 3 6
All 10 9 19
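In reasonably recent pandas versions crosstab also takes a normalize argument to report proportions instead of counts; a minimal sketch:
pd.crosstab(data.class_id,data.gender,normalize='index') # each row sums to 1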
data=pd.DataFrame({'k1':['one']*3+['two']*2,'k2':[1,1,2,3,3]})
k1 k2
0 one 1
1 one 1
2 one 2
3 two 3
4 two 3
----------------------------------------------------
data.duplicated()
0 False
1 True
2 False
3 False
4 True
dtype: bool
duplicated() returns a boolean mask in which True marks a row that repeats an earlier one. Indexing with that mask selects the duplicate rows; invert it with ~ to get the non-duplicates:
data[data.duplicated()]
k1 k2
1 one 1
4 two 3
----------------------------------------
data[~data.duplicated()]
k1 k2
0 one 1
2 one 2
3 two 3
The .drop_duplicates() method removes duplicate rows directly, unlike the mask-based de-duplication above. It does not modify the original data either; passing inplace=True changes the data in place, but then the method returns None.
data.drop_duplicates()
k1 k2
0 one 1
2 one 2
3 two 3
data['k3']=1 # add a constant column
data.drop_duplicates(['k2','k3']) # de-duplicate considering only the k2 and k3 columns
k1 k2 k3
0 one 1 1
2 one 2 1
3 two 3 1
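drop_duplicates keeps the first occurrence by default; a minimal sketch of the keep parameter on the same data:
data.drop_duplicates(['k2','k3'],keep='last') # keep the last occurrence instead, so rows 1, 2 and 4 remain
data.drop_duplicates(['k2','k3'],keep=False)  # drop every row that has a duplicate, leaving only row 2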
data.replace(3,np.nan) # replace every 3 with NaN
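replace also accepts a dict to map several values at once; a minimal sketch (the mapping itself is illustrative):
data.replace({3:np.nan,1:0}) # replace 3 with NaN and 1 with 0 in a single call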
String methods are available through the .str accessor:
str.replace('a','b') replaces 'a' with 'b' in each element;
str.contains('xxx') tests whether each element contains 'xxx';
str.split() behaves like the built-in string split().
data=pd.DataFrame({'k1':['beijing,haidian,chushou','beijing,chaoyang,chushou','beijing,fengtai,chuzu'],'k2':['beijing,xicheng,chuzu','beijing,shijingshan,chushou',np.nan]})
->
k1 k2
0 beijing,haidian,chushou beijing,xicheng,chuzu
1 beijing,chaoyang,chushou beijing,shijingshan,chushou
2 beijing,fengtai,chuzu NaN
------------------------------------------------------------------
data.k1.str.replace('beijing','shanghai')
->
0 shanghai,haidian,chushou
1 shanghai,chaoyang,chushou
2 shanghai,fengtai,chuzu
Name: k1, dtype: object
---------------------------------------------------------
data.k1.str.contains('chushou')
->
0 True
1 True
2 False
Name: k1, dtype: bool
-----------------------------------------------------------
data[data.k1.str.contains('chushou')]
->
k1 k2
0 beijing,haidian,chushou beijing,xicheng,chuzu
1 beijing,chaoyang,chushou beijing,shijingshan,chushou
-------------------------------------------------------
data.k1.str.split(',')
->
0 [beijing, haidian, chushou]
1 [beijing, chaoyang, chushou]
2 [beijing, fengtai, chuzu]
Name: k1, dtype: object