用到的工具为:jupyter
开发工具版本:python3
目录介绍:
1、pandas高级应用–数据合并
2、pandas高级应用–数据重塑和旋转
3、pandas高级应用–数据转化、清除重复数据
4、pandas高级应用–数据替换
5、pandas高级应用–数据拆分
下一篇地址:https://blog.csdn.net/sinat_30353259/article/details/80804935
from pandas import Series,DataFrame
import pandas as pd
import numpy as np
from numpy import nan as NA
df1 = DataFrame({"key":["b","b","a","c","a","a","b"],
"data1":range(7)})
df2 = DataFrame({"key":["a","b","d"],
"data2":range(3)})
df1
打印结果:
data1 key
0 0 b
1 1 b
2 2 a
3 3 c
4 4 a
5 5 a
6 6 b
df2
打印结果:
   data2 key
0      0   a
1      1   b
2      2   d
# 指定key这一列取交集
pd.merge(df1,df2,on='key')
打印结果:
data1 key data2
0 0 b 1
1 1 b 1
2 6 b 1
3 2 a 0
4 4 a 0
5 5 a 0
#如果列名不相同,如何取交集
df3 = DataFrame({"lkey":["b","b","a","c","a","a","b"],
"data1":range(7)})
df4 = DataFrame({"rkey":["a","b","d"],
"data2":range(3)})
pd.merge(df3,df4,left_on='lkey',right_on='rkey')
打印结果:
data1 lkey data2 rkey
0 0 b 1 b
1 1 b 1 b
2 6 b 1 b
3 2 a 0 a
4 4 a 0 a
5 5 a 0 a
#merge方法的各种连接方法
#左连接
#右连接
#内连接
#外连接
#外连接 --> 并集
pd.merge(df1,df2,on='key',how='outer')
打印结果:
data1 key data2
0 0.0 b 1.0
1 1.0 b 1.0
2 6.0 b 1.0
3 2.0 a 0.0
4 4.0 a 0.0
5 5.0 a 0.0
6 3.0 c NaN
7 NaN d 2.0
# 左连接 --> 以merge连结的左边数据集为标准,右边只取和左边有关联的,没关联的NAN值填充
pd.merge(df1,df2,on='key',how='left')
打印结果:
data1 key data2
0 0 b 1.0
1 1 b 1.0
2 2 a 0.0
3 3 c NaN
4 4 a 0.0
5 5 a 0.0
6 6 b 1.0
#以merge连结的右边数据集为标准,左边只取和右边有关联的,没关联的NAN值填充
pd.merge(df1,df2,on='key',how='right')
打印结果:
data1 key data2
0 0.0 b 1
1 1.0 b 1
2 6.0 b 1
3 2.0 a 0
4 4.0 a 0
5 5.0 a 0
6 NaN d 2
# Series数据的连接
s1 = Series([0,1],index=["a","b"])
s2 = Series([2,3,4],index=["c","d","e"])
s3 = Series([5,6],index=["f","g"])
result = pd.concat([s1,s2,s3])
result
打印结果:
a 0
b 1
c 2
d 3
e 4
f 5
g 6
dtype: int64
#将多个Series拼接成一个DataFrame,即一个Series就是DataFrame的一列数据
df_concat = pd.concat([s1,s2,s3],axis=1)
df_concat
0 1 2
a 0.0 NaN NaN
b 1.0 NaN NaN
c NaN 2.0 NaN
d NaN 3.0 NaN
e NaN 4.0 NaN
f NaN NaN 5.0
g NaN NaN 6.0
s4 = pd.concat([s1*5,s3]) #乘法是直接将Series的值进行乘法操作
s4
打印结果:
a 0
b 5
f 5
g 6
dtype: int64
pd.concat([s1,s4],axis=1)
打印结果:
0 1
a 0.0 0
b 1.0 5
f NaN 5
g NaN 6
# inner取交集
pd.concat([s1,s4],axis=1,join='inner')
打印结果:
0 1
a 0 0
b 1 5
# 利用concat生成层次化索引数据结构
result = pd.concat([s1,s2,s3],keys=["one","two","three"])
result
打印结果:
one a 0
b 1
two c 2
d 3
e 4
three f 5
g 6
dtype: int64
#获取指定的数据
result["one"]['a']
打印结果:
0
#合并重叠数据
a = Series([NA,2.5,NA,3.5,4.5,NA],index=list("fedcba"))
b = Series(np.arange(len(a)),dtype=np.float64,index=list("fedcba"))
pd.concat([a,b])
打印结果:
f NaN
e 2.5
d NaN
c 3.5
b 4.5
a NaN
f 0.0
e 1.0
d 2.0
c 3.0
b 4.0
a 5.0
dtype: float64
#用其中一个Series中的数据给另一个Series中的数据作为补丁
resultB = b[:-2]
resultB
打印结果:
f 0.0
e 1.0
d 2.0
c 3.0
dtype: float64
resultA = a[2:]
resultA
打印结果:
d NaN
c 3.5
b 4.5
a NaN
dtype: float64
resultB.combine_first(resultA)
打印结果:
a NaN
b 4.5
c 3.0
d 2.0
e 1.0
f 0.0
dtype: float64
# DataFrame利用combine_first进行数据补丁操作
df1 = DataFrame({"a":[1,NA,5,NA],
"b":[NA,2,NA,6],
"c":range(2,18,4)})
df2 = DataFrame({"a":[5,4,NA,3,7],
"b":[NA,3,4,6,8]})
df1
打印结果:
a b c
0 1.0 NaN 2
1 NaN 2.0 6
2 5.0 NaN 10
3 NaN 6.0 14
df2
打印结果:
a b
0 5.0 NaN
1 4.0 3.0
2 NaN 4.0
3 3.0 6.0
4 7.0 8.0
# 用df2的数据为df1中的数据打补丁
df1.combine_first(df2)
打印结果:
a b c
0 1.0 NaN 2.0
1 4.0 2.0 6.0
2 5.0 4.0 10.0
3 3.0 6.0 14.0
4 7.0 8.0 NaN
from pandas import Series,DataFrame
import pandas as pd
import numpy as np
from numpy import nan as NA
#创建层次化索引
data = Series(np.random.randn(10),index= [list("aaabbbccdd"),[1,2,3,1,2,3,1,2,2,3]])
data
打印结果:
a 1 0.535624
2 -0.886595
3 -0.434961
b 1 0.709035
2 0.837770
3 0.065979
c 1 -0.542920
2 1.250756
d 2 0.466432
3 -1.113291
dtype: float64
# 将行索引(index)转换到列索引上(columns)
result = data.unstack()
result
打印结果:
1 2 3
a 0.535624 -0.886595 -0.434961
b 0.709035 0.837770 0.065979
c -0.542920 1.250756 NaN
d NaN 0.466432 -1.113291
# 将列索引(columns)转换到行索引(index)
result.stack()
打印结果:
a 1 0.535624
2 -0.886595
3 -0.434961
b 1 0.709035
2 0.837770
3 0.065979
c 1 -0.542920
2 1.250756
d 2 0.466432
3 -1.113291
dtype: float64
# DataFrame 中的行索引和列索引的重塑和转换
data = DataFrame(np.arange(6).reshape(2,3),
index=pd.Index(["上海","北京"],name="省份"),
columns=pd.Index([2011,2012,2013],name="年份"))
data
打印结果:
年份 2011 2012 2013
省份
上海 0 1 2
北京 3 4 5
#将DataFrame的列索引转化到行索引
result = data.stack()
result
打印结果:
省份 年份
上海 2011 0
2012 1
2013 2
北京 2011 3
2012 4
2013 5
dtype: int32
#将DataFrame的行索引转化为列索引
#unstack()默认转换的最内层的层次化索引
result.unstack()
打印结果:
年份 2011 2012 2013
省份
上海 0 1 2
北京 3 4 5
#第一种方法,转换的时候,指定层次化索引的名称
result.unstack("省份")
打印结果:
省份 上海 北京
年份
2011 0 3
2012 1 4
2013 2 5
#第二种方法,转换的时候,指定层次化索引的层级编号:0代表最外层(省份),1代表最内层(年份)
result.unstack(1)
打印结果:
年份 2011 2012 2013
省份
上海 0 1 2
北京 3 4 5
#在对DataFrame进行unstack操作时,作为旋转轴的级别将会成为结果中的最低级别
data = DataFrame(np.arange(6).reshape(2,3),
index=pd.Index(["Ohio","Colorado"],name="state"),
columns=pd.Index(["one","two","three"],name="numbers"))
data
打印结果:
numbers one two three
state
Ohio 0 1 2
Colorado 3 4 5
result = data.stack()
result
打印结果:
state numbers
Ohio one 0
two 1
three 2
Colorado one 3
two 4
three 5
dtype: int32
df = DataFrame({"left":result,
"right":result+5},
columns=pd.Index(["left","right"],name="side"))
df
打印结果:
side left right
state numbers
Ohio one 0 5
two 1 6
three 2 7
Colorado one 3 8
two 4 9
three 5 10
result = df.unstack("state")
result
打印结果:
side left right
state Ohio Colorado Ohio Colorado
numbers
one 0 3 5 8
two 1 4 6 9
three 2 5 7 10
s1=Series([0,1,2,3],index=list("abcd"))
s2 = Series([4,5,6],index=list("cde"))
#将s1和s2拼接成一个具有层次化索引的Series
result = pd.concat([s1,s2],keys=["one","two"])
result
打印结果:
one a 0
b 1
c 2
d 3
two c 4
d 5
e 6
dtype: int64
#将结果中的行索引变成列索引
tempResult = result.unstack(1)
tempResult
打印结果:
a b c d e
one 0.0 1.0 2.0 3.0 NaN
two NaN NaN 4.0 5.0 6.0
#全部还原,空值用NaN填充
tempResult.stack(dropna=False)
打印结果:
one a 0.0
b 1.0
c 2.0
d 3.0
e NaN
two a NaN
b NaN
c 4.0
d 5.0
e 6.0
dtype: float64
from pandas import Series,DataFrame
import pandas as pd
import numpy as np
from numpy import nan as NA
data = DataFrame({"k1":["one"]*3+["two"]*4,
"k2":[1,1,2,3,3,4,4]})
data
打印结果:
k1 k2
0 one 1
1 one 1
2 one 2
3 two 3
4 two 3
5 two 4
6 two 4
#第一种方法,去重
#检测DataFrame中的每行数据是否为重复数据行
mask = data.duplicated()
mask
打印结果:
0 False
1 True
2 False
3 False
4 True
5 False
6 True
dtype: bool
#通过花式索引去除重复的数据
data[~mask]
打印结果:
k1 k2
0 one 1
2 one 2
3 two 3
5 two 4
#第二种方法:去重
#通过DataFrame内置的drop_duplicates()方法去除重复的数据行.
#去除
data.drop_duplicates()
打印结果:
k1 k2
0 one 1
2 one 2
3 two 3
5 two 4
data["v1"] = range(7)
data
打印结果:
k1 k2 v1
0 one 1 0
1 one 1 1
2 one 2 2
3 two 3 3
4 two 3 4
5 two 4 5
6 two 4 6
# 只以k1这一列为标准去重
data.drop_duplicates(["k1"])
打印结果:
k1 k2 v1
0 one 1 0
3 two 3 3
#通过指定keep参数来指定需要保留哪些重复数据
#keep="first" 保留重复数据第一次出现的行索引
#keep="last" 保留重复数据最后一次的行索引
#keep=False 只要有重复数据,就全部丢掉
data.drop_duplicates(["k1"],keep="last")
打印结果:
k1 k2 v1
2 one 2 2
6 two 4 6
data=DataFrame({'food':['bacon','pulled pork','bacon','Pastrami',
'corned beef','Bacon','pastrami','honey ham','nova lox'],
'ounces':[4,3,12,6,7.5,8,3,5,6]})
data
打印结果:
food ounces
0 bacon 4.0
1 pulled pork 3.0
2 bacon 12.0
3 Pastrami 6.0
4 corned beef 7.5
5 Bacon 8.0
6 pastrami 3.0
7 honey ham 5.0
8 nova lox 6.0
#定义一个字典,反映每一种食物所属的动物
meat_to_animal={
'bacon':'pig',
'pulled pork':'pig',
'pastrami':'cow',
'corned beef':'cow',
'honey ham':'pig',
'nova lox':'salmon'
}
meat_to_animal
打印结果:
{'bacon': 'pig',
'corned beef': 'cow',
'honey ham': 'pig',
'nova lox': 'salmon',
'pastrami': 'cow',
'pulled pork': 'pig'}
data["animal"]=data["food"].map(str.lower).map(meat_to_animal)
data
打印结果:
food ounces animal
0 bacon 4.0 pig
1 pulled pork 3.0 pig
2 bacon 12.0 pig
3 Pastrami 6.0 cow
4 corned beef 7.5 cow
5 Bacon 8.0 pig
6 pastrami 3.0 cow
7 honey ham 5.0 pig
8 nova lox 6.0 salmon
#使用lambda匿名函数
data["animal"] = data['food'].map(lambda x: meat_to_animal[x.lower()])
data
打印结果:
food ounces animal
0 bacon 4.0 pig
1 pulled pork 3.0 pig
2 bacon 12.0 pig
3 Pastrami 6.0 cow
4 corned beef 7.5 cow
5 Bacon 8.0 pig
6 pastrami 3.0 cow
7 honey ham 5.0 pig
8 nova lox 6.0 salmon
from pandas import Series,DataFrame
import pandas as pd
import numpy as np
from numpy import nan as NA
series = Series([1,-999,2,-999,-1000,3])
series
打印结果:
0 1
1 -999
2 2
3 -999
4 -1000
5 3
dtype: int64
#单个数据替换
series.replace(-999,NA)
打印结果:
0       1.0
1       NaN
2       2.0
3       NaN
4   -1000.0
5       3.0
dtype: float64
#多个数据替换
series.replace([-999,-1000],NA)
打印结果:
0 1.0
1 NaN
2 2.0
3 NaN
4 NaN
5 3.0
dtype: float64
#replace方法传入字典,针对不同的值,进行不同的替换
#第一种方法
series.replace({-999:NA,-1000:0})
打印结果:
0 1.0
1 NaN
2 2.0
3 NaN
4 0.0
5 3.0
dtype: float64
#第二种方法
series.replace([-999,-1000],[NA,0])
打印结果:
0 1.0
1 NaN
2 2.0
3 NaN
4 0.0
5 3.0
dtype: float64
from pandas import Series,DataFrame
import pandas as pd
import numpy as np
from numpy import nan as NA
from matplotlib import pyplot as plt
age = [20,22,25,27,21,23,37,31,61,45,41,32]
#将所有年龄进行分组
bins = [18,25,35,60,100]
#使用pandas中的cut对年龄数据进行分组
cats = pd.cut(age,bins)
cats
打印结果:
[(18, 25], (18, 25], (18, 25], (25, 35], (18, 25], ..., (25, 35], (60, 100], (35, 60], (35, 60], (25, 35]]
Length: 12
Categories (4, interval[int64]): [(18, 25] < (25, 35] < (35, 60] < (60, 100]]
#调用pd.value_counts方法统计每个区间段的人数
pd.value_counts(cats)
(18, 25] 5
(35, 60] 3
(25, 35] 3
(60, 100] 1
dtype: int64
#查看每个数据所属区间的编码(codes),即每个值落在第几个区间
cats.codes
打印结果:
array([0, 0, 0, 1, 0, 0, 2, 1, 3, 2, 2, 1], dtype=int8)
#为分类出每一组年龄加上标签
group_names = ["Youth","YouthAdult","MiddleAged","senior"]
#用group_name中的值,把区间替换
personType = pd.cut(age,bins,labels=group_names)
personType
打印结果:
[Youth, Youth, Youth, YouthAdult, Youth, ..., YouthAdult, senior, MiddleAged, MiddleAged, YouthAdult]
Length: 12
Categories (4, object): [Youth < YouthAdult < MiddleAged < senior]
# 用一个直方图进行简单展示
plt.hist(personType)
(array([3., 0., 0., 5., 0., 0., 3., 0., 0., 1.]),
array([0. , 0.3, 0.6, 0.9, 1.2, 1.5, 1.8, 2.1, 2.4, 2.7, 3. ]),
<a list of 10 Patch objects>)