import pandas as pd
import numpy as np
from datetime import datetime
company=["A","B","C"]
data=pd.DataFrame({
"company":[company[x] for x in np.random.randint(0,len(company),10)],
"salary":np.random.randint(5,50,10),
"age":np.random.randint(15,50,10)
}
)
data
#company salary age
#0 A 32 18
#1 A 30 29
#2 B 34 38
#3 C 44 37
#4 B 30 31
#5 C 28 19
#6 A 44 26
#7 A 6 34
#8 B 48 18
#9 A 37 33
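The data above is generated randomly, so the exact values will differ on every run (the outputs shown throughout this section come from one such run). To make the example reproducible you could fix the seed before building the DataFrame, for instance:

np.random.seed(0)   # any fixed seed works; 0 is just an illustrative choice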
group = data.groupby("company")
group
#<pandas.core.groupby.generic.DataFrameGroupBy object at 0x...>
list(group)
#[('A',
# company salary age
# 0 A 32 18
# 1 A 30 29
# 6 A 44 26
# 7 A 6 34
# 9 A 37 33),
# ('B',
# company salary age
# 2 B 34 38
# 4 B 30 31
# 8 B 48 18),
# ('C',
# company salary age
# 3 C 44 37
# 5 C 28 19)]
After converting the grouped object to a list, you can see that it consists of three tuples. In each tuple the first element is the group key (we grouped by company, so the keys are A, B, and C) and the second element is the sub-DataFrame containing that group's rows; in other words, groupby splits the original DataFrame into one piece per key.
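Besides converting to a list, you can pull out a single group or iterate over the groups directly; a minimal sketch using the group object created above:

group.get_group("A")          # the sub-DataFrame for company A
for name, sub_df in group:    # iterate over (key, sub-DataFrame) pairs
    print(name, len(sub_df))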
Commonly used aggregation functions that can be passed to agg() include:

Function | Purpose |
---|---|
min | Minimum |
max | Maximum |
sum | Sum |
mean | Mean |
median | Median |
std | Standard deviation |
var | Variance |
count | Count of non-null values |
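Any of these can also be applied to a single column, and several can be computed at once by passing a list; a small sketch on the salary column:

data.groupby("company")["salary"].agg(["min", "max", "mean"])
# one row per company, one column per aggregation function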
data.groupby("company").agg('mean')
#salary age
#company
#A 29.800000 28.0
#B 37.333333 29.0
#C 36.000000 28.0
data.groupby('company').agg({'salary':'median','age':'mean'})
#salary age
#company
#A 32.0 28.0
#B 34.0 29.0
#C 36.0 28.0
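If you want the result columns to carry more descriptive names, pandas (0.25 and later) also supports named aggregation, where each output column is written as name=(column, function); a sketch on the same data:

data.groupby('company').agg(median_salary=('salary', 'median'), avg_age=('age', 'mean'))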
avg_salary_dict= data.groupby('company')['salary'].mean().to_dict()
avg_salary_dict
#{'A': 29.8, 'B': 37.333333333333336, 'C': 36.0}
data['avg_salary'] = data['company'].map(avg_salary_dict)
data
#company salary age avg_salary
#0 A 32 18 29.800000
#1 A 30 29 29.800000
#2 B 34 38 37.333333
#3 C 44 37 36.000000
#4 B 30 31 37.333333
#5 C 28 19 36.000000
#6 A 44 26 29.800000
#7 A 6 34 29.800000
#8 B 48 18 37.333333
#9 A 37 33 29.800000
data['avg_salary1'] = data.groupby('company')['salary'].transform('mean')
data
#company salary age avg_salary avg_salary1
#0 A 32 18 29.800000 29.800000
#1 A 30 29 29.800000 29.800000
#2 B 34 38 37.333333 37.333333
#3 C 44 37 36.000000 36.000000
#4 B 30 31 37.333333 37.333333
#5 C 28 19 36.000000 36.000000
#6 A 44 26 29.800000 29.800000
#7 A 6 34 29.800000 29.800000
#8 B 48 18 37.333333 37.333333
#9 A 37 33 29.800000 29.800000
def get_oldest_staff(x):
    df = x.sort_values(by='age', ascending=True)
    return df.iloc[-1, :]
oldest_staff = data.groupby('company').apply(get_oldest_staff)
oldest_staff
#company salary age avg_salary avg_salary1
#company
#A A 6 34 29.800000 29.800000
#B B 34 38 37.333333 37.333333
#C C 44 37 36.000000 36.000000
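For this particular task the sort inside apply is not strictly necessary; an alternative sketch is to pick the row holding each group's maximum age via idxmax (if two employees tie for the oldest, idxmax keeps the first occurrence, whereas the sort-based version keeps the last):

data.loc[data.groupby('company')['age'].idxmax()]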
np.nan == np.nan
#False
type(np.nan)
#float
pd.Series([1,np.nan,3]).dtype
#dtype('float64')
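Because np.nan does not compare equal to itself, equality tests are not a reliable way to detect missing values; use np.isnan or the more general pd.isna instead:

np.isnan(np.nan)
#True
pd.isna(np.nan)
#True
pd.isna(None)
#True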
s_time = pd.Series([pd.Timestamp('20220101')]*3)
s_time
#0 2022-01-01
#1 2022-01-01
#2 2022-01-01
#dtype: datetime64[ns]
s_time[2] = np.nan
s_time
#0 2022-01-01
#1 2022-01-01
#2 NaT
#dtype: datetime64[ns]
None == None
#True
pd.Series([1,None])
#0 1.0
#1 NaN
#dtype: float64
s_new = pd.Series([1, 2], dtype="Int64")
s_new
#0 1
#1 2
#dtype: Int64
s_new[1] = pd.NA
s_new
#0 1
#1 <NA>
#dtype: Int64
# Addition
print("pd.NA + 1 :\t", pd.NA + 1)
# Multiplication
print('"a" * pd.NA:\t', "a" * pd.NA)
# The following two expressions both evaluate to 1
print("pd.NA ** 0 :\t", pd.NA ** 0)
print("1 ** pd.NA:\t", 1 ** pd.NA)
# Comparison operations
print("pd.NA == pd.NA:\t", pd.NA == pd.NA)
print("pd.NA < 2.5:\t", pd.NA < 2.5)
print("np.add(pd.NA, 1):\t", np.add(pd.NA, 1))
df = pd.read_excel(r"data\data_test.xlsx")
We can use df.info() to see each column's data type, whether it contains null values, and the DataFrame's memory usage.
df.info()
df.isnull()
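Since df.isnull() returns a boolean DataFrame of the same shape, chaining .sum() onto it is a quick way to count the missing values in each column:

df.isnull().sum()   # number of missing values per column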
The dropna() method drops rows or columns that contain missing values; its full signature with default arguments is:

df.dropna(axis=0, how='any', thresh=None, subset=None, inplace=False)
df = pd.DataFrame({"name": ['Alfred', 'Batman', 'Catwoman'],
"toy": [np.nan, 'Batmobile', 'Bullwhip'],
"born": [pd.NaT, pd.Timestamp("1940-04-25"), pd.NaT]})
df
#name toy born
#0 Alfred NaN NaT
#1 Batman Batmobile 1940-04-25
#2 Catwoman Bullwhip NaT
df.dropna()
#name toy born
#1 Batman Batmobile 1940-04-25
df.dropna(axis='columns')
#name
#0 Alfred
#1 Batman
#2 Catwoman
df.dropna(how='all')
#name toy born
#0 Alfred NaN NaT
#1 Batman Batmobile 1940-04-25
#2 Catwoman Bullwhip NaT
df.dropna(thresh=2)
#name toy born
#1 Batman Batmobile 1940-04-25
#2 Catwoman Bullwhip NaT
df.dropna(subset=['toy'])
#name toy born
#1 Batman Batmobile 1940-04-25
#2 Catwoman Bullwhip NaT
df.dropna(inplace=True)
df
#name toy born
#1 Batman Batmobile 1940-04-25
The fillna() method fills missing values instead of dropping them; its full signature with default arguments is:

df.fillna(value=None, method=None, axis=None, inplace=False, limit=None, downcast=None)
df = pd.DataFrame([[np.nan, 2, np.nan, 0],[3, 4, np.nan, 1],[np.nan, np.nan, np.nan, np.nan],
[np.nan, 3, np.nan, 4]],columns=list("ABCD"))
df
#A B C D
#0 NaN 2.0 NaN 0.0
#1 3.0 4.0 NaN 1.0
#2 NaN NaN NaN NaN
#3 NaN 3.0 NaN 4.0
df.fillna(0)
#A B C D
#0 0.0 2.0 0.0 0.0
#1 3.0 4.0 0.0 1.0
#2 0.0 0.0 0.0 0.0
#3 0.0 3.0 0.0 4.0
df.fillna(method="ffill")
#A B C D
#0 NaN 2.0 NaN 0.0
#1 3.0 4.0 NaN 1.0
#2 3.0 4.0 NaN 1.0
#3 3.0 3.0 NaN 4.0
df.fillna(method="bfill")
#A B C D
#0 3.0 2.0 NaN 0.0
#1 3.0 4.0 NaN 1.0
#2 NaN 3.0 NaN 4.0
#3 NaN 3.0 NaN 4.0
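Note that in recent pandas releases (2.1 and later) the method= argument of fillna() is deprecated; the dedicated methods give the same results:

df.ffill()   # equivalent to df.fillna(method="ffill")
df.bfill()   # equivalent to df.fillna(method="bfill")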
values = {"A": 0, "B": 1, "C": 2, "D": 3}
df.fillna(value=values)
#A B C D
#0 0.0 2.0 2.0 0.0
#1 3.0 4.0 2.0 1.0
#2 0.0 1.0 2.0 3.0
#3 0.0 3.0 2.0 4.0
df.fillna(0, limit=1)
#A B C D
#0 0.0 2.0 0.0 0.0
#1 3.0 4.0 NaN 1.0
#2 NaN 0.0 NaN 0.0
#3 NaN 3.0 NaN 4.0
df2 = pd.DataFrame(np.random.rand(4,4), columns=list("ABCE"))
df2
#A B C E
#0 0.475937 0.169003 0.789308 0.772291
#1 0.554005 0.033041 0.732128 0.052256
#2 0.477042 0.375870 0.757475 0.794198
#3 0.912261 0.366646 0.730202 0.231903
df.fillna(value=df2)
# A B C D
#0 0.475937 2.00000 0.789308 0.0
#1 3.000000 4.00000 0.732128 1.0
#2 0.477042 0.37587 0.757475 NaN
#3 0.912261 3.00000 0.730202 4.0
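A common practical pattern is to fill a single column with one of its own statistics rather than a constant; a sketch on column A of the example frame, whose non-missing mean is 3.0:

df["A"].fillna(df["A"].mean())
#0    3.0
#1    3.0
#2    3.0
#3    3.0
#Name: A, dtype: float64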