函数或映射进行值替代
# Demo table of people and their ages, built one column at a time.
df = pd.DataFrame({
    'name': ['jeff', 'herry', 'chris', 'culry'],
    'age': [18, 20, 25, 38],
})
df  # bare expression: displays the frame when run as a notebook cell
    name  age
0   jeff   18
1  herry   20
2  chris   25
3  culry   38
# Lookup table keyed by person name; each value is a two-item list
# whose items feed the 'pet' and 'pet_name' columns respectively.
info = {
    'jeff': ['dog', 3],
    'herry': ['cat', 2],
    'chris': ['cat', 3],
    'culry': ['cat', 1],
}


def _info_field(position):
    """Return a function mapping a name to ``info[name][position]``.

    The returned function raises ``KeyError`` for a name missing from
    ``info``.
    """
    def lookup(owner):
        return info[owner][position]
    return lookup


# Series.map applies the function to every value of the 'name' column.
df['pet'] = df['name'].map(_info_field(0))
df['pet_name'] = df['name'].map(_info_field(1))
df  # bare expression: displays the frame when run as a notebook cell
    name  age  pet  pet_name
0   jeff   18  dog         3
1  herry   20  cat         2
2  chris   25  cat         3
3  culry   38  cat         1
分箱
import numpy as np
import pandas as pd
# 30 random integers drawn from the half-open range [4, 100).
# Unseeded, so every run produces different values.
ages = np.random.randint(low=4, high=100, size=30)
ages  # bare expression: displays the array when run as a notebook cell
array([60, 64, 98, 63, 73, 75, 62, 42, 43, 18, 70, 35, 32, 87, 4, 78, 78,
37, 61, 47, 95, 62, 54, 90, 41, 48, 29, 27, 61, 91])
按照指定的边界值来分箱
# Bin edges every 10 from 10 to 100 inclusive, i.e. nine bins.
bins = list(range(10, 101, 10))
# right=False closes each bin on the left: [10, 20), [20, 30), ...
# Values outside the edges (e.g. below 10) are not assigned to any bin.
cutdata = pd.cut(ages, bins=bins, right=False)
cutdata  # bare expression: displays the result when run as a notebook cell
[[60, 70), [60, 70), [90, 100), [60, 70), [70, 80), ..., [40, 50), [20, 30), [20, 30), [60, 70), [90, 100)]
Length: 30
Categories (9, interval[int64]): [[10, 20) < [20, 30) < [30, 40) < [40, 50) ... [60, 70) < [70, 80) < [80, 90) < [90, 100)]
# Show the categories of the pd.cut result, i.e. the bin intervals themselves.
cutdata.categories
IntervalIndex([[10, 20), [20, 30), [30, 40), [40, 50), [50, 60), [60, 70), [70, 80), [80, 90), [90, 100)],
closed='left',
dtype='interval[int64]')
# Same left-closed binning as before, but each bin is named '0', '1', ...
# instead of being shown as an interval.
# pd.cut requires exactly one label per bin, so the label list is derived
# from the edge list rather than hard-coding the count.
cutdata = pd.cut(
    ages,
    bins=bins,
    right=False,
    labels=[str(i) for i in range(len(bins) - 1)],
)
cutdata  # bare expression: displays the result when run as a notebook cell
['5', '5', '8', '5', '6', ..., '3', '1', '1', '5', '8']
Length: 30
Categories (9, object): ['0' < '1' < '2' < '3' ... '5' < '6' < '7' < '8']
# Integer position of each element's category; -1 marks an element that
# fell outside every bin (the value 4 in the sample output).
cutdata.codes
array([ 5, 5, 8, 5, 6, 6, 5, 3, 3, 0, 6, 2, 2, 7, -1, 6, 6,
2, 5, 3, 8, 5, 4, 8, 3, 3, 1, 1, 5, 8], dtype=int8)
# Count how many elements landed in each category.
cutdata.value_counts()
0 1
1 2
2 3
3 5
4 1
5 7
6 5
7 1
8 4
dtype: int64
按照指定的分位数进行分箱
import matplotlib.pyplot as plt
# Quantile-based binning: q=4 asks for quartiles, i.e. cut points at the
# 0, 0.25, 0.5, 0.75 and 1 quantiles of the data.
qcutdata = pd.qcut(ages, q=4)
qcutdata  # bare expression: displays the result when run as a notebook cell
[(41.25, 61.0], (61.0, 74.5], (74.5, 98.0], (61.0, 74.5], (61.0, 74.5], ..., (41.25, 61.0], (3.999, 41.25], (3.999, 41.25], (41.25, 61.0], (74.5, 98.0]]
Length: 30
Categories (4, interval[float64]): [(3.999, 41.25] < (41.25, 61.0] < (61.0, 74.5] < (74.5, 98.0]]