import numpy as np
import pandas as pd
from pandas import Series,DataFrame
将ages按照“18到25”、“26到35”、“36到60”以及“60以上”进行划分
# Sample ages to discretize.
ages = [
    20, 22, 25, 27, 21, 23,
    37, 31, 61, 45, 41, 32,
]
# Bin edges: each pair of consecutive edges bounds one age group.
bins = [18, 25, 35, 60, 100]
# Assign every age to the bin whose edges enclose it.
r = pd.cut(x=ages, bins=bins)
r
[(18, 25], (18, 25], (18, 25], (25, 35], (18, 25], ..., (25, 35], (60, 100], (35, 60], (35, 60], (25, 35]]
Length: 12
Categories (4, interval[int64]): [(18, 25] < (25, 35] < (35, 60] < (60, 100]]
codes返回划分组的编号
# Integer code of the bin each age was assigned to (its position in r.categories).
r.codes
array([0, 0, 0, 1, 0, 0, 2, 1, 3, 2, 2, 1], dtype=int8)
categories返回划分的组
# The intervals that make up the bins, returned as an IntervalIndex.
r.categories
IntervalIndex([(18, 25], (25, 35], (35, 60], (60, 100]]
closed='right',
dtype='interval[int64]')
# Count how many ages fall into each bin, most frequent bin first.
# The top-level pd.value_counts() is deprecated (pandas 2.1+), so wrap the
# categorical in a Series and use the method form instead.
pd.Series(r).value_counts()
(18, 25] 5
(35, 60] 3
(25, 35] 3
(60, 100] 1
dtype: int64
# Name each bin instead of showing its interval; labels are matched to the
# bins in order, so there must be exactly one label per bin.
# (The 'YoungAdult' label previously carried a stray leading space.)
pd.cut(ages, bins, labels=['Youth', 'YoungAdult', 'MiddleAged', 'Senior'])
[Youth, Youth, Youth, YoungAdult, Youth, ..., YoungAdult, Senior, MiddleAged, MiddleAged, YoungAdult]
Length: 12
Categories (4, object): [YoungAdult < MiddleAged < Senior < Youth]
# An integer bin count asks pandas to derive that many bins from the data
# range itself, rather than using explicit edges.
pd.cut(x=ages, bins=5)
[(19.959, 28.2], (19.959, 28.2], (19.959, 28.2], (19.959, 28.2], (19.959, 28.2], ..., (28.2, 36.4], (52.8, 61.0], (44.6, 52.8], (36.4, 44.6], (28.2, 36.4]]
Length: 12
Categories (5, interval[float64]): [(19.959, 28.2] < (28.2, 36.4] < (36.4, 44.6] < (44.6, 52.8] < (52.8, 61.0]]
# Quantile-based binning: split the ages at the sample quartiles.
pd.qcut(x=ages, q=4)
[(19.999, 22.75], (19.999, 22.75], (22.75, 29.0], (22.75, 29.0], (19.999, 22.75], ..., (29.0, 38.0], (38.0, 61.0], (38.0, 61.0], (38.0, 61.0], (29.0, 38.0]]
Length: 12
Categories (4, interval[float64]): [(19.999, 22.75] < (22.75, 29.0] < (29.0, 38.0] < (38.0, 61.0]]
自己设定分位数
# Quantile-based binning with caller-chosen cut points, given as fractions
# between 0 and 1.
custom_quantiles = [0, 0.1, 0.5, 0.8, 1.]
pd.qcut(x=ages, q=custom_quantiles)
[(19.999, 21.1], (21.1, 29.0], (21.1, 29.0], (21.1, 29.0], (19.999, 21.1], ..., (29.0, 40.2], (40.2, 61.0], (40.2, 61.0], (40.2, 61.0], (29.0, 40.2]]
Length: 12
Categories (4, interval[float64]): [(19.999, 21.1] < (21.1, 29.0] < (29.0, 40.2] < (40.2, 61.0]]