Series数据结构
import pandas as pd;
a = pd.Series( [1,2,3,4,5]);
a
0 1
1 2
2 3
3 4
4 5
dtype: int64
a = pd.Series([1,2,3,4,5], index=['a', 'b', 'c', 'd', 'e'], dtype=float);
a
a 1.0
b 2.0
c 3.0
d 4.0
e 5.0
dtype: float64
import numpy as np;
a = np.arange(5);
b = pd.Series(a);
print(b)
print(type(a))
0 0
1 1
2 2
3 3
4 4
dtype: int32
dic = {
'name':'Lee', 'sex':'man', 'age':18}
a = pd.Series(dic)
print(a)
age 18
name Lee
sex man
dtype: object
my_dict = {
'name':'xing', 'sex':'man', 'age':18};
a = pd.Series(my_dict, index = ['name', 'color'])
a
name xing
color NaN
dtype: object
a = pd.Series(5, [0, 1, 2])
a
0 5
1 5
2 5
dtype: int64
a = pd.Series([3], [0, 1, 2])
a
0 3
1 3
2 3
dtype: int64
DataFrame数据结构
a = np.random.randint(0, 10, (2,3))
df = pd.DataFrame(a, index=['a', 'b'], columns = ['x', 'y', 'z']);
df
population = {
'beijing':3434, 'shanghai':2343, 'guangzhou':11232};
s = pd.Series(population);
df = pd.DataFrame(s);
df
|
0 |
beijing |
3434 |
guangzhou |
11232 |
shanghai |
2343 |
type(df)
pandas.core.frame.DataFrame
df = pd.DataFrame(s, columns=['pop_num'])
df
|
pop_num |
beijing |
3434 |
guangzhou |
11232 |
shanghai |
2343 |
popu = {
'bj':9898, 'sh':89887, 'gz':11232}
df = pd.DataFrame({
'gdp':popu})
df
|
gdp |
bj |
9898 |
gz |
11232 |
sh |
89887 |
gdp = {
'bj':0.998, 'sh':0.889, 'gz':1.232}
df = pd.DataFrame({
'gdp':gdp, 'popu':popu})
df
|
gdp |
popu |
bj |
0.998 |
9898 |
gz |
1.232 |
11232 |
sh |
0.889 |
89887 |
df = pd.DataFrame({
'gdp': gdp, 'popu':popu, 'country':'China'})
df
|
country |
gdp |
popu |
bj |
China |
0.998 |
9898 |
gz |
China |
1.232 |
11232 |
sh |
China |
0.889 |
89887 |
pandas里面数据的属性
df = pd.DataFrame({
'gdp':gdp, 'popu':popu});
df
|
gdp |
popu |
bj |
0.998 |
9898 |
gz |
1.232 |
11232 |
sh |
0.889 |
89887 |
df.values
array([[ 9.98000000e-01, 9.89800000e+03],
[ 1.23200000e+00, 1.12320000e+04],
[ 8.89000000e-01, 8.98870000e+04]])
df = pd.DataFrame({
'gdp':gdp, 'popu':popu, 'country':"China"});
df
|
country |
gdp |
popu |
bj |
China |
0.998 |
9898 |
gz |
China |
1.232 |
11232 |
sh |
China |
0.889 |
89887 |
df.values
array([['China', 0.998, 9898],
['China', 1.232, 11232],
['China', 0.889, 89887]], dtype=object)
df.index
Index(['bj', 'gz', 'sh'], dtype='object')
df.columns
Index(['country', 'gdp', 'popu'], dtype='object')
df.shape
(3, 3)
df.dtypes
country object
gdp float64
popu int64
dtype: object
df.size
9
索引查找数据
df = pd.DataFrame({
'gdp':gdp, 'popu':popu});
df
|
gdp |
popu |
bj |
0.998 |
9898 |
gz |
1.232 |
11232 |
sh |
0.889 |
89887 |
df['gdp']
bj 0.998
gz 1.232
sh 0.889
Name: gdp, dtype: float64
df.gdp
bj 0.998
gz 1.232
sh 0.889
Name: gdp, dtype: float64
df.loc['sh']
gdp 0.889
popu 89887.000
Name: sh, dtype: float64
df.loc[ ['sh', 'bj']]
|
gdp |
popu |
sh |
0.889 |
89887 |
bj |
0.998 |
9898 |
df.loc[ 'bj':'gz']
|
gdp |
popu |
bj |
0.998 |
9898 |
gz |
1.232 |
11232 |
df.iloc[ 0]
gdp 0.998
popu 9898.000
Name: bj, dtype: float64
df.iloc[ [0, 2]]
|
gdp |
popu |
bj |
0.998 |
9898 |
sh |
0.889 |
89887 |
df.loc['sh', 'gdp']
0.88900000000000001
df.iloc[ 0, 1]
9898
df.values[0][1]
9898.0
df.iloc[ 1:, :]
|
gdp |
popu |
gz |
1.232 |
11232 |
sh |
0.889 |
89887 |
df.gdp > 0
bj True
gz True
sh True
Name: gdp, dtype: bool
df.gdp > 0.9
bj True
gz True
sh False
Name: gdp, dtype: bool
df.loc[ df.gdp>0.9]
|
gdp |
popu |
bj |
0.998 |
9898 |
gz |
1.232 |
11232 |
df[ df.gdp>0.9]
|
gdp |
popu |
bj |
0.998 |
9898 |
gz |
1.232 |
11232 |
DF里面的赋值
df.iloc[ 0, 1] = 0
df
|
gdp |
popu |
bj |
0.998 |
0 |
gz |
1.232 |
11232 |
sh |
0.889 |
89887 |
new_column = pd.Series(['010','020','0755'], index=['bj', 'sh','gz']);
new_column
bj 010
sh 020
gz 0755
dtype: object
df['tel'] = new_column
df
|
gdp |
popu |
tel |
bj |
0.998 |
0 |
010 |
gz |
1.232 |
11232 |
0755 |
sh |
0.889 |
89887 |
020 |
查看数据的基本特征
dates = pd.date_range('2020-1-1', periods=6)
dates
DatetimeIndex(['2020-01-01', '2020-01-02', '2020-01-03', '2020-01-04',
'2020-01-05', '2020-01-06'],
dtype='datetime64[ns]', freq='D')
df = pd.DataFrame(np.random.randint(0, 10, (6,5)), index = dates, columns=list('ABCDE'))
df
|
A |
B |
C |
D |
E |
2020-01-01 |
4 |
9 |
8 |
8 |
8 |
2020-01-02 |
4 |
5 |
5 |
8 |
7 |
2020-01-03 |
1 |
0 |
3 |
2 |
0 |
2020-01-04 |
9 |
5 |
9 |
6 |
0 |
2020-01-05 |
7 |
8 |
3 |
8 |
0 |
2020-01-06 |
4 |
3 |
1 |
0 |
8 |
df.describe()
|
A |
B |
C |
D |
E |
count |
6.000000 |
6.000000 |
6.000000 |
6.000000 |
6.000000 |
mean |
4.833333 |
5.000000 |
4.833333 |
5.333333 |
3.833333 |
std |
2.786874 |
3.286335 |
3.125167 |
3.502380 |
4.215052 |
min |
1.000000 |
0.000000 |
1.000000 |
0.000000 |
0.000000 |
25% |
4.000000 |
3.500000 |
3.000000 |
3.000000 |
0.000000 |
50% |
4.000000 |
5.000000 |
4.000000 |
7.000000 |
3.500000 |
75% |
6.250000 |
7.250000 |
7.250000 |
8.000000 |
7.750000 |
max |
9.000000 |
9.000000 |
9.000000 |
8.000000 |
8.000000 |
df.info()
DatetimeIndex: 6 entries, 2020-01-01 to 2020-01-06
Freq: D
Data columns (total 5 columns):
A 6 non-null int32
B 6 non-null int32
C 6 non-null int32
D 6 non-null int32
E 6 non-null int32
dtypes: int32(5)
memory usage: 168.0 bytes
df.head(1)
|
A |
B |
C |
D |
E |
2020-01-01 |
4 |
9 |
8 |
8 |
8 |
df.tail(2)
|
A |
B |
C |
D |
E |
2020-01-05 |
7 |
8 |
3 |
8 |
0 |
2020-01-06 |
4 |
3 |
1 |
0 |
8 |
df.T
|
2020-01-01 00:00:00 |
2020-01-02 00:00:00 |
2020-01-03 00:00:00 |
2020-01-04 00:00:00 |
2020-01-05 00:00:00 |
2020-01-06 00:00:00 |
A |
4 |
4 |
1 |
9 |
7 |
4 |
B |
9 |
5 |
0 |
5 |
8 |
3 |
C |
8 |
5 |
3 |
9 |
3 |
1 |
D |
8 |
8 |
2 |
6 |
8 |
0 |
E |
8 |
7 |
0 |
0 |
0 |
8 |
df.sort_index()
|
A |
B |
C |
D |
E |
2020-01-01 |
4 |
9 |
8 |
8 |
8 |
2020-01-02 |
4 |
5 |
5 |
8 |
7 |
2020-01-03 |
1 |
0 |
3 |
2 |
0 |
2020-01-04 |
9 |
5 |
9 |
6 |
0 |
2020-01-05 |
7 |
8 |
3 |
8 |
0 |
2020-01-06 |
4 |
3 |
1 |
0 |
8 |
df.sort_index(ascending=False)
|
A |
B |
C |
D |
E |
2020-01-06 |
4 |
3 |
1 |
0 |
8 |
2020-01-05 |
7 |
8 |
3 |
8 |
0 |
2020-01-04 |
9 |
5 |
9 |
6 |
0 |
2020-01-03 |
1 |
0 |
3 |
2 |
0 |
2020-01-02 |
4 |
5 |
5 |
8 |
7 |
2020-01-01 |
4 |
9 |
8 |
8 |
8 |
df.sort_index(axis=1, ascending=False)
|
E |
D |
C |
B |
A |
2020-01-01 |
8 |
8 |
8 |
9 |
4 |
2020-01-02 |
7 |
8 |
5 |
5 |
4 |
2020-01-03 |
0 |
2 |
3 |
0 |
1 |
2020-01-04 |
0 |
6 |
9 |
5 |
9 |
2020-01-05 |
0 |
8 |
3 |
8 |
7 |
2020-01-06 |
8 |
0 |
1 |
3 |
4 |
df.sort_values('B')
|
A |
B |
C |
D |
E |
2020-01-03 |
1 |
0 |
3 |
2 |
0 |
2020-01-06 |
4 |
3 |
1 |
0 |
8 |
2020-01-02 |
4 |
5 |
5 |
8 |
7 |
2020-01-04 |
9 |
5 |
9 |
6 |
0 |
2020-01-05 |
7 |
8 |
3 |
8 |
0 |
2020-01-01 |
4 |
9 |
8 |
8 |
8 |
df.sort_values(dates[0], axis=1)
|
A |
C |
D |
E |
B |
2020-01-01 |
4 |
8 |
8 |
8 |
9 |
2020-01-02 |
4 |
5 |
8 |
7 |
5 |
2020-01-03 |
1 |
3 |
2 |
0 |
0 |
2020-01-04 |
9 |
9 |
6 |
0 |
5 |
2020-01-05 |
7 |
3 |
8 |
0 |
8 |
2020-01-06 |
4 |
1 |
0 |
8 |
3 |
数据计算
a = pd.DataFrame([1, 2, 3])
a
a-2
b = pd.DataFrame([1,3,4])
a+b
a*b
b.T
a.dot(b.T)
|
0 |
1 |
2 |
0 |
1 |
3 |
4 |
1 |
2 |
6 |
8 |
2 |
3 |
9 |
12 |
a = pd.DataFrame(np.random.randint(0, 20, (2,2)), columns=['A', 'B'])
a
b = pd.DataFrame(np.random.randint(0, 20, (3,3)), columns = ['A', 'B', 'C'])
b
|
A |
B |
C |
0 |
9 |
5 |
17 |
1 |
9 |
12 |
16 |
2 |
0 |
13 |
4 |
a+b
|
A |
B |
C |
0 |
26.0 |
6.0 |
NaN |
1 |
13.0 |
23.0 |
NaN |
2 |
NaN |
NaN |
NaN |
a.add(b, fill_value=11111111)
|
A |
B |
C |
0 |
26.0 |
6.0 |
11111128.0 |
1 |
13.0 |
23.0 |
11111127.0 |
2 |
11111111.0 |
11111124.0 |
11111115.0 |
缺失值的处理
a = pd.DataFrame(np.arange(9).reshape(3,3))
a
|
0 |
1 |
2 |
0 |
0 |
1 |
2 |
1 |
3 |
4 |
5 |
2 |
6 |
7 |
8 |
a.iloc[ :2, 2] = np.NaN
a
|
0 |
1 |
2 |
0 |
0 |
1 |
NaN |
1 |
3 |
4 |
NaN |
2 |
6 |
7 |
8.0 |
a.dropna()
a
|
0 |
1 |
2 |
0 |
0 |
1 |
NaN |
1 |
3 |
4 |
NaN |
2 |
6 |
7 |
8.0 |
a.dropna()
a.dropna(axis=1)
a.dropna(axis=1, how='all')
|
0 |
1 |
2 |
0 |
0 |
1 |
NaN |
1 |
3 |
4 |
NaN |
2 |
6 |
7 |
8.0 |
a.fillna(999)
|
0 |
1 |
2 |
0 |
0 |
1 |
999.0 |
1 |
3 |
4 |
999.0 |
2 |
6 |
7 |
8.0 |
合并和对齐
a = pd.DataFrame(np.zeros((3,4)), columns=['a', 'b', 'c', 'd'])
a
|
a |
b |
c |
d |
0 |
0.0 |
0.0 |
0.0 |
0.0 |
1 |
0.0 |
0.0 |
0.0 |
0.0 |
2 |
0.0 |
0.0 |
0.0 |
0.0 |
b = pd.DataFrame(np.zeros( (3,4)), columns=list('abcd'))
b
|
a |
b |
c |
d |
0 |
0.0 |
0.0 |
0.0 |
0.0 |
1 |
0.0 |
0.0 |
0.0 |
0.0 |
2 |
0.0 |
0.0 |
0.0 |
0.0 |
pd.concat([a, b])
|
a |
b |
c |
d |
0 |
0.0 |
0.0 |
0.0 |
0.0 |
1 |
0.0 |
0.0 |
0.0 |
0.0 |
2 |
0.0 |
0.0 |
0.0 |
0.0 |
0 |
0.0 |
0.0 |
0.0 |
0.0 |
1 |
0.0 |
0.0 |
0.0 |
0.0 |
2 |
0.0 |
0.0 |
0.0 |
0.0 |
pd.concat( [a, b], ignore_index=True)
|
a |
b |
c |
d |
0 |
0.0 |
0.0 |
0.0 |
0.0 |
1 |
0.0 |
0.0 |
0.0 |
0.0 |
2 |
0.0 |
0.0 |
0.0 |
0.0 |
3 |
0.0 |
0.0 |
0.0 |
0.0 |
4 |
0.0 |
0.0 |
0.0 |
0.0 |
5 |
0.0 |
0.0 |
0.0 |
0.0 |
pd.concat( [a, b], axis=1)
|
a |
b |
c |
d |
a |
b |
c |
d |
0 |
0.0 |
0.0 |
0.0 |
0.0 |
0.0 |
0.0 |
0.0 |
0.0 |
1 |
0.0 |
0.0 |
0.0 |
0.0 |
0.0 |
0.0 |
0.0 |
0.0 |
2 |
0.0 |
0.0 |
0.0 |
0.0 |
0.0 |
0.0 |
0.0 |
0.0 |
pd.concat( [a, b], axis=1, ignore_index=True)
|
0 |
1 |
2 |
3 |
4 |
5 |
6 |
7 |
0 |
0.0 |
0.0 |
0.0 |
0.0 |
0.0 |
0.0 |
0.0 |
0.0 |
1 |
0.0 |
0.0 |
0.0 |
0.0 |
0.0 |
0.0 |
0.0 |
0.0 |
2 |
0.0 |
0.0 |
0.0 |
0.0 |
0.0 |
0.0 |
0.0 |
0.0 |
a = pd.DataFrame(np.ones( (3,3)), index=[0, 1, 2], columns=list('abc'))
b = pd.DataFrame(np.ones( (3,3)), index=[2, 3, 4], columns = list('cde'))
pd.concat([a, b])
|
a |
b |
c |
d |
e |
0 |
1.0 |
1.0 |
1.0 |
NaN |
NaN |
1 |
1.0 |
1.0 |
1.0 |
NaN |
NaN |
2 |
1.0 |
1.0 |
1.0 |
NaN |
NaN |
2 |
NaN |
NaN |
1.0 |
1.0 |
1.0 |
3 |
NaN |
NaN |
1.0 |
1.0 |
1.0 |
4 |
NaN |
NaN |
1.0 |
1.0 |
1.0 |
pd.concat( [a, b], axis=1)
|
a |
b |
c |
c |
d |
e |
0 |
1.0 |
1.0 |
1.0 |
NaN |
NaN |
NaN |
1 |
1.0 |
1.0 |
1.0 |
NaN |
NaN |
NaN |
2 |
1.0 |
1.0 |
1.0 |
1.0 |
1.0 |
1.0 |
3 |
NaN |
NaN |
NaN |
1.0 |
1.0 |
1.0 |
4 |
NaN |
NaN |
NaN |
1.0 |
1.0 |
1.0 |
a = pd.DataFrame( np.ones( (3,4)), index=[0, 1, 2], columns=['a', 'b','c','d'])
a
|
a |
b |
c |
d |
0 |
1.0 |
1.0 |
1.0 |
1.0 |
1 |
1.0 |
1.0 |
1.0 |
1.0 |
2 |
1.0 |
1.0 |
1.0 |
1.0 |
b = pd.Series([100, 100, 100, 100], index=list('abcd'))
b
a 100
b 100
c 100
d 100
dtype: int64
a.append(b, ignore_index=True)
|
a |
b |
c |
d |
0 |
1.0 |
1.0 |
1.0 |
1.0 |
1 |
1.0 |
1.0 |
1.0 |
1.0 |
2 |
1.0 |
1.0 |
1.0 |
1.0 |
3 |
100.0 |
100.0 |
100.0 |
100.0 |
a = pd.DataFrame([[-1, 1],
[-2, 0]], index=[1, 2], columns=["A", "B"]);
b = pd.DataFrame([[1, 11],
[0, 10]], index=[1, 2], columns= ['B', 'C']);
print(a)
print(b)
A B
1 -1 1
2 -2 0
B C
1 1 11
2 0 10
pd.merge( a,b)
|
A |
B |
C |
0 |
-1 |
1 |
11 |
1 |
-2 |
0 |
10 |
b = pd.DataFrame([[0, 20],
[1, 21]], index=[1,2], columns = ['B', 'C']);
b
a
pd.merge(a, b)
|
A |
B |
C |
0 |
-1 |
1 |
21 |
1 |
-2 |
0 |
20 |
分组
df = pd.DataFrame({
'key':list('ABCCBA'),
'data1':range(6),
'data2':range(20,26)
})
df
|
data1 |
data2 |
key |
0 |
0 |
20 |
A |
1 |
1 |
21 |
B |
2 |
2 |
22 |
C |
3 |
3 |
23 |
C |
4 |
4 |
24 |
B |
5 |
5 |
25 |
A |
groups = df.groupby('key')
groups
groups.sum()
|
data1 |
data2 |
key |
|
|
A |
5 |
45 |
B |
5 |
45 |
C |
5 |
45 |
groups.data1.sum()
key
A 5
B 5
C 5
Name: data1, dtype: int32
groups.median()
|
data1 |
data2 |
key |
|
|
A |
2.5 |
22.5 |
B |
2.5 |
22.5 |
C |
2.5 |
22.5 |
groups['data1'].mean()
key
A 2.5
B 2.5
C 2.5
Name: data1, dtype: float64
groups.apply(lambda x:x['data1']/x['data1'].sum())
key
A 0 0.0
5 1.0
B 1 0.2
4 0.8
C 2 0.4
3 0.6
Name: data1, dtype: float64
def func(x):
x['data1'] /= x['data1'].sum()
return x
groups.apply(func)
|
data1 |
data2 |
0 |
0.0 |
20 |
1 |
0.2 |
21 |
2 |
0.4 |
22 |
3 |
0.6 |
23 |
4 |
0.8 |
24 |
5 |
1.0 |
25 |
def func(x):
x['data1'] /= x['data1'].sum()
return x
df.groupby('key').apply(func)
|
data1 |
data2 |
key |
0 |
0.0 |
20 |
A |
1 |
0.2 |
21 |
B |
2 |
0.4 |
22 |
C |
3 |
0.6 |
23 |
C |
4 |
0.8 |
24 |
B |
5 |
1.0 |
25 |
A |
数据透视表
import seaborn as sns
titanic = sns.load_dataset('titanic')
titanic.head()
|
survived |
pclass |
sex |
age |
sibsp |
parch |
fare |
embarked |
class |
who |
adult_male |
deck |
embark_town |
alive |
alone |
0 |
0 |
3 |
male |
22.0 |
1 |
0 |
7.2500 |
S |
Third |
man |
True |
NaN |
Southampton |
no |
False |
1 |
1 |
1 |
female |
38.0 |
1 |
0 |
71.2833 |
C |
First |
woman |
False |
C |
Cherbourg |
yes |
False |
2 |
1 |
3 |
female |
26.0 |
0 |
0 |
7.9250 |
S |
Third |
woman |
False |
NaN |
Southampton |
yes |
True |
3 |
1 |
1 |
female |
35.0 |
1 |
0 |
53.1000 |
S |
First |
woman |
False |
C |
Southampton |
yes |
False |
4 |
0 |
3 |
male |
35.0 |
0 |
0 |
8.0500 |
S |
Third |
man |
True |
NaN |
Southampton |
no |
True |
titanic.pivot_table('survived', index='sex', columns='class')
class |
First |
Second |
Third |
sex |
|
|
|
female |
0.968085 |
0.921053 |
0.500000 |
male |
0.368852 |
0.157407 |
0.135447 |