A Quick Pandas Summary

The Series data structure

# A Series is a labeled, one-dimensional column of data
import pandas as pd

a = pd.Series([1, 2, 3, 4, 5])
a
0    1
1    2
2    3
3    4
4    5
dtype: int64
# Passing an explicit index (and dtype)
a = pd.Series([1, 2, 3, 4, 5], index=['a', 'b', 'c', 'd', 'e'], dtype=float)
a
a    1.0
b    2.0
c    3.0
d    4.0
e    5.0
dtype: float64
# Creating a Series from a numpy ndarray
import numpy as np

a = np.arange(5)
b = pd.Series(a)
print(b)
print(type(a))
0    0
1    1
2    2
3    3
4    4
dtype: int32
<class 'numpy.ndarray'>
# Creating a Series from a dict
dic = {'name': 'Lee', 'sex': 'man', 'age': 18}
a = pd.Series(dic)
print(a)
age      18
name    Lee
sex     man
dtype: object
# When an index is passed explicitly it takes precedence: each label is looked up
# in the dict, and labels with no matching key become NaN
my_dict = {'name': 'xing', 'sex': 'man', 'age': 18}
a = pd.Series(my_dict, index=['name', 'color'])
a
name     xing
color     NaN
dtype: object
# When the index is longer than the data, the value is broadcast to fill it
a = pd.Series(5, [0, 1, 2])
a
0    5
1    5
2    5
dtype: int64
a = pd.Series([3], [0, 1, 2])
a
0    3
1    3
2    3
dtype: int64

The DataFrame data structure

# Creating a DataFrame from a numpy array
a = np.random.randint(0, 10, (2, 3))
df = pd.DataFrame(a, index=['a', 'b'], columns=['x', 'y', 'z'])
df
   x  y  z
a  4  3  2
b  5  6  9
# Turning a Series into a DataFrame
# A flat dict of scalars cannot be turned into a DataFrame directly, so go through a Series
population = {'beijing': 3434, 'shanghai': 2343, 'guangzhou': 11232}
s = pd.Series(population)
df = pd.DataFrame(s)
df
               0
beijing     3434
guangzhou  11232
shanghai    2343
type(df)
pandas.core.frame.DataFrame
# Again from the Series, but giving the column a name
df = pd.DataFrame(s, columns=['pop_num'])
df
           pop_num
beijing       3434
guangzhou    11232
shanghai      2343
# A dict of dicts can build a DataFrame directly
popu = {'bj': 9898, 'sh': 89887, 'gz': 11232}
df = pd.DataFrame({'gdp': popu})  # dict of dicts
df
      gdp
bj   9898
gz  11232
sh  89887
gdp = {'bj': 0.998, 'sh': 0.889, 'gz': 1.232}
df = pd.DataFrame({'gdp': gdp, 'popu': popu})
df
      gdp   popu
bj  0.998   9898
gz  1.232  11232
sh  0.889  89887
# A single scalar value is automatically broadcast to every row
df = pd.DataFrame({'gdp': gdp, 'popu': popu, 'country': 'China'})
df
   country    gdp   popu
bj   China  0.998   9898
gz   China  1.232  11232
sh   China  0.889  89887
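As a brief extra sketch (not from the original post), two other common constructors: a dict of lists and a list of dicts.
# Sketch: other common ways to build a DataFrame
df1 = pd.DataFrame({'city': ['bj', 'sh'], 'popu': [9898, 89887]})  # dict of lists
df2 = pd.DataFrame([{'city': 'bj', 'popu': 9898},
                    {'city': 'sh', 'popu': 89887}])                # list of dicts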

Attributes of pandas data structures

# The values attribute converts the data to a numpy array
df = pd.DataFrame({'gdp': gdp, 'popu': popu})
df
      gdp   popu
bj  0.998   9898
gz  1.232  11232
sh  0.889  89887
df.values
array([[  9.98000000e-01,   9.89800000e+03],
       [  1.23200000e+00,   1.12320000e+04],
       [  8.89000000e-01,   8.98870000e+04]])
# With a mixed-dtype DataFrame, values falls back to an object array
df = pd.DataFrame({'gdp': gdp, 'popu': popu, 'country': "China"})
df
   country    gdp   popu
bj   China  0.998   9898
gz   China  1.232  11232
sh   China  0.889  89887
df.values  # numpy arrays are faster for computation
array([['China', 0.998, 9898],
       ['China', 1.232, 11232],
       ['China', 0.889, 89887]], dtype=object)
df.index
Index(['bj', 'gz', 'sh'], dtype='object')
df.columns
Index(['country', 'gdp', 'popu'], dtype='object')
df.shape
(3, 3)
df.dtypes
country     object
gdp        float64
popu         int64
dtype: object
df.size
9
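A small addition: in pandas 0.24 and later, to_numpy() is the recommended way to get the underlying array; a sketch assuming the same df as above.
# Sketch: to_numpy() is the modern equivalent of .values
arr = df.to_numpy()                              # same data as df.values
num = df[['gdp', 'popu']].to_numpy(dtype=float)  # select the numeric columns to avoid an object array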

Selecting data by index

df = pd.DataFrame({'gdp': gdp, 'popu': popu})
df
      gdp   popu
bj  0.998   9898
gz  1.232  11232
sh  0.889  89887
# Select a single column
df['gdp']
bj    0.998
gz    1.232
sh    0.889
Name: gdp, dtype: float64
df.gdp  # attribute access, shorthand for the above
bj    0.998
gz    1.232
sh    0.889
Name: gdp, dtype: float64
# Select a single row
df.loc['sh']
gdp         0.889
popu    89887.000
Name: sh, dtype: float64
df.loc[['sh', 'bj']]  # pass a list to select multiple rows
      gdp   popu
sh  0.889  89887
bj  0.998   9898
df.loc['bj':'gz']  # label slicing is inclusive on both ends
      gdp   popu
bj  0.998   9898
gz  1.232  11232
# Select by integer position
df.iloc[0]
gdp        0.998
popu    9898.000
Name: bj, dtype: float64
df.iloc[[0, 2]]
      gdp   popu
bj  0.998   9898
sh  0.889  89887
df.loc['sh', 'gdp']  # down to a single cell
0.88900000000000001
# iloc can also pick out a single cell
df.iloc[0, 1]
9898
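For single cells, pandas also offers the faster scalar accessors at (by label) and iat (by position); a short sketch, not in the original.
# Sketch: scalar accessors for one cell
df.at['sh', 'gdp']   # like loc, but only for a single value
df.iat[0, 1]         # like iloc, but only for a single value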
# Convert to an ndarray first, then index into it
df.values[0][1]
9898.0
# 1: means rows from position 1 to the end; : means all columns
df.iloc[1:, :]
      gdp   popu
gz  1.232  11232
sh  0.889  89887
df.gdp > 0
bj    True
gz    True
sh    True
Name: gdp, dtype: bool
df.gdp > 0.9
bj     True
gz     True
sh    False
Name: gdp, dtype: bool
df.loc[df.gdp > 0.9]  # filter rows with a boolean mask
      gdp   popu
bj  0.998   9898
gz  1.232  11232
df[df.gdp > 0.9]  # equivalent shorthand
      gdp   popu
bj  0.998   9898
gz  1.232  11232
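Boolean masks can be combined with & and |, with parentheses around each condition; a sketch using the same df.
# Sketch: combining boolean conditions
df[(df.gdp > 0.9) & (df.popu > 10000)]   # AND
df[(df.gdp > 1.0) | (df.popu < 10000)]   # OR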

Assignment in a DataFrame

df.iloc[0, 1] = 0  # modify a single cell
df
      gdp   popu
bj  0.998      0
gz  1.232  11232
sh  0.889  89887
new_column = pd.Series(['010', '020', '0755'], index=['bj', 'sh', 'gz'])
new_column
bj     010
sh     020
gz    0755
dtype: object
df['tel'] = new_column  # add a new column; values are aligned by index label
df
      gdp   popu   tel
bj  0.998      0   010
gz  1.232  11232  0755
sh  0.889  89887   020
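Assignment through loc can also create an entirely new row by label; a sketch where 'sz' and its values are made up for illustration.
# Sketch: loc assignment can add a new row ('sz' and its values are hypothetical)
df.loc['sz'] = [1.5, 50000, '0755']   # gdp, popu, tel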

Inspecting the basic characteristics of the data

dates = pd.date_range('2020-1-1', periods=6)
dates
DatetimeIndex(['2020-01-01', '2020-01-02', '2020-01-03', '2020-01-04',
               '2020-01-05', '2020-01-06'],
              dtype='datetime64[ns]', freq='D')
df = pd.DataFrame(np.random.randint(0, 10, (6, 5)), index=dates, columns=list('ABCDE'))
df
            A  B  C  D  E
2020-01-01  4  9  8  8  8
2020-01-02  4  5  5  8  7
2020-01-03  1  0  3  2  0
2020-01-04  9  5  9  6  0
2020-01-05  7  8  3  8  0
2020-01-06  4  3  1  0  8
df.describe()  # per-column summary statistics, a quick look at the overall structure
              A         B         C         D         E
count  6.000000  6.000000  6.000000  6.000000  6.000000
mean   4.833333  5.000000  4.833333  5.333333  3.833333
std    2.786874  3.286335  3.125167  3.502380  4.215052
min    1.000000  0.000000  1.000000  0.000000  0.000000
25%    4.000000  3.500000  3.000000  3.000000  0.000000
50%    4.000000  5.000000  4.000000  7.000000  3.500000
75%    6.250000  7.250000  7.250000  8.000000  7.750000
max    9.000000  9.000000  9.000000  8.000000  8.000000
df.info()  # concise summary: index, column dtypes, non-null counts, memory usage
<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 6 entries, 2020-01-01 to 2020-01-06
Freq: D
Data columns (total 5 columns):
A    6 non-null int32
B    6 non-null int32
C    6 non-null int32
D    6 non-null int32
E    6 non-null int32
dtypes: int32(5)
memory usage: 168.0 bytes
df.head(1)  # first n rows
            A  B  C  D  E
2020-01-01  4  9  8  8  8
df.tail(2)  # last n rows
            A  B  C  D  E
2020-01-05  7  8  3  8  0
2020-01-06  4  3  1  0  8
df.T  # transpose
   2020-01-01 00:00:00  2020-01-02 00:00:00  2020-01-03 00:00:00  2020-01-04 00:00:00  2020-01-05 00:00:00  2020-01-06 00:00:00
A                    4                    4                    1                    9                    7                    4
B                    9                    5                    0                    5                    8                    3
C                    8                    5                    3                    9                    3                    1
D                    8                    8                    2                    6                    8                    0
E                    8                    7                    0                    0                    0                    8
df.sort_index()  # by default, sort by the row index in ascending order
            A  B  C  D  E
2020-01-01  4  9  8  8  8
2020-01-02  4  5  5  8  7
2020-01-03  1  0  3  2  0
2020-01-04  9  5  9  6  0
2020-01-05  7  8  3  8  0
2020-01-06  4  3  1  0  8
df.sort_index(ascending=False)  # sort by the row index in descending order
            A  B  C  D  E
2020-01-06  4  3  1  0  8
2020-01-05  7  8  3  8  0
2020-01-04  9  5  9  6  0
2020-01-03  1  0  3  2  0
2020-01-02  4  5  5  8  7
2020-01-01  4  9  8  8  8
df.sort_index(axis=1, ascending=False)  # sort by the column index, descending
            E  D  C  B  A
2020-01-01  8  8  8  9  4
2020-01-02  7  8  5  5  4
2020-01-03  0  2  3  0  1
2020-01-04  0  6  9  5  9
2020-01-05  0  8  3  8  7
2020-01-06  8  0  1  3  4
# Sorting by values
df.sort_values('B')  # by default, sort the rows by the values in one column
            A  B  C  D  E
2020-01-03  1  0  3  2  0
2020-01-06  4  3  1  0  8
2020-01-02  4  5  5  8  7
2020-01-04  9  5  9  6  0
2020-01-05  7  8  3  8  0
2020-01-01  4  9  8  8  8
df.sort_values(dates[0], axis=1)  # sort the columns by the values in one row
            A  C  D  E  B
2020-01-01  4  8  8  8  9
2020-01-02  4  5  8  7  5
2020-01-03  1  3  2  0  0
2020-01-04  9  9  6  0  5
2020-01-05  7  3  8  0  8
2020-01-06  4  1  0  8  3
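Two related operations, sketched here as an addition: a DatetimeIndex can be sliced by date labels, and sort_values accepts several columns.
# Sketch: date-label slicing and multi-column sorting
df.loc['2020-01-02':'2020-01-04']                     # inclusive slice on the DatetimeIndex
df.sort_values(['B', 'A'], ascending=[True, False])   # sort by B, break ties with A descending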

Computation

a = pd.DataFrame([1, 2, 3])
a
   0
0  1
1  2
2  3
a - 2
   0
0 -1
1  0
2  1
b = pd.DataFrame([1, 3, 4])
a + b
   0
0  2
1  5
2  7
a * b
    0
0   1
1   6
2  12
b.T
   0  1  2
0  1  3  4
a.dot(b.T)  # matrix multiplication
   0  1   2
0  1  3   4
1  2  6   8
2  3  9  12
a = pd.DataFrame(np.random.randint(0, 20, (2, 2)), columns=['A', 'B'])
a
    A   B
0  17   1
1   4  11
b = pd.DataFrame(np.random.randint(0, 20, (3, 3)), columns=['A', 'B', 'C'])
b
   A   B   C
0  9   5  17
1  9  12  16
2  0  13   4
a + b  # cells present in both frames are added; anything missing becomes NaN
      A     B   C
0  26.0   6.0 NaN
1  13.0  23.0 NaN
2   NaN   NaN NaN
a.add(b, fill_value=11111111)  # missing cells are filled first so the shapes match, then added
            A           B           C
0        26.0         6.0  11111128.0
1        13.0        23.0  11111127.0
2  11111111.0  11111124.0  11111115.0
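The same fill_value idea works for the other flexible arithmetic methods; a short sketch.
# Sketch: sub/mul/div also accept fill_value
a.sub(b, fill_value=0)   # subtraction
a.mul(b, fill_value=1)   # multiplication
a.div(b, fill_value=1)   # division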

Handling missing values

a = pd.DataFrame(np.arange(9).reshape(3, 3))
a
   0  1  2
0  0  1  2
1  3  4  5
2  6  7  8
a.iloc[:2, 2] = np.nan
a
   0  1    2
0  0  1  NaN
1  3  4  NaN
2  6  7  8.0
# Drop missing values; dropna returns a new object, so a itself is unchanged
a.dropna()
a
   0  1    2
0  0  1  NaN
1  3  4  NaN
2  6  7  8.0
a.dropna()  # drop any row that contains a missing value
   0  1    2
2  6  7  8.0
a.dropna(axis=1)  # drop any column that contains a missing value
   0  1
0  0  1
1  3  4
2  6  7
a.dropna(axis=1, how='all')  # drop a column only if every value in it is missing
   0  1    2
0  0  1  NaN
1  3  4  NaN
2  6  7  8.0
a.fillna(999)  # fill missing values with a given value
   0  1      2
0  0  1  999.0
1  3  4  999.0
2  6  7    8.0
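A few related helpers, added here as a sketch: counting missing values and filling them from the data itself.
# Sketch: inspect and fill missing values
a.isnull().sum()     # number of NaN values per column
a.fillna(a.mean())   # fill each column's NaN with that column's mean
a.ffill()            # carry the previous valid value forward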

Merging and alignment

a = pd.DataFrame(np.zeros((3, 4)), columns=['a', 'b', 'c', 'd'])
a
     a    b    c    d
0  0.0  0.0  0.0  0.0
1  0.0  0.0  0.0  0.0
2  0.0  0.0  0.0  0.0
b = pd.DataFrame(np.zeros((3, 4)), columns=list('abcd'))
b
     a    b    c    d
0  0.0  0.0  0.0  0.0
1  0.0  0.0  0.0  0.0
2  0.0  0.0  0.0  0.0
# Concatenation
pd.concat([a, b])  # concat takes a list of objects
     a    b    c    d
0  0.0  0.0  0.0  0.0
1  0.0  0.0  0.0  0.0
2  0.0  0.0  0.0  0.0
0  0.0  0.0  0.0  0.0
1  0.0  0.0  0.0  0.0
2  0.0  0.0  0.0  0.0
# Let pandas rebuild the row index
pd.concat([a, b], ignore_index=True)
     a    b    c    d
0  0.0  0.0  0.0  0.0
1  0.0  0.0  0.0  0.0
2  0.0  0.0  0.0  0.0
3  0.0  0.0  0.0  0.0
4  0.0  0.0  0.0  0.0
5  0.0  0.0  0.0  0.0
# Horizontal concatenation
pd.concat([a, b], axis=1)
     a    b    c    d    a    b    c    d
0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0
1  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0
2  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0
pd.concat([a, b], axis=1, ignore_index=True)  # ignore_index discards the duplicated column labels
     0    1    2    3    4    5    6    7
0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0
1  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0
2  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0
# What happens when the shapes differ
a = pd.DataFrame(np.ones((3, 3)), index=[0, 1, 2], columns=list('abc'))
b = pd.DataFrame(np.ones((3, 3)), index=[2, 3, 4], columns=list('cde'))
pd.concat([a, b])
     a    b    c    d    e
0  1.0  1.0  1.0  NaN  NaN
1  1.0  1.0  1.0  NaN  NaN
2  1.0  1.0  1.0  NaN  NaN
2  NaN  NaN  1.0  1.0  1.0
3  NaN  NaN  1.0  1.0  1.0
4  NaN  NaN  1.0  1.0  1.0
pd.concat([a, b], axis=1)
     a    b    c    c    d    e
0  1.0  1.0  1.0  NaN  NaN  NaN
1  1.0  1.0  1.0  NaN  NaN  NaN
2  1.0  1.0  1.0  1.0  1.0  1.0
3  NaN  NaN  NaN  1.0  1.0  1.0
4  NaN  NaN  NaN  1.0  1.0  1.0
# How to append a single row
a = pd.DataFrame(np.ones((3, 4)), index=[0, 1, 2], columns=['a', 'b', 'c', 'd'])
a
     a    b    c    d
0  1.0  1.0  1.0  1.0
1  1.0  1.0  1.0  1.0
2  1.0  1.0  1.0  1.0
b = pd.Series([100, 100, 100, 100], index=list('abcd'))
b
a    100
b    100
c    100
d    100
dtype: int64
a.append(b, ignore_index=True)  # append a row; DataFrame.append was removed in pandas 2.0 (see the concat sketch below)
       a      b      c      d
0    1.0    1.0    1.0    1.0
1    1.0    1.0    1.0    1.0
2    1.0    1.0    1.0    1.0
3  100.0  100.0  100.0  100.0
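Since DataFrame.append was removed in pandas 2.0, the same row-append can be written with pd.concat; a sketch using the a and b defined above.
# Sketch: the pd.concat equivalent of a.append(b, ignore_index=True)
pd.concat([a, b.to_frame().T], ignore_index=True)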
# Alignment with merge: rows are matched up on the shared column(s),
# reordering rows as needed so the joined values stay consistent
a = pd.DataFrame([[-1, 1],
                  [-2, 0]], index=[1, 2], columns=['A', 'B'])
b = pd.DataFrame([[1, 11],
                  [0, 10]], index=[1, 2], columns=['B', 'C'])
print(a)
print(b)
   A  B
1 -1  1
2 -2  0
   B   C
1  1  11
2  0  10
pd.merge(a, b)  # joins on the shared column B by default
   A  B   C
0 -1  1  11
1 -2  0  10
b = pd.DataFrame([[0, 20],
                  [1, 21]], index=[1, 2], columns=['B', 'C'])
b
   B   C
1  0  20
2  1  21
a
   A  B
1 -1  1
2 -2  0
pd.merge(a, b)  # rows are reordered so that matching B values line up
   A  B   C
0 -1  1  21
1 -2  0  20
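merge joins on all shared column names by default; the join key and join type can be made explicit, as in this sketch.
# Sketch: explicit join key and join type
pd.merge(a, b, on='B', how='inner')   # the default behaviour here
pd.merge(a, b, on='B', how='outer')   # keep rows from both sides, NaN where there is no match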

Grouping

df = pd.DataFrame({
    'key': list('ABCCBA'),
    'data1': range(6),       # range is built into Python; np.arange comes from numpy
    'data2': range(20, 26)
})
df
   data1  data2 key
0      0     20   A
1      1     21   B
2      2     22   C
3      3     23   C
4      4     24   B
5      5     25   A
groups = df.groupby('key')
groups
<pandas.core.groupby.generic.DataFrameGroupBy object at 0x...>
groups.sum()  # the sum within each group
     data1  data2
key
A        5     45
B        5     45
C        5     45
groups.data1.sum()  # sum just one column within each group
key
A    5
B    5
C    5
Name: data1, dtype: int32
groups.median()
     data1  data2
key
A      2.5   22.5
B      2.5   22.5
C      2.5   22.5
groups['data1'].mean()  # the return value is a Series
key
A    2.5
B    2.5
C    2.5
Name: data1, dtype: float64
groups.apply(lambda x: x['data1'] / x['data1'].sum())
key   
A    0    0.0
     5    1.0
B    1    0.2
     4    0.8
C    2    0.4
     3    0.6
Name: data1, dtype: float64
def func(x):
    x['data1'] /= x['data1'].sum()
    return x

groups.apply(func)
   data1  data2
0    0.0     20
1    0.2     21
2    0.4     22
3    0.6     23
4    0.8     24
5    1.0     25
def func(x):
    x['data1'] /= x['data1'].sum()
    return x

df.groupby('key').apply(func)  # per-group normalization is easy to write this way
   data1  data2 key
0    0.0     20   A
1    0.2     21   B
2    0.4     22   C
3    0.6     23   C
4    0.8     24   B
5    1.0     25   A
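Groups can also be aggregated with several functions at once via agg; a sketch using the same df.
# Sketch: multiple aggregations per group
df.groupby('key').agg(['sum', 'mean'])                     # two statistics for every numeric column
df.groupby('key').agg({'data1': 'sum', 'data2': 'mean'})   # a different statistic per column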

Pivot tables

import seaborn as sns
titanic = sns.load_dataset('titanic')
titanic.head()
   survived  pclass     sex   age  sibsp  parch     fare embarked  class    who  adult_male deck  embark_town alive  alone
0         0       3    male  22.0      1      0   7.2500        S  Third    man        True  NaN  Southampton    no  False
1         1       1  female  38.0      1      0  71.2833        C  First  woman       False    C    Cherbourg   yes  False
2         1       3  female  26.0      0      0   7.9250        S  Third  woman       False  NaN  Southampton   yes   True
3         1       1  female  35.0      1      0  53.1000        S  First  woman       False    C  Southampton   yes  False
4         0       3    male  35.0      0      0   8.0500        S  Third    man        True  NaN  Southampton    no   True
titanic.pivot_table('survived', index='sex', columns='class')  # pivot table: mean survival rate by sex and class
class      First    Second     Third
sex
female  0.968085  0.921053  0.500000
male    0.368852  0.157407  0.135447
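pivot_table aggregates with the mean by default; the aggregation function and overall totals can be set explicitly, as in this sketch.
# Sketch: explicit aggfunc and row/column totals
titanic.pivot_table('survived', index='sex', columns='class', aggfunc='mean', margins=True)
titanic.pivot_table('fare', index='sex', columns='class', aggfunc='sum')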
