pandas基本使用的练习笔记

1. 数据索引

import pandas as pd
# Two demo frames built from a dict of lists: each dict key becomes a column
# and each list supplies that column's values, one entry per row.
left = pd.DataFrame({
    'key': ['k{}'.format(n) for n in range(1, 5)],
    'name': ['n{}'.format(n) for n in range(1, 5)],
    'id': ['id{}'.format(n) for n in range(1, 5)],
})

right = pd.DataFrame({
    'key': ['k{}'.format(n) for n in range(221, 225)],
    'name': ['n{}'.format(n) for n in range(221, 225)],
    'id': ['id{}'.format(n) for n in range(221, 225)],
})

# Show both frames, one after the other.
print(left, right, sep='\n')
    id key name
0  id1  k1   n1
1  id2  k2   n2
2  id3  k3   n3
3  id4  k4   n4
      id   key  name
0  id221  k221  n221
1  id222  k222  n222
2  id223  k223  n223
3  id224  k224  n224

A 选取行

# 1. 用数值取
left.loc[2]
id      id3
key      k3
name     n3
Name: 2, dtype: object
# 2. 用列表取
left.loc[ [1,2,3]] # 注意外层的 [](传入的是标签列表)
id key name
1 id2 k2 n2
2 id3 k3 n3
3 id4 k4 n4
#3. 用切片
left.loc[ 1:2 ]
id key name
1 id2 k2 n2
2 id3 k3 n3
#4. 用bool做索引,先要得到bool series
mask = left['name'] > 'n1'
mask
0    False
1     True
2     True
3     True
Name: name, dtype: bool
#4.1 使用 bool series
left.loc[ mask]
id key name
1 id2 k2 n2
2 id3 k3 n3
3 id4 k4 n4
# 5. iloc
left.iloc[3]
id      id4
key      k4
name     n4
Name: 3, dtype: object
# 5.1 iloc
left.iloc[ [0,1]]
id key name
0 id1 k1 n1
1 id2 k2 n2
# 5.2 iloc && 切片
left.iloc[ 0:2]
id key name
0 id1 k1 n1
1 id2 k2 n2
left.loc[ 0:2] # 注意区别:loc 切片左闭右闭(包含末端标签 2),iloc 切片左闭右开
id key name
0 id1 k1 n1
1 id2 k2 n2
2 id3 k3 n3
left.set_index('key')
id name
key
k1 id1 n1
k2 id2 n2
k3 id3 n3
k4 id4 n4
left.set_index('name')
id key
name
n1 id1 k1
n2 id2 k2
n3 id3 k3
n4 id4 k4
left.loc[ 0:2]
id key name
0 id1 k1 n1
1 id2 k2 n2
2 id3 k3 n3
left_reset_index = left.set_index("key")
left_reset_index
id name
key
k1 id1 n1
k2 id2 n2
k3 id3 n3
k4 id4 n4
left_reset_index.loc[ ['k1']]
id name
key
k1 id1 n1
left_reset_index.iloc[ [0,2]]
id name
key
k1 id1 n1
k3 id3 n3

B. 选取列

# 1. 列名
left[ 'name']
0    n1
1    n2
2    n3
3    n4
Name: name, dtype: object
# 2. 列表
left[ ['name', 'id']]
name id
0 n1 id1
1 n2 id2
2 n3 id3
3 n4 id4
# 3. 切片
left[ 0:2]  # 注意:对 DataFrame 直接切片选取的是行,而不是列
id key name
0 id1 k1 n1
1 id2 k2 n2
# 4. bool
left[ left['name'] > 'n2']
id key name
2 id3 k3 n3
3 id4 k4 n4

C. 选取行和列

#1. 位置
left
left.loc[0, "key"]
'k1'
# 2. 列表
left.loc[ [1,2], ['name', 'id']]
name id
1 n2 id2
2 n3 id3
# 3. 切片
left.loc[ 0:2, :]
id key name
0 id1 k1 n1
1 id2 k2 n2
2 id3 k3 n3
# 4.iloc
left.iloc[ [0,1],[0,2]]
id name
0 id1 n1
1 id2 n2
left.iloc[ [0,1],[0,1]]
id key
0 id1 k1
1 id2 k2

基本操作

# 得到列名们
left.columns 
Index(['id', 'key', 'name'], dtype='object')
#修改列名
left.columns = ["xuhao", 'guanjianzhi', 'mingzi'] 
left
xuhao guanjianzhi mingzi
0 id1 k1 n1
1 id2 k2 n2
2 id3 k3 n3
3 id4 k4 n4
# 获取列的简单方式:点(.)语法
left.xuhao
0    id1
1    id2
2    id3
3    id4
Name: xuhao, dtype: object
# 重命名列
left.rename(columns={
     "xuhao":"ID", "guanjianzhi":"KEY", "mingzi":"NAME"})

ID KEY NAME
0 id1 k1 n1
1 id2 k2 n2
2 id3 k3 n3
3 id4 k4 n4
left
xuhao guanjianzhi mingzi
0 id1 k1 n1
1 id2 k2 n2
2 id3 k3 n3
3 id4 k4 n4
# 就地修改 inplace 参数
left.rename( columns={
     "xuhao":"ID", "guanjianzhi":"KEY", "mingzi":"NAME"}, inplace=True)
left
ID KEY NAME
0 id1 k1 n1
1 id2 k2 n2
2 id3 k3 n3
3 id4 k4 n4
# Replace individual cell values with a dict of dicts: the outer key names the
# column to edit, the inner dict maps each old value to its new value.
# inplace=True mutates `left` and returns None, so no trailing `;` is needed.
left.replace({"ID": {'id1': "ID1", 'id2': 'ID2'}}, inplace=True)
left
ID KEY NAME
0 ID1 k1 n1
1 ID2 k2 n2
2 id3 k3 n3
3 id4 k4 n4

基本的排序和增删

# 有多少种取值可能
left.ID.unique()
array(['ID1', 'ID2', 'id3', 'id4'], dtype=object)
# 每一种取值的数量
left.ID.value_counts()
ID1    1
id3    1
ID2    1
id4    1
Name: ID, dtype: int64
# 排序
left.sort_values('ID')
ID KEY NAME
0 ID1 k1 n1
1 ID2 k2 n2
2 id3 k3 n3
3 id4 k4 n4
# 多个key 排序
left.sort_values(['ID', 'NAME'], ascending=[True, False])
ID KEY NAME
0 ID1 k1 n1
1 ID2 k2 n2
2 id3 k3 n3
3 id4 k4 n4
left.sort_values( ['ID', "NAME"], ascending=[False, False])
ID KEY NAME
3 id4 k4 n4
2 id3 k3 n3
1 ID2 k2 n2
0 ID1 k1 n1
left.replace( {
     "ID":{
     'id3':'ID3', 'ID2':'id2'}}, inplace=True)
left
ID KEY NAME
0 ID1 k1 n1
1 id2 k2 n2
2 ID3 k3 n3
3 id4 k4 n4
left.sort_values(['ID'], ascending=[True])
ID KEY NAME
0 ID1 k1 n1
2 ID3 k3 n3
1 id2 k2 n2
3 id4 k4 n4
left['mailema'] = 1
left
ID KEY NAME mailema
0 ID1 k1 n1 1
1 id2 k2 n2 1
2 ID3 k3 n3 1
3 id4 k4 n4 1
left['rongyuceshilie'] = 1
left
ID KEY NAME mailema rongyuceshilie
0 ID1 k1 n1 1 1
1 id2 k2 n2 1 1
2 ID3 k3 n3 1 1
3 id4 k4 n4 1 1
# 删除列
del left['rongyuceshilie']
left
ID KEY NAME mailema
0 ID1 k1 n1 1
1 id2 k2 n2 1
2 ID3 k3 n3 1
3 id4 k4 n4 1
# 删除行
ret = left.drop(labels=2)
print(left)
print(ret)
    ID KEY NAME  mailema
0  ID1  k1   n1        1
1  id2  k2   n2        1
2  ID3  k3   n3        1
3  id4  k4   n4        1
    ID KEY NAME  mailema
0  ID1  k1   n1        1
1  id2  k2   n2        1
3  id4  k4   n4        1
# 修改一列的数值 map 结合字典
left.NAME
ret = left.NAME.map({
     'n1':'NNN1','n2':"N2", 'n3':'NN3', 'n4':"NN4"})
print(ret)
print(left.NAME)
0    NNN1
1      N2
2     NN3
3     NN4
Name: NAME, dtype: object
0    n1
1    n2
2    n3
3    n4
Name: NAME, dtype: object
# map 修改一列 map结合函数
import numpy as np

# map() also accepts a callable; here the bound str.format formats each value.
# The result is not assigned, so this line only demonstrates the call.
left.NAME.map("hello my name is {}".format)
test_series = left.NAME
yy = pd.Series(['name_ext', np.nan])
print(yy)
# Series.append was removed in pandas 2.0; pd.concat is the supported
# equivalent and keeps each piece's own index labels (0..3, then 0, 1).
test_series = pd.concat([test_series, yy])
# na_action='ignore' skips NaN entries, so they stay NaN instead of being
# passed to the formatter.
test_series = test_series.map("my name is {}".format, na_action='ignore')
print(test_series)
0    name_ext
1         NaN
dtype: object
0          my name is n1
1          my name is n2
2          my name is n3
3          my name is n4
0    my name is name_ext
1                    NaN
dtype: object
from collections import Counter

# Start from an empty tally and record a single occurrence of 'bar'.
counter = Counter()
counter.update(['bar'])
counter
Counter({'bar': 1})
left.min()
ID         ID1
KEY         k1
NAME        n1
mailema      1
dtype: object
left.max()
ID         id4
KEY         k4
NAME        n4
mailema      1
dtype: object
left.sum()
ID         ID1id2ID3id4
KEY            k1k2k3k4
NAME           n1n2n3n4
mailema               4
dtype: object
left.cumsum()
ID KEY NAME mailema
0 ID1 k1 n1 1
1 ID1id2 k1k2 n1n2 2
2 ID1id2ID3 k1k2k3 n1n2n3 3
3 ID1id2ID3id4 k1k2k3k4 n1n2n3n4 4

pandas 常用操作矩阵运算

# 最值的位置
df = pd.DataFrame(np.random.random( (5, 10)), columns = list('abcdefghij')) # 简写列名
df
a b c d e f g h i j
0 0.920187 0.468342 0.251023 0.012371 0.410920 0.823567 0.644462 0.971567 0.363926 0.125066
1 0.388891 0.017500 0.400027 0.907302 0.747884 0.850026 0.871119 0.361581 0.313365 0.517795
2 0.858049 0.818444 0.429782 0.953707 0.362399 0.597594 0.583865 0.363218 0.671535 0.887737
3 0.981348 0.884835 0.333632 0.930583 0.990436 0.264681 0.349709 0.938117 0.729409 0.371021
4 0.409097 0.270208 0.093192 0.985581 0.389709 0.891675 0.551112 0.897221 0.573714 0.854950
df.idxmax()
a    3
b    3
c    2
d    4
e    3
f    4
g    1
h    0
i    3
j    2
dtype: int64
df*10
a b c d e f g h i j
0 9.201869 4.683421 2.510227 0.123709 4.109205 8.235670 6.444622 9.715667 3.639265 1.250659
1 3.888908 0.174995 4.000269 9.073024 7.478841 8.500256 8.711186 3.615811 3.133648 5.177951
2 8.580488 8.184435 4.297818 9.537071 3.623993 5.975941 5.838652 3.632183 6.715348 8.877372
3 9.813483 8.848354 3.336317 9.305834 9.904355 2.646808 3.497094 9.381173 7.294089 3.710212
4 4.090968 2.702080 0.931915 9.855806 3.897086 8.916751 5.511120 8.972206 5.737144 8.549502
df*df
a b c d e f g h i j
0 0.846744 0.219344 0.063012 0.000153 0.168856 0.678263 0.415332 0.943942 0.132442 0.015641
1 0.151236 0.000306 0.160022 0.823198 0.559331 0.722544 0.758848 0.130741 0.098197 0.268112
2 0.736248 0.669850 0.184712 0.909557 0.131333 0.357119 0.340899 0.131928 0.450959 0.788077
3 0.963045 0.782934 0.111310 0.865985 0.980962 0.070056 0.122297 0.880064 0.532037 0.137657
4 0.167360 0.073012 0.008685 0.971369 0.151873 0.795084 0.303724 0.805005 0.329148 0.730940
df.dot(df.T)
0 1 2 3 4
0 3.483729 2.576565 3.018221 3.486335 2.975670
1 2.576565 3.672533 3.474288 3.405148 3.571698
2 3.018221 3.474288 4.700682 4.478348 4.018175
3 3.486335 3.405148 4.478348 5.446347 3.980907
4 2.975670 3.571698 4.018175 3.980907 4.336201
df.T.dot(df)
a b c d e f g h i j
a 2.864633 2.118905 1.120862 2.498977 2.111316 2.225693 2.001426 2.633968 1.983464 1.792030
b 2.118905 1.745446 0.796707 1.891951 1.483817 1.364820 1.253284 1.831142 1.525968 1.353505
c 1.120862 0.796707 0.527741 1.178257 0.924835 0.975005 0.929213 0.941231 0.802141 0.823518
d 2.498977 1.891951 1.178257 3.570262 2.335035 2.476474 2.223776 2.443766 2.173484 2.505874
e 2.111316 1.483817 0.924835 2.335035 1.992355 1.800352 1.689049 2.080087 1.573283 1.461012
f 2.225693 1.364820 0.975005 2.476474 1.800352 2.623065 2.204120 2.372892 1.672019 1.934186
g 2.001426 1.253284 0.929213 2.223776 1.689049 2.204120 1.941099 1.975726 1.470863 1.650903
h 2.633968 1.831142 0.941231 2.443766 2.080087 2.372892 1.975726 2.891679 1.909819 1.746318
i 1.983464 1.525968 0.802141 2.173484 1.573283 1.672019 1.470863 1.909819 1.542785 1.565043
j 1.792030 1.353505 0.823518 2.505874 1.461012 1.934186 1.650903 1.746318 1.565043 1.940427

你可能感兴趣的:(数据分析)