(三篇长文让你玩6Pandas)数据分析入门_PART2常用工具包_CH02数据分析工具:Pandas__Part03(统计分析基础)

'''
【课程2.14】  数值计算和统计基础

常用数学、统计方法
 
'''
'\n【课程2.14】  数值计算和统计基础\n\n常用数学、统计方法\n \n'
# 基本参数:axis、skipna

import numpy as np
import pandas as pd

# Demo frame: key1 is all ints, key2 contains one missing value and key3
# mixes ints with strings.
df = pd.DataFrame(
    {
        'key1': [4, 5, 3, 1, 2],
        'key2': [1, 2, np.nan, 4, 5],
        'key3': [1, 2, 3, 'j', 'k'],
    },
    index=['a', 'b', 'c', 'd', 'e'],
)
# np.nan is a float, so a single NaN turns key2 into float64.
# A single str turns key3 into object dtype.


print(df)
print(df['key1'].dtype, type(df['key1']), df['key2'].dtype, df['key3'].dtype)
print('-------------')

# numeric_only=True is spelled out on every DataFrame.mean call below:
# pandas >= 2.0 no longer drops non-numeric columns silently, so a bare
# df.mean() raises TypeError because of the object column key3.
m1 = df.mean(numeric_only=True)
print(m1, type(m1))
print('单独统计一列:', df['key2'].mean())
print('-----')
# np.nan marks a missing value.
# .mean() computes the average over the numeric columns only.
# A single column can be averaged by selecting it first.

# axis: 0 (default) reduces down each column, 1 reduces across each row.
m2 = df.mean(axis=1, numeric_only=True)
print(m2)
print('-----')

m3 = df.mean(skipna=False, numeric_only=True)
print(m3)
print('-----')
# skipna: whether NaN is ignored (default True). With skipna=False a column
# that contains NaN still yields NaN.
   key1  key2 key3
a     4   1.0    1
b     5   2.0    2
c     3   NaN    3
d     1   4.0    j
e     2   5.0    k
int64  float64 object
-------------
key1    3.0
key2    3.0
dtype: float64 
单独统计一列: 3.0
-----
a    2.5
b    3.5
c    3.0
d    2.5
e    3.5
dtype: float64
-----
key1    3.0
key2    NaN
dtype: float64
-----
# Main descriptive statistics, available on both Series and DataFrame (1)
df = pd.DataFrame({
    'key1': np.arange(10),
    'key2': np.random.rand(10) * 10,
})
print(df)
print('-----')

# Number of non-NA values per column
print(df.count(), '→ count统计非Na值的数量\n')
# Column minima, then the maximum of key2 alone
print(df.min(), '→ min统计最小值\n', df['key2'].max(), '→ max统计最大值\n')
# Quantile; q selects the position (0.75 = upper quartile)
print(df.quantile(q=0.75), '→ quantile统计分位数,参数q确定位置\n')
print(df.sum(), '→ sum求和\n')
print(df.mean(), '→ mean求平均值\n')
# Median, i.e. the 50% quantile
print(df.median(), '→ median求算数中位数,50%分位数\n')
# Standard deviation followed by variance
print(df.std(), '\n', df.var(), '→ std,var分别求标准差,方差\n')
# Sample skewness and kurtosis
print(df.skew(), '→ skew样本的偏度\n')
print(df.kurt(), '→ kurt样本的峰度\n')
   key1      key2
0     0  9.921147
1     1  2.593068
2     2  9.755248
3     3  1.887166
4     4  8.889063
5     5  4.960097
6     6  8.753965
7     7  6.241893
8     8  5.947936
9     9  1.549023
-----
key1    10
key2    10
dtype: int64 → count统计非Na值的数量

key1    0.000000
key2    1.549023
dtype: float64 → min统计最小值
 9.921147426759397 → max统计最大值

key1    6.750000
key2    8.855288
Name: 0.75, dtype: float64 → quantile统计分位数,参数q确定位置

key1    45.000000
key2    60.498607
dtype: float64 → sum求和

key1    4.500000
key2    6.049861
dtype: float64 → mean求平均值

key1    4.500000
key2    6.094914
dtype: float64 → median求算数中位数,50%分位数

key1    3.027650
key2    3.246534
dtype: float64 
 key1     9.166667
key2    10.539981
dtype: float64 → std,var分别求标准差,方差

key1    0.000000
key2   -0.220597
dtype: float64 → skew样本的偏度

key1   -1.200000
key2   -1.615525
dtype: float64 → kurt样本的峰度
# Main calculation methods, available on both Series and DataFrame (2)
# cumsum / cumprod: running sum / running product, each stored as a new column.
# cummax / cummin: running maximum / running minimum.


df['key1_s'] = df['key1'].cumsum()
df['key2_s'] = df['key2'].cumsum()
print(df, '→ cumsum样本的累计和\n')

df['key1_p'] = df['key1'].cumprod()
df['key2_p'] = df['key2'].cumprod()
print(df, '→ cumprod样本的累计积\n')

df['key1_max'] = df['key1'].cummax()
df['key2_min'] = df['key2'].cummin()
# The label used to say "cumprod" (copied from the print above) although the
# columns added here come from cummax / cummin.
print(df, '→ cummax/cummin样本的累计max min\n')
   key1      key2  key1_s     key2_s
0     0  9.921147       0   9.921147
1     1  2.593068       1  12.514216
2     2  9.755248       3  22.269464
3     3  1.887166       6  24.156630
4     4  8.889063      10  33.045693
5     5  4.960097      15  38.005790
6     6  8.753965      21  46.759755
7     7  6.241893      28  53.001648
8     8  5.947936      36  58.949583
9     9  1.549023      45  60.498607 → cumsum样本的累计和

   key1      key2  key1_s     key2_s  key1_p        key2_p
0     0  9.921147       0   9.921147       0  9.921147e+00
1     1  2.593068       1  12.514216       0  2.572621e+01
2     2  9.755248       3  22.269464       0  2.509656e+02
3     3  1.887166       6  24.156630       0  4.736138e+02
4     4  8.889063      10  33.045693       0  4.209983e+03
5     5  4.960097      15  38.005790       0  2.088192e+04
6     6  8.753965      21  46.759755       0  1.827996e+05
7     7  6.241893      28  53.001648       0  1.141016e+06
8     8  5.947936      36  58.949583       0  6.786688e+06
9     9  1.549023      45  60.498607       0  1.051274e+07 → cumprod样本的累计积

   key1      key2  key1_s     key2_s  key1_p        key2_p  key1_max  key2_min
0     0  9.921147       0   9.921147       0  9.921147e+00         0  9.921147
1     1  2.593068       1  12.514216       0  2.572621e+01         1  2.593068
2     2  9.755248       3  22.269464       0  2.509656e+02         2  2.593068
3     3  1.887166       6  24.156630       0  4.736138e+02         3  1.887166
4     4  8.889063      10  33.045693       0  4.209983e+03         4  1.887166
5     5  4.960097      15  38.005790       0  2.088192e+04         5  1.887166
6     6  8.753965      21  46.759755       0  1.827996e+05         6  1.887166
7     7  6.241893      28  53.001648       0  1.141016e+06         7  1.887166
8     8  5.947936      36  58.949583       0  6.786688e+06         8  1.887166
9     9  1.549023      45  60.498607       0  1.051274e+07         9  1.549023 → cumprod样本的累计max min
# Distinct values: Series.unique()

s = pd.Series(list('asdvasdcfgg'))
sq = s.unique()
print(s)
print('-----------')
# unique() returns an array of the distinct values, not a Series
print(sq, type(sq))
print('-----------')
# Wrapping that array in pd.Series gives a new Series again
print(pd.Series(sq))
print('-----------')

# Sort the array of distinct values in place
sq.sort()
print(sq)
0     a
1     s
2     d
3     v
4     a
5     s
6     d
7     c
8     f
9     g
10    g
dtype: object
-----------
['a' 's' 'd' 'v' 'c' 'f' 'g'] 
-----------
0    a
1    s
2    d
3    v
4    c
5    f
6    g
dtype: object
-----------
['a' 'c' 'd' 'f' 'g' 's' 'v']
# Value counts: Series.value_counts()
# Returns a new Series holding how many times each distinct value occurs.
# sort controls ordering by count (default True); it is switched off here.
# Older pandas also offered the function form pd.value_counts(s, sort=False).
sc = s.value_counts(sort=False)
print(sc)
g    2
s    2
v    1
c    1
a    2
f    1
d    2
dtype: int64
# Membership test: isin()
s = pd.Series(np.arange(10, 15))
df = pd.DataFrame({
    'key1': list('asdcbvasd'),
    'key2': np.arange(4, 13),
})
print(s)
print(df)
print('-----')

# The candidates are passed as a list. The result is a boolean Series or
# DataFrame: True where the element is one of the candidates, else False.
print(s.isin([5, 14]))
print(df.isin(['a', 'bc', '10', 8]))
0    10
1    11
2    12
3    13
4    14
dtype: int32
  key1  key2
0    a     4
1    s     5
2    d     6
3    c     7
4    b     8
5    v     9
6    a    10
7    s    11
8    d    12
-----
0    False
1    False
2    False
3    False
4     True
dtype: bool
    key1   key2
0   True  False
1  False  False
2  False  False
3  False  False
4  False   True
5  False  False
6   True  False
7  False  False
8  False  False
# Homework answer

# Exercise 1: a 5x2 frame of random values scaled to [0, 100)
df = pd.DataFrame(data=np.random.rand(5, 2) * 100, columns=['key1', 'key2'])
print(df)
print('------------')

# Mean of key1
print('key1 mean', df['key1'].mean())

# Median of key1
print('key1 median', df['key1'].median())

# Running totals of both columns, stored as new columns
df['k1_cs'] = df['key1'].cumsum()
df['k2_cs'] = df['key2'].cumsum()
print(df)
        key1       key2
0  43.967395  21.959440
1  79.342955  15.192045
2  84.252771  55.023286
3  34.573707  24.446164
4  44.574339  21.046765
------------
key1 mean 57.34223325511027
key1 median 44.574338642560406
        key1       key2       k1_cs       k2_cs
0  43.967395  21.959440   43.967395   21.959440
1  79.342955  15.192045  123.310350   37.151485
2  84.252771  55.023286  207.563121   92.174771
3  34.573707  24.446164  242.136828  116.620935
4  44.574339  21.046765  286.711166  137.667700
def f(n):
    """Print '是' when every element of ``n`` is distinct, otherwise '否'.

    ``n`` must provide ``unique()`` and ``len()``; the file calls it with a
    pandas Series. Nothing is returned.
    """
    distinct = n.unique()
    verdict = '是' if len(distinct) == len(n) else '否'
    print(verdict)
# Read one line of comma-separated items and report whether all are distinct.
# The text is held in raw_text rather than a variable called `str`, so the
# built-in str type is not shadowed for the code that runs afterwards.
raw_text = input('请输入一组元素 English,隔开')
lst = raw_text.split(',')
s = pd.Series(lst)
f(s)
请输入一组元素 English,隔开1
是
'''
总结2.14
1 主要数学计算方法()里的两个参数axis、skipna
2主要数学计算方法count() mean()median()
3DF一列累计和/积/max/min
4判断series/DF元素唯一 series.unique()
5series/DF值计数series.value_counts()
6series/DF成员运算s.isin([5,14]) 产生布尔值(True/False)的series/DF
'''
'\n总结2.14\n1 主要数学计算方法()里的两个参数axis、skipna\n2主要数学计算方法count() mean()median()\n3DF一列累计和/积/max/min\n4判断series/DF元素唯一 series.unique()\n5series/DF值计数series.value_counts()\n6series/DF成员运算s.isin([5,14] 产生True值series/DF\n'
'''
【课程2.15】  文本数据

Pandas针对字符串配备的一套方法,使其易于对数组的每个元素进行操作
 
'''
'\n【课程2.15】  文本数据\n\nPandas针对字符串配备的一套方法,使其易于对数组的每个元素进行操作\n \n'
# The .str accessor works on a whole Series (a DataFrame column or row) or on
# an Index (index / columns) of strings rather than on one str; missing values
# are passed through as NaN.

s = pd.Series(['A', 'b', 'C', 'bbhello', '123', np.nan, 'hj'])
df = pd.DataFrame({
    'key1': list('abcdef'),
    'key2': ['hee', 'fv', 'w', 'hija', '123', np.nan],
})
print(s)
print(df)
print('-----')

# String methods are called through .str; here on a Series and on one
# DataFrame row (which is itself a Series). NaN entries stay NaN.
print(s.str.count('b'))
print(df.loc[1].str.upper())
print('-----')

# df.columns is an Index object, which exposes .str as well
df.columns = df.columns.str.upper()
print(df)
0          A
1          b
2          C
3    bbhello
4        123
5        NaN
6         hj
dtype: object
  key1  key2
0    a   hee
1    b    fv
2    c     w
3    d  hija
4    e   123
5    f   NaN
-----
0    0.0
1    1.0
2    0.0
3    2.0
4    0.0
5    NaN
6    0.0
dtype: float64
key1     B
key2    FV
Name: 1, dtype: object
-----
  KEY1  KEY2
0    a   hee
1    b    fv
2    c     w
3    d  hija
4    e   123
5    f   NaN
# Common string methods (1) - lower, upper, len, startswith, endswith
s = pd.Series(['A', 'b', 'bbhello', '123', np.nan])

print(s.str.lower(), '→ lower小写\n')
print(s.str.upper(), '→ upper大写\n')
print(s.str.len(), '→ len字符长度\n')
# The label used to say "a" although the prefix being tested is 'b'.
print(s.str.startswith('b'), '→ 判断起始是否为b\n')
print(s.str.endswith('3'), '→ 判断结束是否为3\n')
0          a
1          b
2    bbhello
3        123
4        NaN
dtype: object → lower小写

0          A
1          B
2    BBHELLO
3        123
4        NaN
dtype: object → upper大写

0    1.0
1    1.0
2    7.0
3    3.0
4    NaN
dtype: float64 → len字符长度

0    False
1     True
2     True
3    False
4      NaN
dtype: object → 判断起始是否为a

0    False
1    False
2    False
3     True
4      NaN
dtype: object → 判断结束是否为3
# Common string methods (2) - strip

s = pd.Series([' jack', 'jill ', ' jesse ', 'frank'])
df = pd.DataFrame(
    np.random.randn(3, 2),
    columns=[' Column A ', ' Column B '],
    index=range(3),
)
print(s)
print(df)
print('-----')

print(s.str.strip())   # drop whitespace on both ends
print(s.str.lstrip())  # drop whitespace on the left end only
print(s.str.rstrip())  # drop whitespace on the right end only

# Leading and trailing spaces of the column labels are removed; the space
# inside each label is kept.
df.columns = df.columns.str.strip()
print(df)
0       jack
1      jill 
2     jesse 
3      frank
dtype: object
    Column A    Column B 
0   -1.199912    2.442613
1    0.852647    0.806449
2    0.181956    0.640792
-----
0     jack
1     jill
2    jesse
3    frank
dtype: object
0      jack
1     jill 
2    jesse 
3     frank
dtype: object
0      jack
1      jill
2     jesse
3     frank
dtype: object
   Column A  Column B
0 -1.199912  2.442613
1  0.852647  0.806449
2  0.181956  0.640792
# Common string methods (3) - replace

df = pd.DataFrame(
    np.random.randn(3, 2),
    columns=[' Column A ', ' Column B '],
    index=range(3),
)
# Replace every space in the column labels with '-'
df.columns = df.columns.str.replace(' ', '-')
print(df)

# n limits how many occurrences are replaced: only the first '-' changes
df.columns = df.columns.str.replace('-', 'hehe', n=1)
print(df)
   -Column-A-  -Column-B-
0   -1.152294    1.906471
1    1.380503    0.301456
2    1.474525   -0.612946
   heheColumn-A-  heheColumn-B-
0      -1.152294       1.906471
1       1.380503       0.301456
2       1.474525      -0.612946
# Common string methods (4) - split, rsplit
s = pd.Series(['a,b,c', '1,2,3', ['a,,,c'], np.nan])
print(s.str.split(','))
print('-----')
# Like str.split, but applied to every string of the Series at once.
# Only str elements are split: the element ['a,,,c'] is a list, not a str,
# so its result is NaN, as is the result for the np.nan element.

print(s.str.split(',')[0])
print('-----')
# Plain indexing on the result picks the element labelled 0, which is a list

print(s.str.split(',').str[0])
print(s.str.split(',').str.get(1))
print('-----')
# A second .str with [] or get() reaches inside each list, producing a new
# Series made of the 0th / 1st item of every list

print(s.str.split(',', expand=True))
print(s.str.split(',', expand=True, n=1))
print(s.str.rsplit(',', expand=True, n=1))
print('-----')
# expand=True spreads the pieces over the columns of a DataFrame
# n caps the number of splits; n=1 gives two columns
# rsplit works like split but starts from the end of the string

df = pd.DataFrame({
    'key1': ['a,b,c', '1,2,3', [':,., ']],
    'key2': ['a-b-c', '1-2-3', [':-.- ']],
})
print(df)
print('-----')
# split applied to a single DataFrame column
print(df['key2'].str.split('-'))
0    [a, b, c]
1    [1, 2, 3]
2          NaN
3          NaN
dtype: object
-----
['a', 'b', 'c']
-----
0      a
1      1
2    NaN
3    NaN
dtype: object
0      b
1      2
2    NaN
3    NaN
dtype: object
-----
     0    1    2
0    a    b    c
1    1    2    3
2  NaN  NaN  NaN
3  NaN  NaN  NaN
     0    1
0    a  b,c
1    1  2,3
2  NaN  NaN
3  NaN  NaN
     0    1
0  a,b    c
1  1,2    3
2  NaN  NaN
3  NaN  NaN
-----
      key1     key2
0    a,b,c    a-b-c
1    1,2,3    1-2-3
2  [:,., ]  [:-.- ]
-----
0    [a, b, c]
1    [1, 2, 3]
2          NaN
Name: key2, dtype: object
# Indexing into strings through .str

s = pd.Series(['A', 'b', 'C', 'bbhello', '123', np.nan, 'hj'])
df = pd.DataFrame({
    'key1': list('abcdef'),
    'key2': ['hee', 'fv', 'w', 'hija', '123', np.nan],
})

print(s.str[0])           # first character of every string, as a new Series
print(s.str[:2])          # first two characters of every string, as a new Series
print(df['key2'].str[0])  # first character of every key2 value; df itself is not modified
0      A
1      b
2      C
3      b
4      1
5    NaN
6      h
dtype: object
0      A
1      b
2      C
3     bb
4     12
5    NaN
6     hj
dtype: object
0      h
1      f
2      w
3      h
4      1
5    NaN
Name: key2, dtype: object
# Homework answer
df = pd.DataFrame({
    'gender': ['  M  ', '  M  ', '  F  ', '  M  ', '  F  '],
    'name': ['jack', 'tom', 'marry', 'zack', 'heheda'],
    'score': ['90-90-90', '89-89-89', '90-90-90', '78-78-78', '60-60-60'],
})
print(df)
print('--------')

# Capitalise the first letter of every name
df['name'] = df['name'].str.capitalize()
print(df)
print('--------')

# Remove the padding around the gender codes
df['gender'] = df['gender'].str.strip()
print(df)
print('--------')

# Key point: understand the two levels of .str.
# Approach 1 - split, then take the i-th piece of every list with .str[i]:
#   df['math'] = df['score'].str.split('-').str[0]
#   df['english'] = df['score'].str.split('-').str[1]
#   df['art'] = df['score'].str.split('-').str[2]
# Approach 2 - expand into a DataFrame and pick one column per subject.
# (The earlier note used [0] for all three subjects, which would copy the
# first score into every column.)
#   df['math'] = df['score'].str.split('-', expand=True)[0]
#   df['english'] = df['score'].str.split('-', expand=True)[1]
#   df['art'] = df['score'].str.split('-', expand=True)[2]
# Approach 3 (used here) - expand once and assign all three columns together.
df[['math', 'english', 'art']] = df['score'].str.split('-', expand=True)
del df['score']
print(df)
  gender    name     score
0    M      jack  90-90-90
1    M       tom  89-89-89
2    F     marry  90-90-90
3    M      zack  78-78-78
4    F    heheda  60-60-60
--------
  gender    name     score
0    M      Jack  90-90-90
1    M       Tom  89-89-89
2    F     Marry  90-90-90
3    M      Zack  78-78-78
4    F    Heheda  60-60-60
--------
  gender    name     score
0      M    Jack  90-90-90
1      M     Tom  89-89-89
2      F   Marry  90-90-90
3      M    Zack  78-78-78
4      F  Heheda  60-60-60
--------
  gender    name math english art
0      M    Jack   90      90  90
1      M     Tom   89      89  89
2      F   Marry   90      90  90
3      M    Zack   78      78  78
4      F  Heheda   60      60  60
#总结2.15
'''
1可以对A.df[] df.loc[]行/列(本质series);B.series;C.df.index or columns进行str的操作
2注意str操作是对series一竖列的string进行操作,而不是对一个str进行操作
3特别注意split方法
3.1 series.str.split(',')[0]
是对split产生的新series(series of list)做普通索引,得到标签为0的那个元素(一个list)

3.2 如果想获得每个list的第0个子元素(单个string)组成的新series
需要两层str:series.str.split(',').str[0] 或 series.str.split(',').str.get(0)

3.3 series.str.split('-',expand=True,n=n0)将split的结果拆成n0+1列
注意作业三种写法

4注意str索引写法
series.str[:2] 得到series of string(原来string的前两个字母)
 

'''
"\n1可以对A.df[] df.loc[]行/列(本质series);B.series;C.df.index or columns进行str的操作\n2注意str操作是对series一竖列的string进行操作,而不是对一个str进行操作\n3特别注意split方法\n3.1 series.str.split[0] or series.str.get(0)\n是获得新产生的series of list(split拆出)的第0个list\n\n3.2 如果想获得series of list[0](单个string)\n需要两层str series.str.split(此时为list)\n\n3.3 series.str.split('-',expand=True,n=n0)将split的结果拆成n0+1列\n注意作业三种写法\n\n4注意str索引写法\nseries.str[:2] 得到series of string(原来string的前两个字母)\n \n\n"
'''
【课程2.16】  合并 merge、join

Pandas具有全功能的,高性能内存中连接操作,与SQL等关系数据库非常相似

pd.merge(left, right, how='inner', on=None, left_on=None, right_on=None,
         left_index=False, right_index=False, sort=True,
         suffixes=('_x', '_y'), copy=True, indicator=False)
 
'''
"\n【课程2.16】  合并 merge、join\n\nPandas具有全功能的,高性能内存中连接操作,与SQL等关系数据库非常相似\n\npd.merge(left, right, how='inner', on=None, left_on=None, right_on=None,\n         left_index=False, right_index=False, sort=True,\n         suffixes=('_x', '_y'), copy=True, indicator=False)\n \n"
# merge - comparable to Excel's VLOOKUP.
# df1..df4 used to be defined twice, verbatim, within this cell. They are now
# defined once and the printed output is unchanged.
df1 = pd.DataFrame({'key': ['K0', 'K1', 'K2', 'K3'],
                    'A': ['A0', 'A1', 'A2', 'A3'],
                    'B': ['B0', 'B1', 'B2', 'B3']})
df2 = pd.DataFrame({'key': ['K0', 'K1', 'K2', 'K3'],
                    'C': ['C0', 'C1', 'C2', 'C3'],
                    'D': ['D0', 'D1', 'D2', 'D3']})
df3 = pd.DataFrame({'key1': ['K0', 'K0', 'K1', 'K2'],
                    'key2': ['K0', 'K1', 'K0', 'K1'],
                    'A': ['A0', 'A1', 'A2', 'A3'],
                    'B': ['B0', 'B1', 'B2', 'B3']})
df4 = pd.DataFrame({'key1': ['K0', 'K1', 'K1', 'K2'],
                    'key2': ['K0', 'K0', 'K0', 'K0'],
                    'C': ['C0', 'C1', 'C2', 'C3'],
                    'D': ['D0', 'D1', 'D2', 'D3']})
print(pd.merge(df1, df2, on='key'))
print('--------')

# left:  the first frame
# right: the second frame
# on:    the column used as the join key
print(pd.merge(df1, df2, on='key'))
print('------')

# Several join keys at once
print(pd.merge(df3, df4, on=['key1', 'key2']))
  key   A   B   C   D
0  K0  A0  B0  C0  D0
1  K1  A1  B1  C1  D1
2  K2  A2  B2  C2  D2
3  K3  A3  B3  C3  D3
--------
  key   A   B   C   D
0  K0  A0  B0  C0  D0
1  K1  A1  B1  C1  D1
2  K2  A2  B2  C2  D2
3  K3  A3  B3  C3  D3
------
  key1 key2   A   B   C   D
0   K0   K0  A0  B0  C0  D0
1   K1   K0  A2  B2  C1  D1
2   K1   K0  A2  B2  C2  D2
# The how parameter selects the kind of join

# inner (default): only key combinations present in both frames
print(pd.merge(df3, df4, on=['key1', 'key2'], how='inner'))
print('------')

# outer: key combinations from either frame; missing cells become NaN
print(pd.merge(df3, df4, on=['key1', 'key2'], how='outer'))
print('------')

# left: keep the keys of df3; missing cells become NaN
print(pd.merge(df3, df4, on=['key1', 'key2'], how='left'))
print('------')

# right: keep the keys of df4; missing cells become NaN
print(pd.merge(df3, df4, on=['key1', 'key2'], how='right'))
  key1 key2   A   B   C   D
0   K0   K0  A0  B0  C0  D0
1   K1   K0  A2  B2  C1  D1
2   K1   K0  A2  B2  C2  D2
------
  key1 key2    A    B    C    D
0   K0   K0   A0   B0   C0   D0
1   K0   K1   A1   B1  NaN  NaN
2   K1   K0   A2   B2   C1   D1
3   K1   K0   A2   B2   C2   D2
4   K2   K1   A3   B3  NaN  NaN
5   K2   K0  NaN  NaN   C3   D3
------
  key1 key2   A   B    C    D
0   K0   K0  A0  B0   C0   D0
1   K0   K1  A1  B1  NaN  NaN
2   K1   K0  A2  B2   C1   D1
3   K1   K0  A2  B2   C2   D2
4   K2   K1  A3  B3  NaN  NaN
------
  key1 key2    A    B   C   D
0   K0   K0   A0   B0  C0  D0
1   K1   K0   A2   B2  C1  D1
2   K1   K0   A2   B2  C2  D2
3   K2   K0  NaN  NaN  C3  D3
# left_on, right_on, left_index, right_index: choose the key for each side
# separately when the two frames do not share one key column

df1 = pd.DataFrame({'lkey': list('bbacaab'), 'data1': range(7)})
df2 = pd.DataFrame({'rkey': list('abd'), 'date2': range(3)})
# df1 is keyed by 'lkey', df2 by 'rkey'
print(pd.merge(df1, df2, left_on='lkey', right_on='rkey'))
print('------')

df1 = pd.DataFrame({'key': list('abcdfeg'), 'data1': range(7)})
df2 = pd.DataFrame({'date2': range(100, 105)}, index=list('abcde'))
# df1 is keyed by column 'key', df2 by its index
print(pd.merge(df1, df2, left_on='key', right_index=True))
# left_index=True:  the first frame is keyed by its index (default False)
# right_index=True: the second frame is keyed by its index (default False)

# The four options can be combined freely:
# left_on + right_on, left_on + right_index,
# left_index + right_on, left_index + right_index
  lkey  data1 rkey  date2
0    b      0    b      1
1    b      1    b      1
2    b      6    b      1
3    a      2    a      0
4    a      4    a      0
5    a      5    a      0
------
  key  data1  date2
0   a      0    100
1   b      1    101
2   c      2    102
3   d      3    103
5   e      5    104
# The sort parameter

df1 = pd.DataFrame({'key': list('bbacaab'), 'data1': [1, 3, 2, 4, 5, 9, 7]})
df2 = pd.DataFrame({'key': list('abd'), 'date2': [11, 2, 33]})
x1 = pd.merge(df1, df2, on='key', how='outer')
x2 = pd.merge(df1, df2, on='key', sort=True, how='outer')
print(x1)
print(x2)
print('------')
# sort=True orders the merged frame by the join key. It is rarely needed,
# because the result can simply be sorted after the merge.

# The DataFrame's own sort_values / sort_index do the sorting afterwards
print(x2.sort_values('data1'))
  key  data1  date2
0   b    1.0    2.0
1   b    3.0    2.0
2   b    7.0    2.0
3   a    2.0   11.0
4   a    5.0   11.0
5   a    9.0   11.0
6   c    4.0    NaN
7   d    NaN   33.0
  key  data1  date2
0   a    2.0   11.0
1   a    5.0   11.0
2   a    9.0   11.0
3   b    1.0    2.0
4   b    3.0    2.0
5   b    7.0    2.0
6   c    4.0    NaN
7   d    NaN   33.0
------
  key  data1  date2
3   b    1.0    2.0
0   a    2.0   11.0
4   b    3.0    2.0
6   c    4.0    NaN
1   a    5.0   11.0
5   b    7.0    2.0
2   a    9.0   11.0
7   d    NaN   33.0
# DataFrame.join - combine frames directly on their index

left = pd.DataFrame(
    {'A': ['A0', 'A1', 'A2'], 'B': ['B0', 'B1', 'B2']},
    index=['K0', 'K1', 'K2'],
)
right = pd.DataFrame(
    {'C': ['C0', 'C2', 'C3'], 'D': ['D0', 'D2', 'D3']},
    index=['K0', 'K2', 'K3'],
)

print(left)
print(right)
print(left.join(right))
# The outer join below is the same as
# pd.merge(left, right, left_index=True, right_index=True, how='outer')
print(left.join(right, how='outer'))
print('---------')

df1 = pd.DataFrame({'key': list('bbacaab'), 'data1': [1, 3, 2, 4, 5, 9, 7]})
df2 = pd.DataFrame({'key': list('abd'), 'date2': [11, 2, 33]})
print(df1)
print(df2)
# Both frames own a 'key' column, so an index-on-index merge keeps both and
# tells them apart with the default suffixes _x / _y;
# suffixes=('_name1', '_name2') would change those suffixes.
print(pd.merge(df1, df2, left_index=True, right_index=True))
print(df1.join(df2['date2']))
print('-----')

left = pd.DataFrame({'A': ['A0', 'A1', 'A2', 'A3'],
                     'B': ['B0', 'B1', 'B2', 'B3'],
                     'key': ['K0', 'K1', 'K0', 'K1']})
right = pd.DataFrame({'C': ['C0', 'C1'], 'D': ['D0', 'D1']},
                     index=['K0', 'K1'])
print(left)
print(right)
# on names a column of the calling frame (left); right is still matched on
# its index. Same as
# pd.merge(left, right, left_on='key', right_index=True, how='left', sort=False)
print(left.join(right, on='key'))
     A   B
K0  A0  B0
K1  A1  B1
K2  A2  B2
     C   D
K0  C0  D0
K2  C2  D2
K3  C3  D3
     A   B    C    D
K0  A0  B0   C0   D0
K1  A1  B1  NaN  NaN
K2  A2  B2   C2   D2
      A    B    C    D
K0   A0   B0   C0   D0
K1   A1   B1  NaN  NaN
K2   A2   B2   C2   D2
K3  NaN  NaN   C3   D3
---------
  key  data1
0   b      1
1   b      3
2   a      2
3   c      4
4   a      5
5   a      9
6   b      7
  key  date2
0   a     11
1   b      2
2   d     33
  key_x  data1 key_y  date2
0     b      1     a     11
1     b      3     b      2
2     a      2     d     33
  key  data1  date2
0   b      1   11.0
1   b      3    2.0
2   a      2   33.0
3   c      4    NaN
4   a      5    NaN
5   a      9    NaN
6   b      7    NaN
-----
    A   B key
0  A0  B0  K0
1  A1  B1  K1
2  A2  B2  K0
3  A3  B3  K1
     C   D
K0  C0  D0
K1  C1  D1
    A   B key   C   D
0  A0  B0  K0  C0  D0
1  A1  B1  K1  C1  D1
2  A2  B2  K0  C0  D0
3  A3  B3  K1  C1  D1
# Homework answer
# Exercise 1: outer merge of two frames on their shared 'key' column
df1 = pd.DataFrame({
    'key': ['a', 'b', 'c'],
    'values1': list(np.random.rand(3)),
})
df2 = pd.DataFrame({
    'key': ['b', 'c', 'd'],
    'values2': list(np.random.rand(3)),
})
print('df1:\n', df1)
print('-----')
print('df2:\n', df2)
print('-----')
df3 = pd.merge(df1, df2, how='outer', on='key')
print('df3:\n', df3)
df1:
   key   values1
0   a  0.864474
1   b  0.788860
2   c  0.385425
-----
df2:
   key   values2
0   b  0.068776
1   c  0.620308
2   d  0.882386
-----
df3:
   key   values1   values2
0   a  0.864474       NaN
1   b  0.788860  0.068776
2   c  0.385425  0.620308
3   d       NaN  0.882386
# Exercise 2: outer merge that matches the two frames on their index
df1 = pd.DataFrame({
    'lkey': ['a', 'b', 'c'],
    'values1': list(np.random.rand(3)),
})
df2 = pd.DataFrame({
    'rkey': ['b', 'c', 'd'],
    'values2': list(np.random.rand(3)),
})
df3 = pd.merge(df1, df2, how='outer', left_index=True, right_index=True)
print('df1:\n', df1)
print('-----')
print('df2:\n', df2)
print('-----')
print('df3:\n', df3)
df1:
   lkey   values1
0    a  0.250315
1    b  0.672982
2    c  0.248125
-----
df2:
   rkey   values2
0    b  0.115970
1    c  0.181128
2    d  0.314268
-----
df3:
   lkey   values1 rkey   values2
0    a  0.250315    b  0.115970
1    b  0.672982    c  0.181128
2    c  0.248125    d  0.314268
# Merge a key column of df1 against the index of df2 (default inner join)
df1 = pd.DataFrame({
    'key': ['a', 'b', 'c'],
    'values1': list(np.random.rand(3)),
})
df2 = pd.DataFrame(
    {'values2': list(np.random.rand(3)), 'values3': list('567')},
    index=list('bcd'),
)
df3 = pd.merge(df1, df2, left_on='key', right_index=True)
print('df1:\n', df1)
print('-----')
print('df2:\n', df2)
print('-----')
print('df3:\n', df3)
df1:
   key   values1
0   a  0.433476
1   b  0.307434
2   c  0.546751
-----
df2:
     values2 values3
b  0.876000       5
c  0.629869       6
d  0.962068       7
-----
df3:
   key   values1   values2 values3
1   b  0.307434  0.876000       5
2   c  0.546751  0.629869       6
'''
总结2.16
1 pd.merge(left, right, how='inner', on=None, left_on=None, right_on=None,
         left_index=False, right_index=False, sort=True,
         suffixes=('_x', '_y'), copy=True, indicator=False)
 
记住参数套路:1.1 left right 表明需要合并的两个df
              1.2 how 表明合并方式 四种 inner(交) outer(并) left right  
              1.3 left_on, right_on, left_index, right_index可以相互组合:
              1.4 sort 将合并后的df按照合并标志排序,默认False(不排序)
              一般先合并好,然后用df.sort_values()/sort_index()方法
              1.5 suffixes 如果on='key',(即左右合并序列都是相同的‘key’)参数用来改变合并后的df 两个key的名字
              1.6 copy表示直接在原来的df上修改还是复制再改变,默认复制

2 df1.join(df2, on='key of df1', how='left')
              on:df1中用作键的列名,与df2的index对齐;不写则默认以两个df的index作为合并标志
              how:合并方式,默认'left'

'''
"\n总结2.16\n1 pd.merge(left, right, how='inner', on=None, left_on=None, right_on=None,\n         left_index=False, right_index=False, sort=True,\n         suffixes=('_x', '_y'), copy=True, indicator=False)\n \n记住参数套路:1.1 left right 表明需要合并的两个df\n              1.2 how 表明合并方式 四种 inner(交) outer(并) left right  \n              1.3 left_on, right_on, left_index, right_index可以相互组合:\n              1.4 sort 将合并后的df按照合并标志排序,默认false不适用\n              一般先合并好,然后用df.sort_values()/sort_index()方法\n              1.5 suffixes 如果on='key',(即左右合并序列都是相同的‘key’)参数用来改变合并后的df 两个key的名字\n              1.6 copy表示直接在原来的df上修改还是复制再改变,默认复制\n\n2 df1.join(df2,df3 =pd.merge(df1,df2,how='outer',on='key of df1'))\n              on:不写默认以两个df的index作为合并标志\n\n"
'''
【课程2.17】  连接与修补 concat、combine_first

连接 - 沿轴执行连接操作

pd.concat(objs, axis=0, join='outer', join_axes=None, ignore_index=False,
          keys=None, levels=None, names=None, verify_integrity=False,
          copy=True)
 
'''
"\n【课程2.17】  连接与修补 concat、combine_first\n\n连接 - 沿轴执行连接操作\n\npd.concat(objs, axis=0, join='outer', join_axes=None, ignore_index=False,\n          keys=None, levels=None, names=None, verify_integrity=False,\n          copy=True)\n \n"
# Concatenation 1: concat on Series

s1 = pd.Series([1, 2, 3])
s2 = pd.Series([2, 3, 4])
s3 = pd.Series([1, 2, 3], index=['a', 'c', 'h'])
s4 = pd.Series([2, 3, 4], index=['b', 'e', 'd'])
print(pd.concat([s1, s2]))
print(pd.concat([s3, s4]).sort_index())
print('------')
# The objects to concatenate are passed together in one list.
# axis=0 (default): the second Series is appended below the first.

# axis=1: the Series become side-by-side columns of a DataFrame over the union
# of their indexes, with NaN where a Series has no value.
# sort=True is explicit so the row labels stay sorted on every pandas version;
# relying on the implicit default raised a FutureWarning when this was written.
print(pd.concat([s3, s4], axis=1, sort=True))
print('-----')
0    1
1    2
2    3
0    2
1    3
2    4
dtype: int64
a    1
b    2
c    2
d    4
e    3
h    3
dtype: int64
------
     0    1
a  1.0  NaN
b  NaN  2.0
c  2.0  NaN
d  NaN  4.0
e  NaN  3.0
h  3.0  NaN
-----


C:\ProgramData\Anaconda3\lib\site-packages\ipykernel_launcher.py:13: FutureWarning: Sorting because non-concatenation axis is not aligned. A future version
of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.

To retain the current behavior and silence the warning, pass 'sort=True'.

  del sys.path[0]
# Concatenation 2: concat on DataFrames
df1 = pd.DataFrame({'lkey': ['a', 'b', 'c'],
                    'values1': list(np.random.rand(3))})
df2 = pd.DataFrame({'rkey': ['b', 'c', 'd'],
                    'values2': list(np.random.rand(3))},
                   index=list('xyz'))
# The two frames have different columns. sort=True keeps the result columns in
# sorted order (lkey, rkey, values1, values2) on every pandas version instead
# of depending on the implicit default, which raised a FutureWarning here.
df3 = pd.concat([df1, df2], axis=0, sort=True)
df4 = pd.concat([df1, df2], axis=1)

print(df1)
print('-------↑df1')
print(df2)
print('-------↑df2')
print(df3)
print('-------↑df3')
print(df4)
print('-------↑df4')

# Note on axis for DataFrames:
# each index label is one row, i.e. one record covering every attribute;
# each column is one attribute.
# axis=0 (the usual default) therefore changes the number of records,
# while axis=1 changes the set of attributes.
  lkey   values1
0    a  0.939059
1    b  0.885028
2    c  0.758734
-------↑df1
  rkey   values2
x    b  0.343632
y    c  0.082145
z    d  0.436147
-------↑df2
  lkey rkey   values1   values2
0    a  NaN  0.939059       NaN
1    b  NaN  0.885028       NaN
2    c  NaN  0.758734       NaN
x  NaN    b       NaN  0.343632
y  NaN    c       NaN  0.082145
z  NaN    d       NaN  0.436147
-------↑df3
  lkey   values1 rkey   values2
0    a  0.939059  NaN       NaN
1    b  0.885028  NaN       NaN
2    c  0.758734  NaN       NaN
x  NaN       NaN    b  0.343632
y  NaN       NaN    c  0.082145
z  NaN       NaN    d  0.436147
-------↑df4


C:\ProgramData\Anaconda3\lib\site-packages\ipykernel_launcher.py:7: FutureWarning: Sorting because non-concatenation axis is not aligned. A future version
of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.

To retain the current behavior and silence the warning, pass 'sort=True'.

  import sys
# concat: choosing which labels of the other axis are kept
s5 = pd.Series([1, 2, 3], index=['a', 'b', 'c'])
s6 = pd.Series([2, 3, 4], index=['b', 'c', 'd'])
print(s5)
print('------')
print(s6)
print('------')

# sort=True is explicit, so the result does not depend on the implicit default
# that raised a FutureWarning when this was written.
print(pd.concat([s5, s6], axis=1, sort=True))
print('------')

# join: 'outer' (default) keeps the union of the labels on the other axis,
# 'inner' keeps only their intersection.
print(pd.concat([s5, s6], axis=1, join='inner'))
print('------')

# Showing only chosen labels: the join_axes argument was removed in
# pandas 1.0, so the concatenated result is reindexed to those labels instead.
print(pd.concat([s5, s6], axis=1, sort=True).reindex(['a', 'b', 'd']))
a    1
b    2
c    3
dtype: int64
------
b    2
c    3
d    4
dtype: int64
------
     0    1
a  1.0  NaN
b  2.0  2.0
c  3.0  3.0
d  NaN  4.0
------
   0  1
b  2  2
c  3  3
------
     0    1
a  1.0  NaN
b  2.0  2.0
d  NaN  4.0


C:\ProgramData\Anaconda3\lib\site-packages\ipykernel_launcher.py:8: FutureWarning: Sorting because non-concatenation axis is not aligned. A future version
of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.

To retain the current behavior and silence the warning, pass 'sort=True'.
# The keys parameter: an outer index level for axis=0, column names for axis=1

# axis=0: keys become the outermost level of a hierarchical row index
sre = pd.concat([s5, s6], axis=0, keys=['one', 'two'])
print(sre, type(sre))
print(sre.index)
print('-----')

# axis=1: keys replace the default column names.
# sort=True is explicit so the row order does not depend on the implicit
# default that raised a FutureWarning when this was written.
sre = pd.concat([s5, s6], axis=1, keys=['one', 'two'], sort=True)
print(sre, type(sre))
one  a    1
     b    2
     c    3
two  b    2
     c    3
     d    4
dtype: int64 
MultiIndex(levels=[['one', 'two'], ['a', 'b', 'c', 'd']],
           codes=[[0, 0, 0, 1, 1, 1], [0, 1, 2, 1, 2, 3]])
-----
   one  two
a  1.0  NaN
b  2.0  2.0
c  3.0  3.0
d  NaN  4.0 


C:\ProgramData\Anaconda3\lib\site-packages\ipykernel_launcher.py:9: FutureWarning: Sorting because non-concatenation axis is not aligned. A future version
of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.

To retain the current behavior and silence the warning, pass 'sort=True'.

  if __name__ == '__main__':
# Patching NaN values: combine_first() and update()

df1 = pd.DataFrame([
    [np.nan, 3., 5.],
    [-4.6, np.nan, np.nan],
    [np.nan, 7., np.nan],
])
df2 = pd.DataFrame([[-42.6, np.nan, -8.2], [-5., 1.6, 4]], index=[1, 2])
print(df1)
print(df2)
print('--------')
# combine_first aligns on the index:
# 1. values already present in df1 are kept; NaN cells of df1 are filled from
#    the matching position of df2
# 2. index labels that exist only in df2 are added to the result
print(df1.combine_first(df2))
print('--------')

# update works in place and aligns on the index too: the non-NaN values of
# df2 overwrite df1, whether or not df1 already had a value there
df1.update(df2)
print(df1)
     0    1    2
0  NaN  3.0  5.0
1 -4.6  NaN  NaN
2  NaN  7.0  NaN
      0    1    2
1 -42.6  NaN -8.2
2  -5.0  1.6  4.0
--------
     0    1    2
0  NaN  3.0  5.0
1 -4.6  NaN -8.2
2 -5.0  7.0  4.0
--------
      0    1    2
0   NaN  3.0  5.0
1 -42.6  NaN -8.2
2  -5.0  1.6  4.0
# Homework answers
# Exercise 1: build two 4x2 random frames with disjoint row labels and stack
# them vertically.
df1 = pd.DataFrame(
    {'values1': np.random.rand(4).tolist(), 'values2': np.random.rand(4).tolist()},
    index=['a', 'b', 'c', 'd'],
)
df2 = pd.DataFrame(
    {'values1': np.random.rand(4).tolist(), 'values2': np.random.rand(4).tolist()},
    index=['e', 'f', 'g', 'h'],
)
# axis=0 is the default for concat: rows of df2 are appended below df1.
df3 = pd.concat([df1, df2])
print(df1, '-------↑df1', df2, '-------↑df2', df3, '-------↑df3', sep='\n')
    values1   values2
a  0.503692  0.986015
b  0.142672  0.176473
c  0.007865  0.222326
d  0.064909  0.328113
-------↑df1
    values1   values2
e  0.385201  0.392069
f  0.318505  0.506359
g  0.395767  0.880248
h  0.061653  0.402304
-------↑df2
    values1   values2
a  0.503692  0.986015
b  0.142672  0.176473
c  0.007865  0.222326
d  0.064909  0.328113
e  0.385201  0.392069
f  0.318505  0.506359
g  0.395767  0.880248
h  0.061653  0.402304
-------↑df3
# Exercise 2: create df1 with NaN holes and fill them from df2.
df1 = pd.DataFrame(np.random.rand(4,2),
                  index = list('abcd'),
                  columns = ['values1','values2'])
# Write through one .loc call on the frame itself.  The chained form
# df1['values1'].loc[...] = ... assigns into an intermediate Series and is not
# guaranteed to change df1 (it does not under pandas Copy-on-Write).
df1.loc[['b','c'], 'values1'] = np.nan
print('创建df1为:\n',df1,'\n------')

df2 = pd.DataFrame(np.arange(8).reshape(4,2),
                  index = list('abcd'),
                  columns = ['values1','values2'])
print('创建df2为:\n',df2,'\n------')

# update() overwrites df2 in place with every non-NaN cell of df1, so df2 ends
# up holding df1's values with df1's NaN holes filled by the original df2.
# Cast to float first so update() never depends on implicit int -> float
# upcasting of df2's columns.
df2 = df2.astype(float)
df2.update(df1)
print('df1修补后为:\n',df2,'\n------')

# Alternative: df1.combine_first(df2) yields the same table as a new frame
# without modifying df2.
创建df1为:
     values1   values2
a  0.445917  0.289403
b       NaN  0.013630
c       NaN  0.676349
d  0.292880  0.862718 
------
创建df2为:
    values1  values2
a        0        1
b        2        3
c        4        5
d        6        7 
------
df1修补后为:
     values1   values2
a  0.445917  0.289403
b  2.000000  0.013630
c  4.000000  0.676349
d  0.292880  0.862718 
------
'''
总结2.17
1 pd.concat(objs, axis=0, join='outer', join_axes=None, ignore_index=False,
          keys=None, levels=None, names=None, verify_integrity=False,
          copy=True)

1.1 concat是直接沿横/竖轴连接,merge是按照指定标志对其连接(只能横向)
1.2 objs用[df1,df2] list传递两个被连接对象
1.3 axis 默认=0变化横排数据组   =1时变化数列属性组
1.4join 指定axis以外的另一个没主动变化的轴如何变化 2种 inner(交集) outer(并集,默认)
1.5join_axes 指定axis以外的另一个没主动变化的轴如何保留
1.6keys axis=0时 双重索引(刚主动变化的数据组索引) axis=1时覆盖columns(刚主动变化的属性组名词)
1.7copy代表是否要复制变化

2  修补Nan值 
df1.combine_first(df2)
根据index,df1上存在的数值不变,df1上的NaN空值被df2响应位置数值替代
如果df2的index多于df1,则更新到df1上

3覆盖 
df1.update(df2)
 update,根据index位置,无论df1有没有值,df2所有非Nan的值覆盖df1
'''
"\n总结2.17\n1 pd.concat(objs, axis=0, join='outer', join_axes=None, ignore_index=False,\n          keys=None, levels=None, names=None, verify_integrity=False,\n          copy=True)\n\n1.1 concat是直接沿横/竖轴连接,merge是按照指定标志对其连接(只能横向)\n1.2 objs用[df1,df2] list传递两个被连接对象\n1.3 axis 默认=0变化横排数据组   =1时变化数列属性组\n1.4join 指定axis以外的另一个没主动变化的轴如何变化 4种 inner outer df1 df2\n1.5join_axes 指定axis以外的另一个没主动变化的轴如何保留\n1.6keys axis=0时 双重索引(刚主动变化的数据组索引) axis=1时覆盖columns(刚主动变化的属性组名词)\n1.7copy代表是否要复制变化\n\n2  修补Nan值 \ndf1.combine_first(df2)\n根据index,df1上存在的数值不变,df1上的NaN空值被df2响应位置数值替代\n如果df2的index多于df1,则更新到df1上\n\n3覆盖 \ndf1.update(df2)\n update,根据index位置,无论df1有没有值,df2所有非Nan的值覆盖df1\n"
'''
【课程2.18】  去重及替换

.duplicated / .replace
 
'''
'\n【课程2.18】  去重及替换\n\n.duplicated / .replace\n \n'
# De-duplicating a Series: Series.duplicated / Series.drop_duplicates
s = pd.Series([1] * 4 + [2] * 3 + [3, 4] + [5] * 4)
# duplicated() returns a boolean Series that is True for every repeat of a
# value already seen; negating it as a boolean mask keeps first occurrences.
print(s.duplicated(), s[~s.duplicated()], '-----', sep='\n')

# drop_duplicates() does the same in one call.  With inplace=True it would
# modify s directly and return None instead of a new Series.
s_re = s.drop_duplicates()
print(s_re, '-----', sep='\n')
0     False
1      True
2      True
3      True
4     False
5      True
6      True
7     False
8     False
9     False
10     True
11     True
12     True
dtype: bool
0    1
4    2
7    3
8    4
9    5
dtype: int64
-----
0    1
4    2
7    3
8    4
9    5
dtype: int64
-----
# De-duplicating a DataFrame: DataFrame.duplicated / drop_duplicates

df = pd.DataFrame({
    'key1': ['a', 'a', 3, 4, 5],
    'key2': ['a', 'a', 'b', 'b', 'c'],
})
print(df, '-----', sep='\n')

# For a DataFrame a row counts as a duplicate only when the values of *all*
# columns repeat an earlier row.
print(df.duplicated(), '-----', sep='\n')

# Both the negated boolean mask and drop_duplicates() keep first occurrences.
print(df.loc[~df.duplicated()], '-----', sep='\n')
print(df.drop_duplicates(), '-----', sep='\n')

# df['key2'] is a Series, so this is the plain Series.duplicated behaviour.
print(df['key2'].duplicated(), '-----', sep='\n')
  key1 key2
0    a    a
1    a    a
2    3    b
3    4    b
4    5    c
-----
0    False
1     True
2    False
3    False
4    False
dtype: bool
-----
  key1 key2
0    a    a
2    3    b
3    4    b
4    5    c
-----
  key1 key2
0    a    a
2    3    b
3    4    b
4    5    c
-----
0    False
1     True
2    False
3     True
4    False
Name: key2, dtype: bool
-----
# Replacing values in a Series/DataFrame: .replace
s = pd.Series(['a', 's', 'c', 'a', 'a', 'z', 's', 'd'])

# One source value -> one target value.
print(s.replace(to_replace='a', value=np.nan))

# Several source values -> the same target: pass the sources as a list.
print(s.replace(to_replace=['a', 's'], value=np.nan))

# Several source values -> different targets: pass a {source: target} dict.
print(s.replace({'a': 'hello world!', 's': 123}))
0    NaN
1      s
2      c
3    NaN
4    NaN
5      z
6      s
7      d
dtype: object
0    NaN
1    NaN
2      c
3    NaN
4    NaN
5      z
6    NaN
7      d
dtype: object
0    hello world!
1             123
2               c
3    hello world!
4    hello world!
5               z
6             123
7               d
dtype: object
'''
【课程2.19】  数据分组

分组统计 - groupby功能

① 根据某些条件将数据拆分成组
② 对每个组独立应用函数
③ 将结果合并到一个数据结构中

Dataframe在行(axis=0)或列(axis=1)上进行分组,将一个函数应用到各个分组并产生一个新值,然后函数执行结果被合并到最终的结果对象中。

df.groupby(by=None, axis=0, level=None, as_index=True, sort=True, group_keys=True, squeeze=False, **kwargs)
 
'''
'\n【课程2.19】  数据分组\n\n分组统计 - groupby功能\n\n① 根据某些条件将数据拆分成组\n② 对每个组独立应用函数\n③ 将结果合并到一个数据结构中\n\nDataframe在行(axis=0)或列(axis=1)上进行分组,将一个函数应用到各个分组并产生一个新值,然后函数执行结果被合并到最终的结果对象中。\n\ndf.groupby(by=None, axis=0, level=None, as_index=True, sort=True, group_keys=True, squeeze=False, **kwargs)\n \n'
# Grouping by a column: every group is a set of rows.

df = pd.DataFrame({'A' : ['foo', 'bar', 'foo', 'bar','foo', 'bar', 'foo', 'foo'],
                   'B' : ['one', 'one', 'two', 'three', 'two', 'two', 'one', 'three'],
                   'C' : np.random.randn(8),
                   'D' : np.random.randn(8)})
print(df)
print('------')

print(df.groupby('A'),type(df.groupby('A')))
print('------')
# groupby alone returns a DataFrameGroupBy object: an intermediate result on
# which nothing has been computed yet.

# Rows are split by the distinct values of A and mean() is applied per group.
# Column B holds strings; pandas >= 2.0 no longer drops such columns silently
# and raises instead, so restrict the mean to numeric columns explicitly.
a = df.groupby('A').mean(numeric_only=True)
print(a)
print('------')

# Grouping by two columns: each distinct (A, B) pair forms one group.
b = df.groupby(['A','B']).mean()
print(b)
print('------')

# Select column D after grouping so only D is averaged per group.
c = df.groupby('A')['D'].mean()
print(c)
     A      B         C         D
0  foo    one -0.626564  1.105908
1  bar    one -1.196017  0.550428
2  foo    two  0.117546  1.342129
3  bar  three -0.846058 -0.022835
4  foo    two  1.196126 -0.640600
5  bar    two  1.312649  0.241346
6  foo    one  0.133678 -0.921084
7  foo  three  0.493977 -0.775614
------
 
------
            C         D
A                      
bar -0.243142  0.256313
foo  0.262953  0.022148
------
                  C         D
A   B                        
bar one   -1.196017  0.550428
    three -0.846058 -0.022835
    two    1.312649  0.241346
foo one   -0.246443  0.092412
    three  0.493977 -0.775614
    two    0.656836  0.350764
------
A
bar    0.256313
foo    0.022148
Name: D, dtype: float64
# A GroupBy object is iterable.
df = pd.DataFrame({'X': list('ABAB'), 'Y': [1, 4, 3, 2]})
print(df)
print(df.groupby('X'), type(df.groupby('X')))
print('-----')

# Materialising the GroupBy as a list gives (key, sub-DataFrame) tuples: the
# first item is the group's value of X, the second the rows having that value.
print(list(df.groupby('X')))
print('-----')

# The same pairs can be unpacked directly in a for loop.
for n, g in df.groupby('X'):
    print(n, '-----', g, '-----', sep='\n')
   X  Y
0  A  1
1  B  4
2  A  3
3  B  2
 
-----
[('A',    X  Y
0  A  1
2  A  3), ('B',    X  Y
1  B  4
3  B  2)]
-----
A
-----
   X  Y
0  A  1
2  A  3
-----
B
-----
   X  Y
1  B  4
3  B  2
-----
# get_group(<key value>) returns the sub-DataFrame of a single group.
# Group by the scalar label 'X' instead of the one-element list ['X']: with a
# list, newer pandas versions expect tuple keys such as ('A',) here, so the
# scalar lookups below would warn or fail.
print(df.groupby('X').get_group('A'),'\n')
print(df.groupby('X').get_group('A'),'\n')

grouped = df.groupby('X')
# grouped is the intermediate DataFrameGroupBy object; iterating over it
# yields (key, sub-DataFrame) pairs.

print(grouped.groups)
print('-----')
# .groups is an attribute (no call parentheses): a dict-like mapping from each
# group key to the index labels of the rows belonging to that group.

print(grouped.groups['A'])
print('-----')
# Being a mapping, it supports lookup by key.

sz = grouped.size()
print(sz,type(sz))
print('-----')
# size() returns a Series indexed by group key whose values are the number of
# rows in each group.
   X  Y
0  A  1
2  A  3 

   X  Y
0  A  1
2  A  3 

{'A': Int64Index([0, 2], dtype='int64'), 'B': Int64Index([1, 3], dtype='int64')}
-----
Int64Index([0, 2], dtype='int64')
-----
X
A    2
B    2
dtype: int64 
-----
# Grouping by two columns at once: group keys become (A, B) tuples.
df = pd.DataFrame({
    'A': ['foo', 'bar', 'foo', 'bar', 'foo', 'bar', 'foo', 'foo'],
    'B': ['one', 'one', 'two', 'three', 'two', 'two', 'one', 'three'],
    'C': np.random.randn(8),
    'D': np.random.randn(8),
})
print(df, '-----', df.groupby(['A', 'B']).groups, '-----', sep='\n')
# A single group is looked up with the tuple of its key values.
print(df.groupby(['A', 'B']).groups[('foo', 'three')])

     A      B         C         D
0  foo    one  0.704837  0.314135
1  bar    one  0.968533  0.647802
2  foo    two  0.665623 -0.393522
3  bar  three  1.357061 -0.252542
4  foo    two -0.661338  0.233315
5  bar    two -0.168765  0.402762
6  foo    one -1.963668  0.486295
7  foo  three  0.154741  0.442135
-----
{('bar', 'one'): Int64Index([1], dtype='int64'), ('bar', 'three'): Int64Index([3], dtype='int64'), ('bar', 'two'): Int64Index([5], dtype='int64'), ('foo', 'one'): Int64Index([0, 6], dtype='int64'), ('foo', 'three'): Int64Index([7], dtype='int64'), ('foo', 'two'): Int64Index([2, 4], dtype='int64')}
-----
Int64Index([7], dtype='int64')
# Grouping the *columns* by their dtype: every group is a set of columns.
df = pd.DataFrame({'data1':np.random.rand(2),
                  'data2':np.random.rand(2),
                  'key1':['a','b'],
                  'key2':['one','two']})
print(df)
print('-----')
print(df.dtypes)
print('-----')

# DataFrame.groupby(..., axis=1) is deprecated.  Group the column labels by
# dtype instead and select each group's columns from df, which also keeps the
# original dtype of every column (transposing a mixed frame would not).
for n, cols in df.columns.to_series().groupby(df.dtypes):
    p = df[cols.tolist()]
    print(n)
    print('-----')
    print(p)
    print('-----')
# n is the dtype shared by the group, p the sub-DataFrame made of the columns
# with that dtype (compare groupby('A'), where each group is a set of rows).
      data1     data2 key1 key2
0  0.252462  0.444950    a  one
1  0.454087  0.091346    b  two
-----
data1    float64
data2    float64
key1      object
key2      object
dtype: object
-----
float64
-----
      data1     data2
0  0.252462  0.444950
1  0.454087  0.091346
-----
object
-----
  key1 key2
0    a  one
1    b  two
-----
# Grouping the columns through a dict or a Series: every group is a set of
# columns.
df = pd.DataFrame(np.arange(16).reshape(4,4),
                  columns = ['a','b','c','d'])
print(df)
print('-----')

# Keys are the existing column labels, values the group each one belongs to.
# Labels missing from the frame ('e') are ignored.  The name `mapping` is used
# instead of `map` so the builtin map() is not shadowed.
mapping = {'a':'one','b':'one','c':'two','d':'two','e':'three'}
# groupby(..., axis=1) is deprecated; transposing, grouping the rows and
# transposing back gives the same table.
by_m = df.T.groupby(mapping)
print(by_m.sum().T)
print('-----')

# A Series built from the dict groups identically.
s = pd.Series(mapping)
print(s)
print('-----')
by_s = df.T.groupby(s)
print(by_s.sum().T)
    a   b   c   d
0   0   1   2   3
1   4   5   6   7
2   8   9  10  11
3  12  13  14  15
-----
   one  two
0    1    5
1    9   13
2   17   21
3   25   29
-----
a      one
b      one
c      two
d      two
e    three
dtype: object
-----
   one  two
0    1    5
1    9   13
2   17   21
3   25   29
# Grouping with a function: the function is called on every label, and labels
# with the same return value form one group.

df = pd.DataFrame(np.arange(16).reshape(4,4),
                  columns = ['a','b','c','d'],
                 index = ['abc','bcd','aa','b'])
print(df)
print('-----')
# Rows grouped by the length of their index label.
print(df.groupby(len).sum())
print('-----')
# Columns grouped by the length of their label.  groupby(..., axis=1) is
# deprecated, so group the rows of the transpose and transpose back.
print(df.T.groupby(len).sum().T)
      a   b   c   d
abc   0   1   2   3
bcd   4   5   6   7
aa    8   9  10  11
b    12  13  14  15
-----
    a   b   c   d
1  12  13  14  15
2   8   9  10  11
3   4   6   8  10
-----
      1
abc   6
bcd  22
aa   38
b    54
# Aggregation methods available on a GroupBy object.
s = pd.Series([1, 2, 3, 10, 20, 30], index=[1, 2, 3] * 2)
# groupby(level=0) puts all entries sharing the same index label in one group.
grouped = s.groupby(level=0)
print(grouped)
# Each (method name, caption) pair is evaluated on the grouped Series and
# printed with its caption, in this order.
for method, label in (
    ('first', '→ first:非NaN的第一个值\n'),
    ('last', '→ last:非NaN的最后一个值\n'),
    ('sum', '→ sum:非NaN的和\n'),
    ('mean', '→ mean:非NaN的平均值\n'),
    ('median', '→ median:非NaN的算术中位数\n'),
    ('count', '→ count:非NaN的值\n'),
    ('min', '→ min、max:非NaN的最小值、最大值\n'),
    ('std', '→ std,var:非NaN的标准差和方差\n'),
    ('prod', '→ prod:非NaN的积\n'),
):
    print(getattr(grouped, method)(), label)

1    1
2    2
3    3
dtype: int64 → first:非NaN的第一个值

1    10
2    20
3    30
dtype: int64 → last:非NaN的最后一个值

1    11
2    22
3    33
dtype: int64 → sum:非NaN的和

1     5.5
2    11.0
3    16.5
dtype: float64 → mean:非NaN的平均值

1     5.5
2    11.0
3    16.5
dtype: float64 → median:非NaN的算术中位数

1    2
2    2
3    2
dtype: int64 → count:非NaN的值

1    1
2    2
3    3
dtype: int64 → min、max:非NaN的最小值、最大值

1     6.363961
2    12.727922
3    19.091883
dtype: float64 → std,var:非NaN的标准差和方差

1    10
2    40
3    90
dtype: int64 → prod:非NaN的积
# Several aggregations at once: agg() applies multiple functions per group.
df = pd.DataFrame({'a':[1,1,2,2],
                  'b':np.random.rand(4),
                  'c':np.random.rand(4),
                  'd':np.random.rand(4),})
print(df)
print('-----')

# The functions go in a list.  String names are used throughout: passing numpy
# callables such as np.sum to agg is deprecated in favour of the names.
print(df.groupby('a').agg(['mean', 'sum']))
print('-----')

# Renaming the outputs for a single column: the {'name': func} dict form on a
# SeriesGroupBy was removed in pandas 1.0; named aggregation (new_name=func)
# is its replacement.
print(df.groupby('a')['b'].agg(result1='mean', result2='sum'))
   a         b         c         d
0  1  0.897877  0.355885  0.462194
1  1  0.893098  0.235715  0.262658
2  2  0.933889  0.032109  0.129018
3  2  0.319884  0.345379  0.697470
-----
          b                   c                   d          
       mean       sum      mean       sum      mean       sum
a                                                            
1  0.895487  1.790975  0.295800  0.591600  0.362426  0.724852
2  0.626887  1.253773  0.188744  0.377489  0.413244  0.826488
-----
    result1   result2
a                    
1  0.895487  1.790975
2  0.626887  1.253773


C:\ProgramData\Anaconda3\lib\site-packages\ipykernel_launcher.py:14: FutureWarning: using a dict on a Series for aggregation
is deprecated and will be removed in a future version
# Homework answers (groupby)

df=pd.DataFrame({'A':['one','two','three','one','two','three','one','two'],
                 'B':list('hhhhffff'),
                 'C':list(range(10,26,2)),
                 'D':np.random.rand(8)*-1,
                 'E':np.random.rand(8)
                })
print(df)
print('-----')

# 1: mean of C and D per value of A.
ga1 = df.groupby('A')
print('C,D mean\n',ga1[['C','D']].mean())
print('-----')

# 2: sum of D and E per (A, B) pair.
gab = df.groupby(['A','B'])
print('D E sum\n',gab[['D','E']].sum())
print('-----')

# 3: the row labels that make up each group of A.
print(ga1.groups)
print('-----')

# 4: row-wise sum of the columns sharing a dtype.  groupby(..., axis=1) is
# deprecated, so the column labels are grouped by dtype and each group's
# columns are summed across; string columns are concatenated by the sum.
gtype = df.columns.to_series().groupby(df.dtypes)
print(pd.DataFrame({str(n): df[cols.tolist()].sum(axis=1) for n, cols in gtype}))
print('-----')

# 5: sum C and D into one column 'y'.  Only the mapped columns are transposed
# so the intermediate frame stays numeric.  `mapping` replaces the former name
# `map`, which shadowed the builtin.
mapping = {'C':'y','D':'y'}
gcdone = df[list(mapping)].T.groupby(mapping)
print(gcdone.sum().T)
print('-----')

# 6: several statistics per value of B.  The numeric columns are selected
# explicitly because 'mean' cannot be computed on the string column A.
gcb = df.groupby('B')
print(gcb[['C','D','E']].agg(['mean','sum','max','min']))
       A  B   C         D         E
0    one  h  10 -0.482264  0.943953
1    two  h  12 -0.801970  0.006297
2  three  h  14 -0.830659  0.146785
3    one  h  16 -0.472067  0.961370
4    two  f  18 -0.626824  0.264676
5  three  f  20 -0.897931  0.112642
6    one  f  22 -0.433889  0.282091
7    two  f  24 -0.124638  0.905271
-----
C,D mean
         C         D
A                  
one    16 -0.462740
three  17 -0.864295
two    18 -0.517811
-----
D E sum
                 D         E
A     B                    
one   f -0.433889  0.282091
      h -0.954331  1.905322
three f -0.897931  0.112642
      h -0.830659  0.146785
two   f -0.751462  1.169947
      h -0.801970  0.006297
-----
{'one': Int64Index([0, 3, 6], dtype='int64'), 'three': Int64Index([2, 5], dtype='int64'), 'two': Int64Index([1, 4, 7], dtype='int64')}
-----
   int64   float64  object
0     10  0.461689    oneh
1     12 -0.795673    twoh
2     14 -0.683875  threeh
3     16  0.489303    oneh
4     18 -0.362148    twof
5     20 -0.785289  threef
6     22 -0.151798    onef
7     24  0.780634    twof
-----
           y
0   9.517736
1  11.198030
2  13.169341
3  15.527933
4  17.373176
5  19.102069
6  21.566111
7  23.875362
-----
     C                     D                                       E  \
  mean sum max min      mean       sum       max       min      mean   
B                                                                      
f   21  84  24  18 -0.520821 -2.083282 -0.124638 -0.897931  0.391170   
h   13  52  16  10 -0.646740 -2.586960 -0.472067 -0.830659  0.514601   

                                 
        sum       max       min  
B                                
f  1.564681  0.905271  0.112642  
h  2.058404  0.961370  0.006297  
'''
总结2.19
1 groupby返回dataframegroupby结果两种读取方法

1.1 通过list(df.groupby('X'))
会得到listoftuple tuple里包含两个数据,
第一个是标签名,第二个是该标签名对应的子df

1.2 通过for语句迭代
for n,g in df.groupby('X'):
n为标签名,g是该标签名对应的子df

1.3 通过df.groupby('X').get_group('标签名')得到该标签名对应的子df

1.4 df.group().groups #没()
groups方法可以将分类的结果转化为dict(key:标签名 value:该标签名对应的行在原df的index位置以及数据类型)
注:既然是dict当然就可以[key]取值了
print(grouped.groups['A'])

1.5 df.group().size()
size返回一个series index为分类不同标签名 值为该标签名对应的个数




2 groupby得到的每个分组为行集合的情况
2.1 df.groupby('A')单个属性分组
2.2 df.groupby(['A','B'])多个属性分组
2.3 注 groupby()后只是返回dataframegroupby中间结果,没有计算,要df.groupby(['A']).mean()等方法才能计算分类结果
2.4 通过函数分组 -index长度
print(df.groupby(len).sum())





3 groupby得到的每个分组为列集合的情况
注;只要分出来结果是列集合 必须写groupby(axis=1)

3.1 for n,p in df.groupby(df.dtypes,axis=1):
即n为每列属性的不同数据类型 p为此数据类型对应的子df

3.2通过dict或series分组
by_m=df.groupby(map,axis=1)
by_m=df.groupby(s,axis=1)

3.3通过函数分组 -columns长度
print(df.groupby(len,axis=1).sum())




4 分组计算函数方法 看笔记有很多种,和np的统计方法大致相同
注
s = pd.Series([1,2,3,10,20,30],index=[1,2,3,1,2,3])
grouped=s.groupby(level=0)
# 唯一索引用.groupby(level=0),将同一个index的分为一组




5多函数计算:agg() 即分组后对每组进行多个操作
#多个操作写进list有两组写法 1 np.操作名 2‘操作名’
'''
"\n总结2.19\n1 groupby返回dataframegroupby结果两种读取方法\n\n1.1 通过list(df.group())\n会得到listoftuple tuple里包含两个数据,\n第一个是标签名,第二个是该标签名对应的子df\n\n1.2 通过for语句迭代\nfor n,g in df.groupby('X'):\nn为标签名,g是该标签名对应的子df\n\n1.3 通过df.group().getgroup(['标签名'])得到该标签名对应的子df\n\n1.4 df.group().groups #没()\ngroups方法可以将分类的结果转化为dict(key:标签名 value:该标签名对应的行在原df的index位置以及数据类型)\n注:既然是dict当然就可以[key]取值了\nprint(grouped.groups['A'])\n\n1.5 df.group().size()\nsize返回一个series index为分类不同标签名 值为该标签名对应的个数\n\n\n\n\n2 groupby得到的每个分组为行集合的情况\n2.1 df.groupby('A')单个属性分组\n2.2 df.groupby(['A'])多个属性分组\n2.3 注 groupby()后只是返回dataframegroupby中间结果,没有计算,要df.groupby(['A']).mean()等方法才能计算分类结果\n2.4 通过函数分组 -index长度\nprint(df.groupby(len).sum())\n\n\n\n\n\n3 groupby得到的每个分组为列集合的情况\n注;只要分出来结果是列集合 必须写groupby(axis=1)\n\n3.1 for n,p in df.groupby(df.dtypes,axis=1):\n即n为每列属性的不同数据类型 p为此数据类型对应的子df\n\n3.2通过dict或series分组\nby_m=df.groupby(map,axis=1)\nby_m=df.groupby(s,axis=1)\n\n3.3通过函数分组 -columns长度\nprint(df.groupby(len,axis=1).sum())\n\n\n\n\n4 分组计算函数方法 看笔记有很多种,和np的统计方法大致相同\n注\ns = pd.Series([1,2,3,10,20,30],index=[1,2,3,1,2,3])\ngrouped=s.groupby(level=0)\n# 唯一索引用.groupby(level=0),将同一个index的分为一组\n\n\n\n\n5多函数计算:agg() 即分组后对每组进行多个操作\n#多个操作写进list有两组写法 1 np.操作名 2‘操作名’\n"
'''
【课程2.20】  分组转换及一般性“拆分-应用-合并”

transform / apply
 
'''
'\n【课程2.20】  分组转换及一般性“拆分-应用-合并”\n\ntransform / apply\n \n'
# Broadcasting group statistics back to the original shape: transform
df = pd.DataFrame({'data1':np.random.rand(5),
                  'data2':np.random.rand(5),
                  'key1':list('aabba'),
                  'key2':['one','two','one','two','one']})
# key2 holds strings; pandas >= 2.0 raises when asked for the mean of such a
# column, so the numeric columns are selected explicitly.
k_mean = df.groupby('key1')[['data1','data2']].mean()

print(df)
print('-----')
print(k_mean)
# Task: attach the per-group means in k_mean to the rows of df.
# Approach 1: merge df's key1 column against k_mean's index.
print(pd.merge(df,k_mean,left_on='key1',right_index=True))
print('-----')

# Approach 2: transform, then concat or merge.
# transform evaluates the aggregation per group and returns the result with
# the same index as df (each row gets its own group's value), so it can be
# joined to df directly.
k_mean2 = df.groupby('key1')[['data1','data2']].transform('mean')
print(k_mean2)
print('-----')

# Approach 2 with concat (column names repeat in the result).
print(pd.concat([df,k_mean2],axis=1))
print('-----')

# Approach 2 with merge on the shared index.
print(pd.merge(df,k_mean2,left_index=True,right_index=True))
      data1     data2 key1 key2
0  0.749995  0.342717    a  one
1  0.822763  0.717885    a  two
2  0.543269  0.704744    b  one
3  0.423887  0.230721    b  two
4  0.470420  0.623854    a  one
-----
         data1     data2
key1                    
a     0.681060  0.561485
b     0.483578  0.467732
    data1_x   data2_x key1 key2   data1_y   data2_y
0  0.749995  0.342717    a  one  0.681060  0.561485
1  0.822763  0.717885    a  two  0.681060  0.561485
4  0.470420  0.623854    a  one  0.681060  0.561485
2  0.543269  0.704744    b  one  0.483578  0.467732
3  0.423887  0.230721    b  two  0.483578  0.467732
-----
      data1     data2
0  0.681060  0.561485
1  0.681060  0.561485
2  0.483578  0.467732
3  0.483578  0.467732
4  0.681060  0.561485
-----
      data1     data2 key1 key2     data1     data2
0  0.749995  0.342717    a  one  0.681060  0.561485
1  0.822763  0.717885    a  two  0.681060  0.561485
2  0.543269  0.704744    b  one  0.483578  0.467732
3  0.423887  0.230721    b  two  0.483578  0.467732
4  0.470420  0.623854    a  one  0.681060  0.561485
-----
    data1_x   data2_x key1 key2   data1_y   data2_y
0  0.749995  0.342717    a  one  0.681060  0.561485
1  0.822763  0.717885    a  two  0.681060  0.561485
2  0.543269  0.704744    b  one  0.483578  0.467732
3  0.423887  0.230721    b  two  0.483578  0.467732
4  0.470420  0.623854    a  one  0.681060  0.561485
# 一般化Groupby方法:apply 即使groupby使用自定义方法
df = pd.DataFrame({'data1':np.random.rand(5),
                  'data2':np.random.rand(5),
                  'key1':list('aabba'),
                  'key2':['one','two','one','two','one']})
print(df)
print('-----')

print(df.groupby('key1').apply(lambda x: x.describe()))
#apply使groupby后的分组直接执行()中的函数
#x.describe() 显示一组常用的统计量
print('-----')

def f1(d,n):
    return(d.sort_index().loc[:n])
#f1返回df sortindex后的前n行

def f2(d,k1):
    return (d[k1])
#f2返回df k1列
#注 用于apply的函数第一个参数必须是自身

print(df.groupby('key1').apply(f1,1))
print('-----')
print(df.groupby('key1').apply(f2,'key1'))

# 参数直接写在后面,也可以为.apply(f_df,n = 2))
      data1     data2 key1 key2
0  0.344980  0.415581    a  one
1  0.989660  0.130394    a  two
2  0.374878  0.102374    b  one
3  0.778417  0.300815    b  two
4  0.342451  0.932157    a  one
-----
               data1     data2
key1                          
a    count  3.000000  3.000000
     mean   0.559030  0.492711
     std    0.372939  0.406409
     min    0.342451  0.130394
     25%    0.343715  0.272987
     50%    0.344980  0.415581
     75%    0.667320  0.673869
     max    0.989660  0.932157
b    count  2.000000  2.000000
     mean   0.576647  0.201595
     std    0.285345  0.140319
     min    0.374878  0.102374
     25%    0.475763  0.151985
     50%    0.576647  0.201595
     75%    0.677532  0.251205
     max    0.778417  0.300815
-----
          data1     data2 key1 key2
key1                               
a    0  0.34498  0.415581    a  one
     1  0.98966  0.130394    a  two
-----
key1   
a     0    a
      1    a
      4    a
b     2    b
      3    b
Name: key1, dtype: object
#将groupby的运算方法如mean按照原来df的格式进行格式化 
#apply 即使groupby使用自定义方法
# Homework answers (transform): attach per-group sums to the original rows.
df = pd.DataFrame({
    'data1': np.random.rand(8),
    'data2': np.random.rand(8),
    'key': ['a', 'a', 'b', 'b', 'a', 'b', 'a', 'b'],
})
print(df, '-----', sep='\n')

k_sum = df.groupby('key').sum()
print(k_sum)

# Approach 1: merge the key column against the index of the grouped sums.
print(df.merge(k_sum, left_on='key', right_index=True))

# Approach 2.1: transform gives the sums in df's own row layout; concat them
# side by side.
t1 = df.groupby('key').transform('sum')
print(pd.concat([df, t1], axis=1))

# Approach 2.2: the same transformed sums, merged on the shared index.
t2 = df.groupby('key').transform('sum')
print(df.merge(t2, left_index=True, right_index=True))
      data1     data2 key
0  0.786537  0.171086   a
1  0.033188  0.246577   a
2  0.366801  0.198738   b
3  0.082880  0.923571   b
4  0.718297  0.035098   a
5  0.024293  0.446508   b
6  0.598162  0.796003   a
7  0.896416  0.530436   b
-----
        data1     data2
key                    
a    2.136183  1.248764
b    1.370391  2.099254
    data1_x   data2_x key   data1_y   data2_y
0  0.786537  0.171086   a  2.136183  1.248764
1  0.033188  0.246577   a  2.136183  1.248764
4  0.718297  0.035098   a  2.136183  1.248764
6  0.598162  0.796003   a  2.136183  1.248764
2  0.366801  0.198738   b  1.370391  2.099254
3  0.082880  0.923571   b  1.370391  2.099254
5  0.024293  0.446508   b  1.370391  2.099254
7  0.896416  0.530436   b  1.370391  2.099254
      data1     data2 key     data1     data2
0  0.786537  0.171086   a  2.136183  1.248764
1  0.033188  0.246577   a  2.136183  1.248764
2  0.366801  0.198738   b  1.370391  2.099254
3  0.082880  0.923571   b  1.370391  2.099254
4  0.718297  0.035098   a  2.136183  1.248764
5  0.024293  0.446508   b  1.370391  2.099254
6  0.598162  0.796003   a  2.136183  1.248764
7  0.896416  0.530436   b  1.370391  2.099254
    data1_x   data2_x key   data1_y   data2_y
0  0.786537  0.171086   a  2.136183  1.248764
1  0.033188  0.246577   a  2.136183  1.248764
2  0.366801  0.198738   b  1.370391  2.099254
3  0.082880  0.923571   b  1.370391  2.099254
4  0.718297  0.035098   a  2.136183  1.248764
5  0.024293  0.446508   b  1.370391  2.099254
6  0.598162  0.796003   a  2.136183  1.248764
7  0.896416  0.530436   b  1.370391  2.099254
'''
【课程2.21】  透视表及交叉表

类似excel数据透视 - pivot table / crosstab
 
'''
'\n【课程2.21】  透视表及交叉表\n\n类似excel数据透视 - pivot table / crosstab\n \n'
# Pivot tables: pivot_table
#
# pd.pivot_table(data, values=None, index=None, columns=None, aggfunc='mean',
#                fill_value=None, margins=False, dropna=True, margins_name='All')
#
# data:    the source DataFrame
# values:  the column the aggregation is applied to
# index:   column whose distinct values become the row labels of the table
# columns: column whose distinct values become the column labels of the table
# aggfunc: the table is matrix-like; each cell (row label i, column label j)
#          collects every `values` entry having that index value and that
#          columns value, and shows the result of aggfunc over that collection
#
# When `columns` is omitted (only data and index given) the result matches
# what groupby on the index column(s) would produce.
date = ['2017-5-1', '2017-5-2', '2017-5-3'] * 3
rng = pd.to_datetime(date)
df = pd.DataFrame({
    'date': rng,
    'key': list('abcdabcda'),
    'values': np.random.rand(9) * 10,
})
print(df, '-----', sep='\n')

# 'count' (or len) reports how many entries fall into each (date, key) cell.
print(df.pivot_table(values='values', index='date', columns='key', aggfunc='count'))
print('-----')

# index may list several columns, giving a multi-level row index; each row is
# then the aggregation over the entries sharing that (date, key) combination.
print(df.pivot_table(values='values', index=['date', 'key'], aggfunc='sum'))
        date key    values
0 2017-05-01   a  2.292729
1 2017-05-02   b  8.122388
2 2017-05-03   c  9.608781
3 2017-05-01   d  2.833724
4 2017-05-02   a  4.926847
5 2017-05-03   b  8.114626
6 2017-05-01   c  4.810743
7 2017-05-02   d  4.362093
8 2017-05-03   a  4.907056
-----
key           a    b    c    d
date                          
2017-05-01  1.0  NaN  1.0  1.0
2017-05-02  1.0  1.0  NaN  1.0
2017-05-03  1.0  1.0  1.0  NaN
-----
                  values
date       key          
2017-05-01 a    2.292729
           c    4.810743
           d    2.833724
2017-05-02 a    4.926847
           b    8.122388
           d    4.362093
2017-05-03 a    4.907056
           b    8.114626
           c    9.608781
# Cross tabulation: crosstab
# By default it counts how often each combination of factors occurs; passing
# values and aggfunc turns it into a pivot table.
df = pd.DataFrame({'A': [1, 2, 2, 2, 2],
                   'B': [3, 3, 4, 4, 4],
                   'C': [1, 1, np.nan, 1, 1]})
print(df)
print('-----')

# Default use (no values): counting.
# 1.1 Frequencies: the first argument supplies the row labels, the second the
# column labels.
print(pd.crosstab(df['A'],df['B']))
print('-----')

# 1.2 normalize=True reports relative frequencies instead of counts.
print(pd.crosstab(df['A'],df['B'],normalize=True))
print('-----')

# Use 2: with values and aggfunc, C is aggregated within each (A, B) cell.
# The aggregation is named by the string 'sum'; passing the numpy callable
# np.sum is deprecated by pandas in favour of the name.
print(pd.crosstab(df['A'],df['B'],values=df['C'],aggfunc='sum'))
print('-----')

# margins=True adds row/column subtotals labelled 'All'.
print(pd.crosstab(df['A'],df['B'],values=df['C'],aggfunc='sum',margins=True))

   A  B    C
0  1  3  1.0
1  2  3  1.0
2  2  4  NaN
3  2  4  1.0
4  2  4  1.0
-----
B  3  4
A      
1  1  0
2  1  3
-----
B    3    4
A          
1  0.2  0.0
2  0.2  0.6
-----
B    3    4
A          
1  1.0  NaN
2  1.0  2.0
-----
B      3    4  All
A                 
1    1.0  NaN  1.0
2    1.0  2.0  3.0
All  2.0  2.0  4.0
# Homework answers (pivot_table / crosstab)
df = pd.DataFrame({
    'A': ['one', 'two', 'three', 'one', 'two', 'three', 'one', 'two'],
    'B': list('hhhhffff'),
    'C': list(range(10, 26, 2)),
    'D': np.random.rand(8) * -1,
    'E': np.random.rand(8),
})
# 1: mean of C and D for each value of A.
r1 = df.pivot_table(index='A', values=['C', 'D'], aggfunc='mean')
print(r1, '-----', sep='\n')

# 2: sum and mean of D and E for each (A, B) pair.
r2 = df.pivot_table(index=['A', 'B'], values=['D', 'E'], aggfunc=['sum', 'mean'])
print(r2, '-----', sep='\n')

# 3: how often each (B, A) combination occurs.
r3 = pd.crosstab(df['B'], df['A'])
print(r3)
        C         D
A                  
one    16 -0.558799
three  17 -0.703424
two    18 -0.552824
-----
              sum                mean          
                D         E         D         E
A     B                                        
one   f -0.391410  0.035270 -0.391410  0.035270
      h -1.284988  1.052825 -0.642494  0.526412
three f -0.600380  0.938482 -0.600380  0.938482
      h -0.806469  0.456976 -0.806469  0.456976
two   f -1.158324  0.615345 -0.579162  0.307672
      h -0.500147  0.211486 -0.500147  0.211486
-----
A  one  three  two
B                 
f    1      1    2
h    2      1    1
'''
总结2.21

1 透视表:pivot_table
pd.pivot_table(data, values=None, index=None, columns=None, aggfunc='mean', fill_value=None, margins=False, dropna=True, margins_name='All')

2 交叉表:crosstab
默认用于计算每个因子出现的频率,传递参数values和aggfun后变成数据透视表

'''
"\n总结2.21\n\n1 透视表:pivot_table\npd.pivot_table(data, values=None, index=None, columns=None, aggfunc='mean', fill_value=None, margins=False, dropna=True, margins_name='All')\n\n2 交叉表:crosstab\n默认用于计算每个因子出现的频率,传递参数values和aggfun后变成数据透视表\n\n"
'''
【课程2.22】  数据读取

核心:read_table, read_csv, read_excel
 
'''
'\n【课程2.22】  数据读取\n\n核心:read_table, read_csv, read_excel\n \n'

'''
va1,va2,va3,va4
1,2,3,4
2,3,4,5
3,4,5,6
4,5,6,7

'''


import os
# Switch to the directory holding the course data files so the relative file
# names below resolve.  NOTE(review): absolute path specific to the author's
# Windows machine; adjust before running elsewhere.
os.chdir(r'E:\软件自学\数据分析--网易微专业\CLASSDATA_ch03重点工具掌握:数据解析核心技巧\CH02数据分析工具:Pandas')


data1=pd.read_table('data1.txt',delimiter=',')
print(data1)
print('-----')

# delimiter: the character that separates fields

data2=pd.read_table('data1.txt',delimiter=',',header=0,index_col=1,engine='python',encoding='gbk')
print(data2)
print('-----')
# header: row number that holds the column names (0 = first line)
# index_col: position of the column to pull out and use as the index
# engine: 'c' or 'python'; the C engine is faster, the Python engine is more
#         feature-complete
# encoding: for files containing Chinese text the author uses 'gbk' or 'utf8'
   va1  va2  va3  va4
0    1    2    3    4
1    2    3    4    5
2    3    4    5    6
3    4    5    6    7
-----
     va1  va3  va4
va2               
2      1    3    4
3      2    4    5
4      3    5    6
5      4    6    7
-----


C:\ProgramData\Anaconda3\lib\site-packages\ipykernel_launcher.py:16: FutureWarning: read_table is deprecated, use read_csv instead.
  app.launch_new_instance()
C:\ProgramData\Anaconda3\lib\site-packages\ipykernel_launcher.py:22: FutureWarning: read_table is deprecated, use read_csv instead.
# Reading CSV data: read_csv
# When exporting a CSV from Excel, note the encoding used ('gbk' or 'utf8').
# A CSV file is simply a text file whose delimiter is ','.
# The author's usual workflow is to export Excel sheets to CSV and read that,
# because CSV loads much faster than xlsx.
data3=pd.read_csv('data2.csv',engine = 'python')
print(data3)
   va1  va2  va3  va4
0    1    2    3    4
1    2    3    4    5
2    3    4    5    6
3    4    5    6    7
# Reading Excel data: read_excel
# The keyword is `sheet_name`; the old spelling `sheetname` was removed from
# pandas and is rejected as an unexpected keyword argument.

data4=pd.read_excel('地市级党委书记数据库(2000-10).xlsx',sheet_name='中国人民共和国地市级党委书记数据库(2000-10)',header=0)
print(data4)
print('-----')

data5=pd.read_excel('地市级党委书记数据库(2000-10).xlsx',sheet_name=None,header=0)
print(data5)
print('-----')
# io: path of the workbook

# sheet_name
# 1. an int (position) or a str (sheet name) returns a single DataFrame
# 2. a list such as [0, 1], or None (all sheets), returns a dict of DataFrames

# header: row holding the column names, default 0 (the first row)
# index_col: column to use as the index; may be given by position or by name
      省级政区代码    省级政区名称  地市级政区代码   地市级政区名称    年份 党委书记姓名  出生年份  出生月份  籍贯省份代码  \
0     130000       河北省   130100      石家庄市  2000    陈来立   NaN   NaN     NaN   
1     130000       河北省   130100      石家庄市  2001    吴振华   NaN   NaN     NaN   
2     130000       河北省   130100      石家庄市  2002    吴振华   NaN   NaN     NaN   
3     130000       河北省   130100      石家庄市  2003    吴振华   NaN   NaN     NaN   
4     130000       河北省   130100      石家庄市  2004    吴振华   NaN   NaN     NaN   
5     130000       河北省   130100      石家庄市  2005    吴振华   NaN   NaN     NaN   
6     130000       河北省   130100      石家庄市  2006    吴振华   NaN   NaN     NaN   
7     130000       河北省   130100      石家庄市  2007    吴显国   NaN   NaN     NaN   
8     130000       河北省   130100      石家庄市  2008    吴显国   NaN   NaN     NaN   
9     130000       河北省   130100      石家庄市  2009     车俊   NaN   NaN     NaN   
10    130000       河北省   130100      石家庄市  2010    孙瑞彬   NaN   NaN     NaN   
11    130000       河北省   130200       唐山市  2000    白润璋   NaN   NaN     NaN   
12    130000       河北省   130200       唐山市  2001    白润璋   NaN   NaN     NaN   
13    130000       河北省   130200       唐山市  2002    白润璋   NaN   NaN     NaN   
14    130000       河北省   130200       唐山市  2003     张和   NaN   NaN     NaN   
15    130000       河北省   130200       唐山市  2004     张和   NaN   NaN     NaN   
16    130000       河北省   130200       唐山市  2005     张和   NaN   NaN     NaN   
17    130000       河北省   130200       唐山市  2006     张和   NaN   NaN     NaN   
18    130000       河北省   130200       唐山市  2007     赵勇   NaN   NaN     NaN   
19    130000       河北省   130200       唐山市  2008     赵勇   NaN   NaN     NaN   
20    130000       河北省   130200       唐山市  2009     赵勇   NaN   NaN     NaN   
21    130000       河北省   130200       唐山市  2010     赵勇   NaN   NaN     NaN   
22    130000       河北省   130300      秦皇岛市  2000    王建忠   NaN   NaN     NaN   
23    130000       河北省   130300      秦皇岛市  2001    王建忠   NaN   NaN     NaN   
24    130000       河北省   130300      秦皇岛市  2002    王建忠   NaN   NaN     NaN   
25    130000       河北省   130300      秦皇岛市  2003    宋长瑞   NaN   NaN     NaN   
26    130000       河北省   130300      秦皇岛市  2004    宋长瑞   NaN   NaN     NaN   
27    130000       河北省   130300      秦皇岛市  2005    宋长瑞   NaN   NaN     NaN   
28    130000       河北省   130300      秦皇岛市  2006    宋长瑞   NaN   NaN     NaN   
29    130000       河北省   130300      秦皇岛市  2007    王三堂   NaN   NaN     NaN   
...      ...       ...      ...       ...   ...    ...   ...   ...     ...   
3633  650000  新疆维吾尔自治区   654000  伊犁哈萨克自治州  2003    NaN   NaN   NaN     NaN   
3634  650000  新疆维吾尔自治区   654000  伊犁哈萨克自治州  2004    NaN   NaN   NaN     NaN   
3635  650000  新疆维吾尔自治区   654000  伊犁哈萨克自治州  2005    NaN   NaN   NaN     NaN   
3636  650000  新疆维吾尔自治区   654000  伊犁哈萨克自治州  2006    NaN   NaN   NaN     NaN   
3637  650000  新疆维吾尔自治区   654000  伊犁哈萨克自治州  2007    NaN   NaN   NaN     NaN   
3638  650000  新疆维吾尔自治区   654000  伊犁哈萨克自治州  2008    NaN   NaN   NaN     NaN   
3639  650000  新疆维吾尔自治区   654000  伊犁哈萨克自治州  2009    NaN   NaN   NaN     NaN   
3640  650000  新疆维吾尔自治区   654000  伊犁哈萨克自治州  2010    NaN   NaN   NaN     NaN   
3641  650000  新疆维吾尔自治区   654200      塔城地区  2000    NaN   NaN   NaN     NaN   
3642  650000  新疆维吾尔自治区   654200      塔城地区  2001    NaN   NaN   NaN     NaN   
3643  650000  新疆维吾尔自治区   654200      塔城地区  2002    NaN   NaN   NaN     NaN   
3644  650000  新疆维吾尔自治区   654200      塔城地区  2003    NaN   NaN   NaN     NaN   
3645  650000  新疆维吾尔自治区   654200      塔城地区  2004    NaN   NaN   NaN     NaN   
3646  650000  新疆维吾尔自治区   654200      塔城地区  2005    NaN   NaN   NaN     NaN   
3647  650000  新疆维吾尔自治区   654200      塔城地区  2006    NaN   NaN   NaN     NaN   
3648  650000  新疆维吾尔自治区   654200      塔城地区  2007    NaN   NaN   NaN     NaN   
3649  650000  新疆维吾尔自治区   654200      塔城地区  2008    NaN   NaN   NaN     NaN   
3650  650000  新疆维吾尔自治区   654200      塔城地区  2009    NaN   NaN   NaN     NaN   
3651  650000  新疆维吾尔自治区   654200      塔城地区  2010    NaN   NaN   NaN     NaN   
3652  650000  新疆维吾尔自治区   654300     阿勒泰地区  2000    NaN   NaN   NaN     NaN   
3653  650000  新疆维吾尔自治区   654300     阿勒泰地区  2001    NaN   NaN   NaN     NaN   
3654  650000  新疆维吾尔自治区   654300     阿勒泰地区  2002    NaN   NaN   NaN     NaN   
3655  650000  新疆维吾尔自治区   654300     阿勒泰地区  2003    NaN   NaN   NaN     NaN   
3656  650000  新疆维吾尔自治区   654300     阿勒泰地区  2004    NaN   NaN   NaN     NaN   
3657  650000  新疆维吾尔自治区   654300     阿勒泰地区  2005    NaN   NaN   NaN     NaN   
3658  650000  新疆维吾尔自治区   654300     阿勒泰地区  2006    NaN   NaN   NaN     NaN   
3659  650000  新疆维吾尔自治区   654300     阿勒泰地区  2007    NaN   NaN   NaN     NaN   
3660  650000  新疆维吾尔自治区   654300     阿勒泰地区  2008    NaN   NaN   NaN     NaN   
3661  650000  新疆维吾尔自治区   654300     阿勒泰地区  2009    NaN   NaN   NaN     NaN   
3662  650000  新疆维吾尔自治区   654300     阿勒泰地区  2010    NaN   NaN   NaN     NaN   

     籍贯省份名称  ...   民族   教育 是否是党校教育(是=1,否=0) 专业:人文 专业:社科  专业:理工  专业:农科  专业:医科  \
0       NaN  ...  NaN   硕士              1.0   NaN   NaN    NaN    NaN    NaN   
1       NaN  ...  NaN   本科              0.0   0.0   0.0    1.0    0.0    0.0   
2       NaN  ...  NaN   本科              0.0   0.0   0.0    1.0    0.0    0.0   
3       NaN  ...  NaN   本科              0.0   0.0   0.0    1.0    0.0    0.0   
4       NaN  ...  NaN   本科              0.0   0.0   0.0    1.0    0.0    0.0   
5       NaN  ...  NaN   本科              0.0   0.0   0.0    1.0    0.0    0.0   
6       NaN  ...  NaN   本科              0.0   0.0   0.0    1.0    0.0    0.0   
7       NaN  ...  NaN   硕士              1.0   0.0   1.0    0.0    0.0    0.0   
8       NaN  ...  NaN   硕士              1.0   0.0   1.0    0.0    0.0    0.0   
9       NaN  ...  NaN   本科              1.0   0.0   1.0    0.0    0.0    0.0   
10      NaN  ...  NaN   硕士              1.0   0.0   1.0    0.0    0.0    0.0   
11      NaN  ...  NaN   本科              0.0   0.0   0.0    1.0    0.0    0.0   
12      NaN  ...  NaN   本科              0.0   0.0   0.0    1.0    0.0    0.0   
13      NaN  ...  NaN   本科              0.0   0.0   0.0    1.0    0.0    0.0   
14      NaN  ...  NaN   本科              0.0   0.0   0.0    1.0    0.0    0.0   
15      NaN  ...  NaN   本科              0.0   0.0   0.0    1.0    0.0    0.0   
16      NaN  ...  NaN   本科              0.0   0.0   0.0    1.0    0.0    0.0   
17      NaN  ...  NaN   本科              0.0   0.0   0.0    1.0    0.0    0.0   
18      NaN  ...  NaN   博士              0.0   0.0   1.0    0.0    0.0    0.0   
19      NaN  ...  NaN   博士              0.0   0.0   1.0    0.0    0.0    0.0   
20      NaN  ...  NaN   博士              0.0   0.0   1.0    0.0    0.0    0.0   
21      NaN  ...  NaN   博士              0.0   0.0   1.0    0.0    0.0    0.0   
22      NaN  ...  NaN   本科              0.0   0.0   0.0    1.0    0.0    0.0   
23      NaN  ...  NaN   本科              0.0   0.0   0.0    1.0    0.0    0.0   
24      NaN  ...  NaN   本科              0.0   0.0   0.0    1.0    0.0    0.0   
25      NaN  ...  NaN   硕士              1.0   0.0   1.0    0.0    0.0    0.0   
26      NaN  ...  NaN   硕士              1.0   0.0   1.0    0.0    0.0    0.0   
27      NaN  ...  NaN   硕士              1.0   0.0   1.0    0.0    0.0    0.0   
28      NaN  ...  NaN   硕士              1.0   0.0   1.0    0.0    0.0    0.0   
29      NaN  ...  NaN   硕士              1.0   0.0   1.0    0.0    0.0    0.0   
...     ...  ...  ...  ...              ...   ...   ...    ...    ...    ...   
3633    NaN  ...  NaN  NaN              NaN   NaN   NaN    NaN    NaN    NaN   
3634    NaN  ...  NaN  NaN              NaN   NaN   NaN    NaN    NaN    NaN   
3635    NaN  ...  NaN  NaN              NaN   NaN   NaN    NaN    NaN    NaN   
3636    NaN  ...  NaN  NaN              NaN   NaN   NaN    NaN    NaN    NaN   
3637    NaN  ...  NaN  NaN              NaN   NaN   NaN    NaN    NaN    NaN   
3638    NaN  ...  NaN  NaN              NaN   NaN   NaN    NaN    NaN    NaN   
3639    NaN  ...  NaN  NaN              NaN   NaN   NaN    NaN    NaN    NaN   
3640    NaN  ...  NaN  NaN              NaN   NaN   NaN    NaN    NaN    NaN   
3641    NaN  ...  NaN  NaN              NaN   NaN   NaN    NaN    NaN    NaN   
3642    NaN  ...  NaN  NaN              NaN   NaN   NaN    NaN    NaN    NaN   
3643    NaN  ...  NaN  NaN              NaN   NaN   NaN    NaN    NaN    NaN   
3644    NaN  ...  NaN  NaN              NaN   NaN   NaN    NaN    NaN    NaN   
3645    NaN  ...  NaN  NaN              NaN   NaN   NaN    NaN    NaN    NaN   
3646    NaN  ...  NaN  NaN              NaN   NaN   NaN    NaN    NaN    NaN   
3647    NaN  ...  NaN  NaN              NaN   NaN   NaN    NaN    NaN    NaN   
3648    NaN  ...  NaN  NaN              NaN   NaN   NaN    NaN    NaN    NaN   
3649    NaN  ...  NaN  NaN              NaN   NaN   NaN    NaN    NaN    NaN   
3650    NaN  ...  NaN  NaN              NaN   NaN   NaN    NaN    NaN    NaN   
3651    NaN  ...  NaN  NaN              NaN   NaN   NaN    NaN    NaN    NaN   
3652    NaN  ...  NaN  NaN              NaN   NaN   NaN    NaN    NaN    NaN   
3653    NaN  ...  NaN  NaN              NaN   NaN   NaN    NaN    NaN    NaN   
3654    NaN  ...  NaN  NaN              NaN   NaN   NaN    NaN    NaN    NaN   
3655    NaN  ...  NaN  NaN              NaN   NaN   NaN    NaN    NaN    NaN   
3656    NaN  ...  NaN  NaN              NaN   NaN   NaN    NaN    NaN    NaN   
3657    NaN  ...  NaN  NaN              NaN   NaN   NaN    NaN    NaN    NaN   
3658    NaN  ...  NaN  NaN              NaN   NaN   NaN    NaN    NaN    NaN   
3659    NaN  ...  NaN  NaN              NaN   NaN   NaN    NaN    NaN    NaN   
3660    NaN  ...  NaN  NaN              NaN   NaN   NaN    NaN    NaN    NaN   
3661    NaN  ...  NaN  NaN              NaN   NaN   NaN    NaN    NaN    NaN   
3662    NaN  ...  NaN  NaN              NaN   NaN   NaN    NaN    NaN    NaN   

      入党年份  工作年份  
0      NaN   NaN  
1      NaN   NaN  
2      NaN   NaN  
3      NaN   NaN  
4      NaN   NaN  
5      NaN   NaN  
6      NaN   NaN  
7      NaN   NaN  
8      NaN   NaN  
9      NaN   NaN  
10     NaN   NaN  
11     NaN   NaN  
12     NaN   NaN  
13     NaN   NaN  
14     NaN   NaN  
15     NaN   NaN  
16     NaN   NaN  
17     NaN   NaN  
18     NaN   NaN  
19     NaN   NaN  
20     NaN   NaN  
21     NaN   NaN  
22     NaN   NaN  
23     NaN   NaN  
24     NaN   NaN  
25     NaN   NaN  
26     NaN   NaN  
27     NaN   NaN  
28     NaN   NaN  
29     NaN   NaN  
...    ...   ...  
3633   NaN   NaN  
3634   NaN   NaN  
3635   NaN   NaN  
3636   NaN   NaN  
3637   NaN   NaN  
3638   NaN   NaN  
3639   NaN   NaN  
3640   NaN   NaN  
3641   NaN   NaN  
3642   NaN   NaN  
3643   NaN   NaN  
3644   NaN   NaN  
3645   NaN   NaN  
3646   NaN   NaN  
3647   NaN   NaN  
3648   NaN   NaN  
3649   NaN   NaN  
3650   NaN   NaN  
3651   NaN   NaN  
3652   NaN   NaN  
3653   NaN   NaN  
3654   NaN   NaN  
3655   NaN   NaN  
3656   NaN   NaN  
3657   NaN   NaN  
3658   NaN   NaN  
3659   NaN   NaN  
3660   NaN   NaN  
3661   NaN   NaN  
3662   NaN   NaN  

[3663 rows x 23 columns]
-----
OrderedDict([('说明', Empty DataFrame
Columns: [中国人民共和国地市级党委书记数据库(2000-2010)
Data on Prefectural Party Secretary of P. R. China, 2000-2010

如使用本数据库,请在论文中致谢或引用。使用过程中遇到问题,请联系[email protected]。引用格式:

陈硕,2015,从治理到制度:央地关系下的中国政治精英选拔,1368-2010。复旦大学经济系工作论文

Please kindly acknowledge us (see below) when publishing your work using this data source--be it in whole or in part. Contact [email protected] if you have any questions concerning either the data structure or programming.

Shuo Chen, 2015, “From Governance to Institutionalization: Political Selection from the Perspective of Central-local Relations in China--Past and Present (1368-2010)” Department of Economics, Fudan University Working Paper.

地市级党委书记数据库由复旦大学经济学院陈硕副教授及其团队(Fudan WTF Social Science Lab)成员整理。数据库涵盖全国27个省(及自治区)333个地级市(及副省级城市、自治州和地区)2000-2010年989位党委书记个人信息。数据类型为市-年平衡面板, 3663个市-年-党委书记观察值(缺失642个观察值信息,实有3021个观察值)。以下是数据建立方法:

1. 查阅各省(及自治区)年鉴整理出2000-2010年间各地市级行政区划党委书记名单
2. 通过人民网、新华网、各地方政府网站及百度百科查询各党委书记简历
3. 对简历进行电子化整理获得现有数据库
4. 当某市某年有超过1位党委书记任职时,我们以最后上任书记信息为准
]
Index: []), ('中国人民共和国地市级党委书记数据库(2000-10)',       省级政区代码    省级政区名称  地市级政区代码   地市级政区名称    年份 党委书记姓名  出生年份  出生月份  籍贯省份代码  \
0     130000       河北省   130100      石家庄市  2000    陈来立   NaN   NaN     NaN   
1     130000       河北省   130100      石家庄市  2001    吴振华   NaN   NaN     NaN   
2     130000       河北省   130100      石家庄市  2002    吴振华   NaN   NaN     NaN   
3     130000       河北省   130100      石家庄市  2003    吴振华   NaN   NaN     NaN   
4     130000       河北省   130100      石家庄市  2004    吴振华   NaN   NaN     NaN   
5     130000       河北省   130100      石家庄市  2005    吴振华   NaN   NaN     NaN   
6     130000       河北省   130100      石家庄市  2006    吴振华   NaN   NaN     NaN   
7     130000       河北省   130100      石家庄市  2007    吴显国   NaN   NaN     NaN   
8     130000       河北省   130100      石家庄市  2008    吴显国   NaN   NaN     NaN   
9     130000       河北省   130100      石家庄市  2009     车俊   NaN   NaN     NaN   
10    130000       河北省   130100      石家庄市  2010    孙瑞彬   NaN   NaN     NaN   
11    130000       河北省   130200       唐山市  2000    白润璋   NaN   NaN     NaN   
12    130000       河北省   130200       唐山市  2001    白润璋   NaN   NaN     NaN   
13    130000       河北省   130200       唐山市  2002    白润璋   NaN   NaN     NaN   
14    130000       河北省   130200       唐山市  2003     张和   NaN   NaN     NaN   
15    130000       河北省   130200       唐山市  2004     张和   NaN   NaN     NaN   
16    130000       河北省   130200       唐山市  2005     张和   NaN   NaN     NaN   
17    130000       河北省   130200       唐山市  2006     张和   NaN   NaN     NaN   
18    130000       河北省   130200       唐山市  2007     赵勇   NaN   NaN     NaN   
19    130000       河北省   130200       唐山市  2008     赵勇   NaN   NaN     NaN   
20    130000       河北省   130200       唐山市  2009     赵勇   NaN   NaN     NaN   
21    130000       河北省   130200       唐山市  2010     赵勇   NaN   NaN     NaN   
22    130000       河北省   130300      秦皇岛市  2000    王建忠   NaN   NaN     NaN   
23    130000       河北省   130300      秦皇岛市  2001    王建忠   NaN   NaN     NaN   
24    130000       河北省   130300      秦皇岛市  2002    王建忠   NaN   NaN     NaN   
25    130000       河北省   130300      秦皇岛市  2003    宋长瑞   NaN   NaN     NaN   
26    130000       河北省   130300      秦皇岛市  2004    宋长瑞   NaN   NaN     NaN   
27    130000       河北省   130300      秦皇岛市  2005    宋长瑞   NaN   NaN     NaN   
28    130000       河北省   130300      秦皇岛市  2006    宋长瑞   NaN   NaN     NaN   
29    130000       河北省   130300      秦皇岛市  2007    王三堂   NaN   NaN     NaN   
...      ...       ...      ...       ...   ...    ...   ...   ...     ...   
3633  650000  新疆维吾尔自治区   654000  伊犁哈萨克自治州  2003    NaN   NaN   NaN     NaN   
3634  650000  新疆维吾尔自治区   654000  伊犁哈萨克自治州  2004    NaN   NaN   NaN     NaN   
3635  650000  新疆维吾尔自治区   654000  伊犁哈萨克自治州  2005    NaN   NaN   NaN     NaN   
3636  650000  新疆维吾尔自治区   654000  伊犁哈萨克自治州  2006    NaN   NaN   NaN     NaN   
3637  650000  新疆维吾尔自治区   654000  伊犁哈萨克自治州  2007    NaN   NaN   NaN     NaN   
3638  650000  新疆维吾尔自治区   654000  伊犁哈萨克自治州  2008    NaN   NaN   NaN     NaN   
3639  650000  新疆维吾尔自治区   654000  伊犁哈萨克自治州  2009    NaN   NaN   NaN     NaN   
3640  650000  新疆维吾尔自治区   654000  伊犁哈萨克自治州  2010    NaN   NaN   NaN     NaN   
3641  650000  新疆维吾尔自治区   654200      塔城地区  2000    NaN   NaN   NaN     NaN   
3642  650000  新疆维吾尔自治区   654200      塔城地区  2001    NaN   NaN   NaN     NaN   
3643  650000  新疆维吾尔自治区   654200      塔城地区  2002    NaN   NaN   NaN     NaN   
3644  650000  新疆维吾尔自治区   654200      塔城地区  2003    NaN   NaN   NaN     NaN   
3645  650000  新疆维吾尔自治区   654200      塔城地区  2004    NaN   NaN   NaN     NaN   
3646  650000  新疆维吾尔自治区   654200      塔城地区  2005    NaN   NaN   NaN     NaN   
3647  650000  新疆维吾尔自治区   654200      塔城地区  2006    NaN   NaN   NaN     NaN   
3648  650000  新疆维吾尔自治区   654200      塔城地区  2007    NaN   NaN   NaN     NaN   
3649  650000  新疆维吾尔自治区   654200      塔城地区  2008    NaN   NaN   NaN     NaN   
3650  650000  新疆维吾尔自治区   654200      塔城地区  2009    NaN   NaN   NaN     NaN   
3651  650000  新疆维吾尔自治区   654200      塔城地区  2010    NaN   NaN   NaN     NaN   
3652  650000  新疆维吾尔自治区   654300     阿勒泰地区  2000    NaN   NaN   NaN     NaN   
3653  650000  新疆维吾尔自治区   654300     阿勒泰地区  2001    NaN   NaN   NaN     NaN   
3654  650000  新疆维吾尔自治区   654300     阿勒泰地区  2002    NaN   NaN   NaN     NaN   
3655  650000  新疆维吾尔自治区   654300     阿勒泰地区  2003    NaN   NaN   NaN     NaN   
3656  650000  新疆维吾尔自治区   654300     阿勒泰地区  2004    NaN   NaN   NaN     NaN   
3657  650000  新疆维吾尔自治区   654300     阿勒泰地区  2005    NaN   NaN   NaN     NaN   
3658  650000  新疆维吾尔自治区   654300     阿勒泰地区  2006    NaN   NaN   NaN     NaN   
3659  650000  新疆维吾尔自治区   654300     阿勒泰地区  2007    NaN   NaN   NaN     NaN   
3660  650000  新疆维吾尔自治区   654300     阿勒泰地区  2008    NaN   NaN   NaN     NaN   
3661  650000  新疆维吾尔自治区   654300     阿勒泰地区  2009    NaN   NaN   NaN     NaN   
3662  650000  新疆维吾尔自治区   654300     阿勒泰地区  2010    NaN   NaN   NaN     NaN   

     籍贯省份名称  ...   民族   教育 是否是党校教育(是=1,否=0) 专业:人文 专业:社科  专业:理工  专业:农科  专业:医科  \
0       NaN  ...  NaN   硕士              1.0   NaN   NaN    NaN    NaN    NaN   
1       NaN  ...  NaN   本科              0.0   0.0   0.0    1.0    0.0    0.0   
2       NaN  ...  NaN   本科              0.0   0.0   0.0    1.0    0.0    0.0   
3       NaN  ...  NaN   本科              0.0   0.0   0.0    1.0    0.0    0.0   
4       NaN  ...  NaN   本科              0.0   0.0   0.0    1.0    0.0    0.0   
5       NaN  ...  NaN   本科              0.0   0.0   0.0    1.0    0.0    0.0   
6       NaN  ...  NaN   本科              0.0   0.0   0.0    1.0    0.0    0.0   
7       NaN  ...  NaN   硕士              1.0   0.0   1.0    0.0    0.0    0.0   
8       NaN  ...  NaN   硕士              1.0   0.0   1.0    0.0    0.0    0.0   
9       NaN  ...  NaN   本科              1.0   0.0   1.0    0.0    0.0    0.0   
10      NaN  ...  NaN   硕士              1.0   0.0   1.0    0.0    0.0    0.0   
11      NaN  ...  NaN   本科              0.0   0.0   0.0    1.0    0.0    0.0   
12      NaN  ...  NaN   本科              0.0   0.0   0.0    1.0    0.0    0.0   
13      NaN  ...  NaN   本科              0.0   0.0   0.0    1.0    0.0    0.0   
14      NaN  ...  NaN   本科              0.0   0.0   0.0    1.0    0.0    0.0   
15      NaN  ...  NaN   本科              0.0   0.0   0.0    1.0    0.0    0.0   
16      NaN  ...  NaN   本科              0.0   0.0   0.0    1.0    0.0    0.0   
17      NaN  ...  NaN   本科              0.0   0.0   0.0    1.0    0.0    0.0   
18      NaN  ...  NaN   博士              0.0   0.0   1.0    0.0    0.0    0.0   
19      NaN  ...  NaN   博士              0.0   0.0   1.0    0.0    0.0    0.0   
20      NaN  ...  NaN   博士              0.0   0.0   1.0    0.0    0.0    0.0   
21      NaN  ...  NaN   博士              0.0   0.0   1.0    0.0    0.0    0.0   
22      NaN  ...  NaN   本科              0.0   0.0   0.0    1.0    0.0    0.0   
23      NaN  ...  NaN   本科              0.0   0.0   0.0    1.0    0.0    0.0   
24      NaN  ...  NaN   本科              0.0   0.0   0.0    1.0    0.0    0.0   
25      NaN  ...  NaN   硕士              1.0   0.0   1.0    0.0    0.0    0.0   
26      NaN  ...  NaN   硕士              1.0   0.0   1.0    0.0    0.0    0.0   
27      NaN  ...  NaN   硕士              1.0   0.0   1.0    0.0    0.0    0.0   
28      NaN  ...  NaN   硕士              1.0   0.0   1.0    0.0    0.0    0.0   
29      NaN  ...  NaN   硕士              1.0   0.0   1.0    0.0    0.0    0.0   
...     ...  ...  ...  ...              ...   ...   ...    ...    ...    ...   
3633    NaN  ...  NaN  NaN              NaN   NaN   NaN    NaN    NaN    NaN   
3634    NaN  ...  NaN  NaN              NaN   NaN   NaN    NaN    NaN    NaN   
3635    NaN  ...  NaN  NaN              NaN   NaN   NaN    NaN    NaN    NaN   
3636    NaN  ...  NaN  NaN              NaN   NaN   NaN    NaN    NaN    NaN   
3637    NaN  ...  NaN  NaN              NaN   NaN   NaN    NaN    NaN    NaN   
3638    NaN  ...  NaN  NaN              NaN   NaN   NaN    NaN    NaN    NaN   
3639    NaN  ...  NaN  NaN              NaN   NaN   NaN    NaN    NaN    NaN   
3640    NaN  ...  NaN  NaN              NaN   NaN   NaN    NaN    NaN    NaN   
3641    NaN  ...  NaN  NaN              NaN   NaN   NaN    NaN    NaN    NaN   
3642    NaN  ...  NaN  NaN              NaN   NaN   NaN    NaN    NaN    NaN   
3643    NaN  ...  NaN  NaN              NaN   NaN   NaN    NaN    NaN    NaN   
3644    NaN  ...  NaN  NaN              NaN   NaN   NaN    NaN    NaN    NaN   
3645    NaN  ...  NaN  NaN              NaN   NaN   NaN    NaN    NaN    NaN   
3646    NaN  ...  NaN  NaN              NaN   NaN   NaN    NaN    NaN    NaN   
3647    NaN  ...  NaN  NaN              NaN   NaN   NaN    NaN    NaN    NaN   
3648    NaN  ...  NaN  NaN              NaN   NaN   NaN    NaN    NaN    NaN   
3649    NaN  ...  NaN  NaN              NaN   NaN   NaN    NaN    NaN    NaN   
3650    NaN  ...  NaN  NaN              NaN   NaN   NaN    NaN    NaN    NaN   
3651    NaN  ...  NaN  NaN              NaN   NaN   NaN    NaN    NaN    NaN   
3652    NaN  ...  NaN  NaN              NaN   NaN   NaN    NaN    NaN    NaN   
3653    NaN  ...  NaN  NaN              NaN   NaN   NaN    NaN    NaN    NaN   
3654    NaN  ...  NaN  NaN              NaN   NaN   NaN    NaN    NaN    NaN   
3655    NaN  ...  NaN  NaN              NaN   NaN   NaN    NaN    NaN    NaN   
3656    NaN  ...  NaN  NaN              NaN   NaN   NaN    NaN    NaN    NaN   
3657    NaN  ...  NaN  NaN              NaN   NaN   NaN    NaN    NaN    NaN   
3658    NaN  ...  NaN  NaN              NaN   NaN   NaN    NaN    NaN    NaN   
3659    NaN  ...  NaN  NaN              NaN   NaN   NaN    NaN    NaN    NaN   
3660    NaN  ...  NaN  NaN              NaN   NaN   NaN    NaN    NaN    NaN   
3661    NaN  ...  NaN  NaN              NaN   NaN   NaN    NaN    NaN    NaN   
3662    NaN  ...  NaN  NaN              NaN   NaN   NaN    NaN    NaN    NaN   

      入党年份  工作年份  
0      NaN   NaN  
1      NaN   NaN  
2      NaN   NaN  
3      NaN   NaN  
4      NaN   NaN  
5      NaN   NaN  
6      NaN   NaN  
7      NaN   NaN  
8      NaN   NaN  
9      NaN   NaN  
10     NaN   NaN  
11     NaN   NaN  
12     NaN   NaN  
13     NaN   NaN  
14     NaN   NaN  
15     NaN   NaN  
16     NaN   NaN  
17     NaN   NaN  
18     NaN   NaN  
19     NaN   NaN  
20     NaN   NaN  
21     NaN   NaN  
22     NaN   NaN  
23     NaN   NaN  
24     NaN   NaN  
25     NaN   NaN  
26     NaN   NaN  
27     NaN   NaN  
28     NaN   NaN  
29     NaN   NaN  
...    ...   ...  
3633   NaN   NaN  
3634   NaN   NaN  
3635   NaN   NaN  
3636   NaN   NaN  
3637   NaN   NaN  
3638   NaN   NaN  
3639   NaN   NaN  
3640   NaN   NaN  
3641   NaN   NaN  
3642   NaN   NaN  
3643   NaN   NaN  
3644   NaN   NaN  
3645   NaN   NaN  
3646   NaN   NaN  
3647   NaN   NaN  
3648   NaN   NaN  
3649   NaN   NaN  
3650   NaN   NaN  
3651   NaN   NaN  
3652   NaN   NaN  
3653   NaN   NaN  
3654   NaN   NaN  
3655   NaN   NaN  
3656   NaN   NaN  
3657   NaN   NaN  
3658   NaN   NaN  
3659   NaN   NaN  
3660   NaN   NaN  
3661   NaN   NaN  
3662   NaN   NaN  

[3663 rows x 23 columns]), ('ESRI_MAPINFO_SHEET', Empty DataFrame
Columns: []
Index: [])])
-----

你可能感兴趣的:((三篇长文让你玩6Pandas)数据分析入门_PART2常用工具包_CH02数据分析工具:Pandas__Part03(统计分析基础))