#import相关的库
%matplotlib inline
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
# Load the contribution records from the FEC CSV file and preview the first rows.
# NOTE(review): in the preview below contbr_zip is shown as a float
# (e.g. 3.6601e+08); consider dtype={'contbr_zip': str} if ZIP codes are needed.
df = pd.read_csv('datasets/fec/P00000001-ALL.csv')
df.head()
cmte_id cand_id cand_nm contbr_nm contbr_city contbr_st contbr_zip contbr_employer contbr_occupation contb_receipt_amt contb_receipt_dt receipt_desc memo_cd memo_text form_tp file_num
0 C00410118 P20002978 Bachmann, Michelle HARVEY, WILLIAM MOBILE AL 3.6601e+08 RETIRED RETIRED 250.0 20-JUN-11 NaN NaN NaN SA17A 736166
1 C00410118 P20002978 Bachmann, Michelle HARVEY, WILLIAM MOBILE AL 3.6601e+08 RETIRED RETIRED 50.0 23-JUN-11 NaN NaN NaN SA17A 736166
2 C00410118 P20002978 Bachmann, Michelle SMITH, LANIER LANETT AL 3.68633e+08 INFORMATION REQUESTED INFORMATION REQUESTED 250.0 05-JUL-11 NaN NaN NaN SA17A 749073
3 C00410118 P20002978 Bachmann, Michelle BLEVINS, DARONDA PIGGOTT AR 7.24548e+08 NONE RETIRED 250.0 01-AUG-11 NaN NaN NaN SA17A 749073
4 C00410118 P20002978 Bachmann, Michelle WARDENBURG, HAROLD HOT SPRINGS NATION AR 7.19016e+08 NONE RETIRED 300.0 20-JUN-11 NaN NaN NaN SA17A 736166
取几个典型特征分析
# Keep only the columns used in the analysis.
# The explicit .copy() makes df1 an independent frame: a column is assigned to
# df1 later on, and doing that on a plain selection of df raises
# SettingWithCopyWarning.
df1 = df[[
    "cand_nm",
    "contbr_nm",
    "contbr_st",
    "contbr_employer",
    "contbr_occupation",
    "contb_receipt_amt",
    "contb_receipt_dt",
]].copy()
df1.head()
cand_nm contbr_nm contbr_st contbr_employer contbr_occupation contb_receipt_amt contb_receipt_dt
0 Bachmann, Michelle HARVEY, WILLIAM AL RETIRED RETIRED 250.0 20-JUN-11
1 Bachmann, Michelle HARVEY, WILLIAM AL RETIRED RETIRED 50.0 23-JUN-11
2 Bachmann, Michelle SMITH, LANIER AL INFORMATION REQUESTED INFORMATION REQUESTED 250.0 05-JUL-11
3 Bachmann, Michelle BLEVINS, DARONDA AR NONE RETIRED 250.0 01-AUG-11
4 Bachmann, Michelle WARDENBURG, HAROLD AR NONE RETIRED 300.0 20-JUN-11
简单总览数据情况
# Shape of the frame as (rows, columns)
df1.shape
# Per-column summary: column name, non-null count and dtype.
# info is a method, so it has to be called to print the report.
df1.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1001731 entries, 0 to 1001730
Data columns (total 7 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 cand_nm 1001731 non-null object
1 contbr_nm 1001731 non-null object
2 contbr_st 1001727 non-null object
3 contbr_employer 988002 non-null object
4 contbr_occupation 993301 non-null object
5 contb_receipt_amt 1001731 non-null float64
6 contb_receipt_dt 1001731 non-null object
dtypes: float64(1), object(6)
各字段含义
cand_nm – 接受捐赠的候选人姓名
contbr_nm – 捐赠人姓名
contbr_st – 捐赠人所在州
contbr_employer – 捐赠人所在公司
contbr_occupation – 捐赠人职业
contb_receipt_amt – 捐赠数额(美元)
contb_receipt_dt – 收到捐款的日期
# Summary statistics of the numeric columns (describe is a method and must be called)
df1.describe()
# df1.info() shows a few missing values in contbr_employer and
# contbr_occupation; fill both with 'NOT PROVIDED'.
# The frame is df1 (no variable named `data` exists here), and the result is
# assigned back because fillna(inplace=True) on a selected column may act on a
# temporary copy.
df1['contbr_employer'] = df1['contbr_employer'].fillna('NOT PROVIDED')
df1['contbr_occupation'] = df1['contbr_occupation'].fillna('NOT PROVIDED')
针对"cand_nm 候选人姓名"这一条特征,我们查看他的所有名单(pd.Series.value_counts()),
并用字典标明是属于共和党还是民主党
df1["cand_nm"].value_counts() # number of rows for each distinct value in this column
Obama, Barack 593746
Paul, Ron 143757
Romney, Mitt 107229
Gingrich, Newt 47679
Santorum, Rick 46559
Cain, Herman 20107
Perry, Rick 13575
Bachmann, Michelle 13140
Roemer, Charles E. 'Buddy' III 5920
Pawlenty, Timothy 4555
Huntsman, Jon 4156
Johnson, Gary Earl 1234
McCotter, Thaddeus G 74
Name: cand_nm, dtype: int64
df['cand_nm'].unique() 也可以查看这一列的不同值,返回值为一维数组
# Lookup table from candidate name to party: every candidate starts out as
# 'Republican', then the single Democrat is overridden.
parties = dict.fromkeys(
    [
        'Bachmann, Michelle',
        'Cain, Herman',
        'Gingrich, Newt',
        'Huntsman, Jon',
        'Johnson, Gary Earl',
        'McCotter, Thaddeus G',
        'Obama, Barack',
        'Paul, Ron',
        'Pawlenty, Timothy',
        'Perry, Rick',
        "Roemer, Charles E. 'Buddy' III",
        'Romney, Mitt',
        'Santorum, Rick',
    ],
    'Republican',
)
parties['Obama, Barack'] = 'Democrat'
# Add a party column by mapping each candidate name through the parties dict
df1["party"] = df1["cand_nm"].map(parties)
# Total contribution amount per party.
# Select the amount column explicitly: summing every column would also try to
# aggregate the text columns (names, employers, dates).
df1.groupby('party')[['contb_receipt_amt']].sum()
contb_receipt_amt
party
Democrat 1.335026e+08
Republican 1.652488e+08
# Number of contribution records per party (row counts, not votes)
df1['party'].value_counts()
Democrat 593746
Republican 407985
Name: party, dtype: int64
发现Republican(共和党)接受的赞助总金额更高,Democrat(民主党)获得的赞助次数更多一些
# To simplify the analysis, keep only rows with contb_receipt_amt > 0.
# .copy() gives df2 its own data: columns of df2 are reassigned and added
# later, which on a filtered selection of df1 raises SettingWithCopyWarning.
df2 = df1[df1["contb_receipt_amt"] > 0].copy()
由于奥巴马和罗姆尼为两个主要候选人,因此准备一个仅包含对他们两人捐赠记录的子集
# Rows of df2 whose candidate is Obama or Romney.
# NOTE(review): this subset is created before the occupation/employer labels
# of df2 are normalized further down — confirm df_lmab is meant to keep the
# raw labels.
df_lmab = df2[df2['cand_nm'].isin(['Obama, Barack','Romney, Mitt'])]
排序:按照职业汇总对赞助总金额进行排序
根据职业分析捐赠是一个常见的统计分析:律师倾向于捐更多的钱给民主党,商务人士更偏向于共和党
#DataFrame.sort_values(by, ascending=True, inplace=False)
by是根据哪一列进行排序,可以传入多列;ascending=True是升序排序,False为降序;inplace=True则是修改原dataframe,默认为False
# Total contribution amount per occupation, largest first, top 20 shown
(
    df2.groupby('contbr_occupation')['contb_receipt_amt']
    .sum()
    .sort_values(ascending=False)
    .head(20)
)
利用函数进行数据转换:职业与雇主信息分析
(利用了dict.get,它允许没有映射关系的职业也能“通过”)
# Lookup table that folds different spellings of the same occupation into one
# label, e.g. 'C.E.O.' becomes 'CEO'.
occupation_map = {
    **dict.fromkeys(
        ['INFORMATION REQUESTED PER BEST EFFORTS', 'INFORMATION REQUESTED'],
        'NOT PROVIDED',
    ),
    **dict.fromkeys(['SELF', 'SELF EMPLOYED'], 'SELF-EMPLOYED'),
    'C.E.O.': 'CEO',
    'LAWYER': 'ATTORNEY',
}


def f(x):
    """Return the normalized occupation for x, or x unchanged when unmapped."""
    return occupation_map.get(x, x)
# Replace every occupation with its normalized label
df2['contbr_occupation'] = df2['contbr_occupation'].map(f)
对雇主的捐献进行同样处理
# Lookup table that folds different spellings of the same employer into one label
emp_mapping = {
    **dict.fromkeys(
        ['INFORMATION REQUESTED PER BEST EFFORTS', 'INFORMATION REQUESTED'],
        'NOT PROVIDED',
    ),
    **dict.fromkeys(['SELF', 'SELF EMPLOYED'], 'SELF-EMPLOYED'),
}


def f(x):
    """Return the normalized employer for x, or x unchanged when unmapped."""
    return emp_mapping.get(x, x)
# Replace every employer with its normalized label
df2['contbr_employer'] = df2['contbr_employer'].map(f)
利用cut函数根据出资额大小将数据离散化到多个面元(桶)中
把之前筛选的奥巴马和罗姆尼的子集中的捐赠金额进行处理
# Bucket edges: 0 followed by the powers of ten from 1 up to 10,000,000
bins = np.array([0] + [10 ** exponent for exponent in range(8)])
# Assign each contribution amount in df_lmab to its bucket
labels = pd.cut(df_lmab['contb_receipt_amt'], bins=bins)
Groupby即分组运算,其过程可概括为“split-apply-combine”(拆分-应用-合并)。即分组后对各部分进行运算
拆分的对象为pandas对象(Series、DataFrame等);拆分的依据是分组键,可以是列表、数组(长度与待分组的轴一样)、字典、Series、函数、DataFrame列名
透视表(pivot_table)分析党派和职业
# Sum the contribution amounts by occupation (rows) and party (columns),
# similar to a pivot table in Excel.
# No variable named `data` is defined in this file; df2 is the frame that holds
# the positive amounts together with the party column.
by_occupation = df2.pivot_table(
    'contb_receipt_amt',
    index='contbr_occupation',
    columns='party',
    aggfunc='sum',
)
# Keep only the occupations whose combined total exceeds 2,000,000
over_2mm = by_occupation[by_occupation.sum(axis=1) > 2000000]
数据聚合(aggregate)
数据聚合,即任何能从数组产生标量值的数据转换过程。如mean、count、min、sum等,此外可以自定义聚合函数,或是已经定义好的任何方法。Groupby方法后的聚合,是在分组对象上调用聚合方法,再进行汇总。
分组级运算及转换(transform和apply)
#来了解一下对Obama和Romney总出资最高的职业和雇主
def groupby_again(group, key, n=2):
    """Return the n largest contribution totals of ``group`` aggregated by ``key``.

    Parameters:
        group: DataFrame with a ``contb_receipt_amt`` column and the column(s)
            named by ``key``.
        key: column name (or any other groupby key) to aggregate on.
        n: number of rows to keep.

    Returns:
        DataFrame indexed by ``key`` with the single column
        ``contb_receipt_amt``, sorted from largest to smallest total.
    """
    # Sum only the amount column; summing the whole frame would also try to
    # aggregate the text columns.
    totals = group.groupby(key)[['contb_receipt_amt']].sum()
    return totals.sort_values(by='contb_receipt_amt', ascending=False)[:n]
groupbyed = df_lmab.groupby('cand_nm')
# Top 7 occupations by total amount for each candidate
groupbyed.apply(groupby_again, 'contbr_occupation', n=7)
# Same analysis for employers; reuse the grouped object defined above
# (the name `grouped` is never defined in this file).
groupbyed.apply(groupby_again, 'contbr_employer', n=10)
#来了解一下对两位候选人资助频次最高的人
def paixu(group, n=2):
    """Return the n most frequent ``contbr_nm`` values in ``group``.

    Parameters:
        group: DataFrame with a ``contbr_nm`` column.
        n: number of names to keep.

    Returns:
        Series indexed by contributor name holding the occurrence counts,
        ordered from most to least frequent.
    """
    # value_counts() already returns the counts in descending order, so no
    # additional sort is needed.
    return group["contbr_nm"].value_counts().head(n)
# Group df_lmab by candidate and list, per candidate, the five contributor
# names that occur most often
groupeed = df_lmab.groupby(by="cand_nm")
groupeed.apply(paixu, n=5)
对赞助金额进行分组统计
首先统计各出资区间的赞助笔数。这里用到unstack():stack()是“堆叠”,把列索引转为行索引的内层;unstack()则相反,把多层行索引中的某一层转为列索引,得到的结果是一个DataFrame表格
# Number of contributions per candidate and amount bucket.
# observed=False is passed explicitly so that buckets without any contribution
# stay in the result (with count 0) regardless of the pandas default.
df_lmab.groupby(["cand_nm", labels], observed=False).size()
#输出
cand_nm contb_receipt_amt
Obama, Barack (0, 1] 493
(1, 10] 40070
(10, 100] 372280
(100, 1000] 153991
(1000, 10000] 22284
(10000, 100000] 2
(100000, 1000000] 3
(1000000, 10000000] 4
Romney, Mitt (0, 1] 77
(1, 10] 3681
(10, 100] 31853
(100, 1000] 43357
(1000, 10000] 26186
(10000, 100000] 1
(100000, 1000000] 0
(1000000, 10000000] 0
dtype: int64
# Same counts with the candidate level moved into columns (one row per bucket).
# observed=False keeps the empty buckets in the table regardless of the pandas default.
df_lmab.groupby(["cand_nm", labels], observed=False).size().unstack(0)
#输出
cand_nm Obama, Barack Romney, Mitt
contb_receipt_amt
(0, 1] 493 77
(1, 10] 40070 3681
(10, 100] 372280 31853
(100, 1000] 153991 43357
(1000, 10000] 22284 26186
(10000, 100000] 2 1
(100000, 1000000] 3 0
(1000000, 10000000] 4 0
统计各区间的赞助金额
# Total contribution amount per candidate and amount bucket, with the
# candidates as columns.
# observed=False keeps the empty buckets in the table regardless of the pandas default.
grouped_bins = df_lmab.groupby(["cand_nm", labels], observed=False)
bucket_sums = grouped_bins['contb_receipt_amt'].sum().unstack(0)
bucket_sums
cand_nm Obama, Barack Romney, Mitt
contb_receipt_amt
(0, 1] 318.24 77.00
(1, 10] 337267.62 29819.66
(10, 100] 20288981.41 1987783.76
(100, 1000] 54798531.46 22363381.69
(1000, 10000] 51753705.67 63942145.42
(10000, 100000] 59100.00 12700.00
(100000, 1000000] 1490683.08 NaN
(1000000, 10000000] 7148839.76 NaN
对赞助金额进行可视化
# Bar chart of the total amount per bucket for each candidate
bucket_sums.plot(kind='bar')
# Divide each row by its row total to get each candidate's share per bucket
normed_sums = bucket_sums.div(bucket_sums.sum(axis=1), axis=0)
# Plot all but the last two buckets as a horizontal bar chart
# (the closing quote of "barh" was missing)
normed_sums[:-2].plot(kind="barh")
# Parse the receipt date strings into datetimes
df2['time'] = pd.to_datetime(df2['contb_receipt_dt'])
# Make the parsed time the index of df2 (the frame is df2; `daf2` was a typo)
df2.set_index('time', inplace=True)
pandas对象都拥有resample方法,该方法是所有频率转换的工具函数。resample拥有类似groupby的API可以调用resample对数据分组,之后再调用聚合函数
# Count the rows per candidate within each 'M' resampling period of the index.
# NOTE(review): the 'M' alias is deprecated in favour of 'ME' in recent pandas
# releases — confirm against the pandas version in use.
vs_time = df2.groupby('cand_nm').resample('M')['cand_nm'].count()
vs_time.unstack(0) # move the candidate level of the index into columns
参考链接
2012美国大选数据集