# 数据分析:2012联邦选举委员会数据库

# -*- coding: utf-8 -*-

import pandas as pd
from pandas import DataFrame, Series

# Load the FEC contribution records from the local CSV export.
fec = pd.read_csv('fec/P00000001-ALL.csv')
# To inspect the table or a single record:
#   print(fec)
#   print(fec.loc[123456])

# Attach a party affiliation to every record.
unique_cands = fec['cand_nm'].unique()
# print(unique_cands)

# Candidate name -> party; every listed candidate except Obama is mapped to
# 'Republican'.
parties = {
    name: 'Democrat' if name == 'Obama, Barack' else 'Republican'
    for name in (
        'Bachmann, Michelle',
        'Cain, Herman',
        'Gingrich, Newt',
        'Huntsman, Jon',
        'Johnson, Gary Earl',
        'McCotter, Thaddeus G',
        'Obama, Barack',
        'Paul, Ron',
        'Pawlenty, Timothy',
        'Perry, Rick',
        "Roemer, Charles E. 'Buddy' III",
        'Romney, Mitt',
        'Santorum, Rick',
    )
}
# Series.map accepts a function, a dict or another Series as the lookup.
# print(fec['cand_nm'][12456:123461].map(parties))
fec['party'] = fec['cand_nm'].map(parties)
# print(fec['party'].value_counts())

# The amounts include non-positive entries (refunds):
# print((fec['contb_receipt_amt'] > 0).value_counts())

# To keep the analysis simple, work with positive contributions only.
fec = fec.loc[fec['contb_receipt_amt'] > 0]
# print(fec)

# Obama and Romney are the two main candidates; keep a subset for them.
fec_mrbo = fec.loc[fec['cand_nm'].isin(['Obama, Barack', 'Romney, Mitt'])]
# print(fec_mrbo)


# Donation statistics by occupation and employer.
# Several occupation strings are variants of the same answer; collapse them
# onto one canonical label so that they aggregate into a single group.
# Inspect the raw values with: fec.contbr_occupation.value_counts()[:10]
occ_mapping = {
        'INFORMATION REQUESTED PER BEST EFFORTS': 'NOT PROVIDED',
        'INFORMATION REQUESTED': 'NOT PROVIDED',
        'INFORMATION REQUESTED (BEST EFFORTS)': 'NOT PROVIDED',
        'C.E.O.': 'CEO'
}
# dict.get(x, x) returns x itself when no mapping exists, so unmapped values
# pass through unchanged.
f = lambda x: occ_mapping.get(x, x)
fec.contbr_occupation = fec.contbr_occupation.map(f)

# Same clean-up for the employer column.
# The canonical label is upper-case 'SELF-EMPLOYED', matching the casing of
# every other label here. The former mixed-case 'self-EMPLOYED' would form a
# separate group from existing 'SELF-EMPLOYED' rows (assumed to be present in
# the data -- TODO confirm against the CSV).
emp_mapping = {
        'INFORMATION REQUESTED PER BEST EFFORTS': 'NOT PROVIDED',
        'INFORMATION REQUESTED': 'NOT PROVIDED',
        'SELF': 'SELF-EMPLOYED',
        'SELF EMPLOYED': 'SELF-EMPLOYED',
}
# As above: values without a mapping are returned unchanged.
f = lambda x: emp_mapping.get(x, x)
fec.contbr_employer = fec.contbr_employer.map(f)

# Sum contributions per occupation (rows) and party (columns).
# pivot_table takes index=/columns=; the older rows=/cols= keywords are no
# longer accepted by pandas.
by_occupation = fec.pivot_table('contb_receipt_amt',
                                index='contbr_occupation',
                                columns='party', aggfunc='sum')
# Keep only occupations whose total across parties exceeds 2 million dollars.
over_2mm = by_occupation[by_occupation.sum(axis=1) > 2000000]
# print(over_2mm)

# Horizontal bar chart of the result:
# over_2mm.plot(kind='barh')

# Occupations and employers with the highest total donations to Obama and Romney
def get_top_amounts(group, key, n=5):
    """Return the ``n`` largest contribution totals in ``group`` by ``key``.

    Args:
        group: DataFrame with a ``contb_receipt_amt`` column.
        key: Column name (or any other ``groupby`` key) to total by.
        n: Number of largest totals to keep.

    Returns:
        Series of summed ``contb_receipt_amt`` indexed by ``key``, ordered
        from largest to smallest and holding at most ``n`` entries.
    """
    totals = group.groupby(key)['contb_receipt_amt'].sum()

    # nlargest keeps the first n entries of the descending ordering. The
    # previous ``order(ascending=False)[n:]`` skipped exactly those entries,
    # and Series.order is not available in current pandas.
    return totals.nlargest(n)
    
# Top occupations and employers per candidate.
grouped = fec_mrbo.groupby('cand_nm')
# print(grouped.apply(get_top_amounts, 'contbr_occupation', n=7))
# print(grouped.apply(get_top_amounts, 'contbr_employer', n=10))

# Bucket the contribution amounts by order of magnitude.
# A plain list is used for the bin edges: pd.cut accepts any sequence of
# scalars, and numpy is not imported in this script, so the former
# np.array(...) call raised NameError.
bins = [0, 1, 10, 100, 1000, 10000, 100000, 1000000, 10000000]
labels = pd.cut(fec_mrbo.contb_receipt_amt, bins)
# print(labels)

# Group by candidate name and bucket label.
grouped = fec_mrbo.groupby(['cand_nm', labels])
# print(grouped.size().unstack(0))

# Total amount per bucket, with one column per candidate.
bucket_sums = grouped.contb_receipt_amt.sum().unstack(0)
# print(bucket_sums)
# Divide each row by its row total, giving every candidate's share of a bucket.
normed_sums = bucket_sums.div(bucket_sums.sum(axis=1), axis=0)
# print(normed_sums)

# The two largest buckets are not individual donations, so leave them out:
# normed_sums[:-2].plot(kind='barh', stacked=True)

# Donation statistics by state.
grouped = fec_mrbo.groupby(['cand_nm', 'contbr_st'])
# One row per state, one column per candidate; missing combinations become 0.
totals = grouped['contb_receipt_amt'].sum().unstack(level=0).fillna(0)
# Keep only the states whose combined total exceeds 100000.
totals = totals.loc[totals.sum(axis=1) > 100000]
# print(totals[:10])

# Each candidate's share of the total within a state.
percent = totals.div(totals.sum(axis=1), axis='index')
# print(percent[:10])

# 你可能感兴趣的:(数据分析:2012联邦选举委员会数据库)