# -*- coding: utf-8 -*-
import numpy as np
import pandas as pd
from pandas import DataFrame, Series
# Load the contribution records; the path is relative to the working directory.
fec = pd.read_csv('fec/P00000001-ALL.csv')
# print(fec)
# print(fec.loc[123456])

# --- Add party affiliation -------------------------------------------------
unique_cands = fec.cand_nm.unique()
# print(unique_cands)
parties = {
    'Bachmann, Michelle': 'Republican',
    'Cain, Herman': 'Republican',
    'Gingrich, Newt': 'Republican',
    'Huntsman, Jon': 'Republican',
    'Johnson, Gary Earl': 'Republican',
    'McCotter, Thaddeus G': 'Republican',
    'Obama, Barack': 'Democrat',
    'Paul, Ron': 'Republican',
    'Pawlenty, Timothy': 'Republican',
    'Perry, Rick': 'Republican',
    "Roemer, Charles E. 'Buddy' III": 'Republican',
    'Romney, Mitt': 'Republican',
    'Santorum, Rick': 'Republican',
}
# Series.map translates every value through a mapping, which may be a
# function, a dict, or a Series.
# print(fec.cand_nm[123456:123461].map(parties))
fec['party'] = fec.cand_nm.map(parties)
# print(fec['party'].value_counts())

# Note: the data also contains refunds.
# print((fec.contb_receipt_amt > 0).value_counts())
# To keep the analysis simple, restrict it to positive contributions.  The
# explicit copy makes the column assignments below operate on an independent
# frame instead of on a slice of the unfiltered data.
fec = fec[fec.contb_receipt_amt > 0].copy()
# print(fec)

# --- Donation statistics by occupation and employer -------------------------
# print(fec.contbr_occupation.value_counts()[:10])
# Collapse several spellings of the same occupation into one label.
occ_mapping = {
    'INFORMATION REQUESTED PER BEST EFFORTS': 'NOT PROVIDED',
    'INFORMATION REQUESTED': 'NOT PROVIDED',
    'INFORMATION REQUESTED (BEST EFFORTS)': 'NOT PROVIDED',
    'C.E.O.': 'CEO',
}
# Values that have no entry in the mapping are passed through unchanged.
fec['contbr_occupation'] = fec['contbr_occupation'].map(
    lambda occupation: occ_mapping.get(occupation, occupation))

# Same treatment for the employer column.
emp_mapping = {
    'INFORMATION REQUESTED PER BEST EFFORTS': 'NOT PROVIDED',
    'INFORMATION REQUESTED': 'NOT PROVIDED',
    # The canonical label is upper-case like every other value in these
    # mappings; the previous mixed-case 'self-EMPLOYED' is assumed to have
    # split the category -- TODO confirm against the raw employer values.
    'SELF': 'SELF-EMPLOYED',
    'SELF EMPLOYED': 'SELF-EMPLOYED',
}
# Values that have no entry in the mapping are passed through unchanged.
fec['contbr_employer'] = fec['contbr_employer'].map(
    lambda employer: emp_mapping.get(employer, employer))

# Obama and Romney are the two main candidates.  The subset is taken only
# after the clean-up above so that it carries the normalized occupation and
# employer labels as well.
fec_mrbo = fec[fec.cand_nm.isin(['Obama, Barack', 'Romney, Mitt'])]
# print(fec_mrbo)

# Total contributions per occupation, split by party; keep only occupations
# whose combined total exceeds 2 million dollars.
by_occupation = fec.pivot_table(values='contb_receipt_amt',
                                index='contbr_occupation',
                                columns='party',
                                aggfunc='sum')
over_2mm = by_occupation[by_occupation.sum(axis=1) > 2000000]
# print(over_2mm)
# Horizontal bar chart:
# over_2mm.plot(kind='barh')
# Occupations and employers that contributed the most to Obama and Romney.
def get_top_amounts(group, key, n=5):
    """Return the ``n`` largest contribution totals of ``group`` per ``key``.

    Parameters
    ----------
    group : DataFrame
        Rows to summarize; must contain a ``contb_receipt_amt`` column and
        the column(s) named by ``key``.
    key : str or list of str
        Column name(s) to group by, e.g. ``'contbr_occupation'``.
    n : int, default 5
        Number of top entries to return.

    Returns
    -------
    Series
        Summed ``contb_receipt_amt`` indexed by ``key``, sorted in descending
        order and truncated to at most ``n`` entries.
    """
    totals = group.groupby(key)['contb_receipt_amt'].sum()
    # nlargest sorts in descending order and keeps the first n entries; the
    # previous ``[n:]`` slice discarded exactly the rows that were wanted.
    return totals.nlargest(n)
# Aggregate per candidate for the occupation / employer rankings.
grouped = fec_mrbo.groupby('cand_nm')
# print(grouped.apply(get_top_amounts, 'contbr_occupation', n=7))
# print(grouped.apply(get_top_amounts, 'contbr_employer', n=10))

# --- Bucket the contribution amounts ----------------------------------------
# Bin edges grow by powers of ten (requires numpy to be imported as ``np``).
bins = np.array([0, 1, 10, 100, 1000, 10000, 100000, 1000000, 10000000])
labels = pd.cut(fec_mrbo.contb_receipt_amt, bins)
# print(labels)
# Group the data by candidate name and by bin label.
grouped = fec_mrbo.groupby(['cand_nm', labels])
# print(grouped.size().unstack(0))

# Normalize within each bin: sum the amounts per (candidate, bin), move the
# candidates into columns, then divide every row by its row total.
bucket_sums = grouped.contb_receipt_amt.sum().unstack(0)
# print(bucket_sums)
normed_sums = bucket_sums.div(bucket_sums.sum(axis=1), axis=0)
# print(normed_sums)
# The largest bins are not donations by individuals, so leave them out:
# normed_sums[:-2].plot(kind='barh', stacked=True)

# --- Donation statistics by state -------------------------------------------
grouped = fec_mrbo.groupby(['cand_nm', 'contbr_st'])
totals = grouped.contb_receipt_amt.sum().unstack(0).fillna(0)
# Keep only the rows whose combined total exceeds 100,000.
totals = totals[totals.sum(axis=1) > 100000]
# print(totals[:10])
# Share of each row's total that went to each candidate.
percent = totals.div(totals.sum(axis=1), axis=0)
# print(percent[:10])