%matplotlib inline
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from pandas import Series,DataFrame
# Load the three contribution CSV shards.
contd1 = pd.read_csv('./datas/contb_01.csv')
contd2 = pd.read_csv('./datas/contb_02.csv')
contd3 = pd.read_csv('./datas/contb_03.csv')
contd1.shape
contd2.shape
contd3.shape
# Peek at the first five rows of each shard.
contd1.head()
contd2.head()
contd3.head()
# Stack the shards row-wise. Each shard is read without index_col, so each
# carries its own default row labels; ignore_index=True gives the combined
# frame one fresh, unique index instead of repeating those labels.
contb = pd.concat([contd1, contd2, contd3], axis=0, ignore_index=True)
contb.shape
contb
# info() reports the index, column dtypes, non-null counts and memory usage.
contb.info()
# describe() gives summary statistics for the numeric columns.
contb.describe()
contb.info()
# Use isnull()/notnull() as a boolean filter to inspect rows with no employer.
cond = contb['contbr_employer'].isnull()
contb[cond]
# Fill missing values with a fixed placeholder. The result is assigned back to
# the column rather than calling fillna(inplace=True) on contb[...]: the
# in-place form acts on an intermediate object and is not guaranteed to update
# contb (it warns on recent pandas and is a no-op under Copy-on-Write).
contb['contbr_employer'] = contb['contbr_employer'].fillna('NOT PROVIDE')
contb.info()
# Same placeholder for the contributor state and occupation columns.
contb['contbr_st'] = contb['contbr_st'].fillna('NOT PROVIDE')
contb['contbr_occupation'] = contb['contbr_occupation'].fillna('NOT PROVIDE')
contb.info()
contb
# Party lookup used for the party analysis: candidate name -> party.
# Every listed candidate is a Republican except 'Obama, Barack'.
parties = {
    candidate: 'Democrat' if candidate == 'Obama, Barack' else 'Republican'
    for candidate in (
        'Bachmann, Michelle',
        'Cain, Herman',
        'Gingrich, Newt',
        'Huntsman, Jon',
        'Johnson, Gary Earl',
        'McCotter, Thaddeus G',
        'Obama, Barack',
        'Paul, Ron',
        'Pawlenty, Timothy',
        'Perry, Rick',
        "Roemer, Charles E. 'Buddy' III",
        'Romney, Mitt',
        'Santorum, Rick',
    )
}
# List the distinct candidate names that appear in the data.
contb['cand_nm'].unique()
# Add a party column by looking each candidate name up in `parties`.
# Series.map accepts a dict as well as a function.
contb['party'] = contb['cand_nm'].map(parties)
contb.head()
# Number of contribution records per party.
contb['party'].value_counts()
contb.columns
# Total contribution amount per party.
contb.groupby('party')['contb_receipt_amt'].sum()
# Total contribution amount per contributor occupation.
grouped_occupation = contb.groupby('contbr_occupation')['contb_receipt_amt'].sum()
grouped_occupation
# The 20 largest occupation totals, biggest first.
grouped_occupation.sort_values(ascending=False).head(20)
# Same aggregation keyed by employer; the result reuses the name
# grouped_occupation, as in the original.
grouped_occupation = contb.groupby('contbr_employer')['contb_receipt_amt'].sum()
grouped_occupation
grouped_occupation.sort_values(ascending=False).head(20)
# Keep only rows with a strictly positive amount (zero and negative rows go).
contb_ = contb.loc[contb['contb_receipt_amt'] > 0]
contb_.shape
# Total amount received by each candidate, largest first.
# NOTE(review): this sums the unfiltered `contb`, not `contb_` - confirm that
# including non-positive amounts here is intended.
cand_na_amt = (
    contb.groupby('cand_nm')['contb_receipt_amt']
    .sum()
    .sort_values(ascending=False)
)
cand_na_amt
# Bar chart of the totals, then the same totals as a pie chart on a new,
# larger figure.
cand_na_amt.plot.bar()
plt.figure(figsize=(15, 15))
cand_na_amt.plot.pie()
# Restrict the positive-amount rows to the two candidates being compared.
cond1 = contb_['cand_nm'] == 'Obama, Barack'
cond2 = contb_['cand_nm'] == 'Romney, Mitt'
cond = cond1 | cond2
# .copy() makes contb_vs an independent frame, so adding the 'labels' column
# below is an ordinary assignment and does not raise SettingWithCopyWarning.
contb_vs = contb_[cond].copy()
contb_vs
# The same selection expressed with query(); the result is only displayed.
contb_.query("cand_nm == 'Obama, Barack' or cand_nm == 'Romney, Mitt'")
contb.columns
# Bucket each contribution amount into the intervals delimited by `bins`.
# NOTE(review): pd.cut yields NaN for values outside the outermost edges, so
# any amount above 1,000,000 gets no bucket - confirm whether a higher top
# edge is wanted.
bins = [0, 1, 10, 100, 1000, 10000, 100000, 1000000]  # amount edges
labels = pd.cut(contb_vs['contb_receipt_amt'], bins)
contb_vs['labels'] = labels
contb_vs.head()
# Sum the positive contributions by occupation (rows) and party (columns);
# occupation/party combinations with no data are filled with 0.
pivot_sum = contb_.pivot_table(
    values='contb_receipt_amt',
    index='contbr_occupation',
    columns='party',
    aggfunc='sum',
    fill_value=0,
)
# Keep only occupations whose total across parties exceeds 2,000,000.
pivot_sum = pivot_sum.loc[pivot_sum.sum(axis=1) > 2000000]
pivot_sum
# Grouped bar chart comparing the parties for those occupations.
ax = plt.figure(figsize=(12, 6)).add_subplot(1, 1, 1)
pivot_sum.plot.bar(ax=ax)
# Helper for the per-candidate breakdowns by occupation or employer.
def get_top_amounts(grouped, key, n):
    """Return the n largest contribution totals in `grouped`, summed per `key`.

    Args:
        grouped: Frame with a 'contb_receipt_amt' column and the column(s)
            named by `key` (for example one group handed over by
            ``groupby(...).apply``).
        key: Column name (or names) to group by before summing.
        n: Number of leading entries to keep after sorting.

    Returns:
        Series of summed 'contb_receipt_amt' indexed by `key`, sorted in
        descending order and cut to the first `n` entries.
    """
    totals = grouped.groupby(key)['contb_receipt_amt'].sum()
    # .iloc makes the truncation explicitly positional.
    return totals.sort_values(ascending=False).iloc[:n]
# Effect of occupation on each candidate's contributions: the 8 largest
# occupation totals per candidate, with candidates unstacked into columns.
grouped = contb_vs.groupby('cand_nm')
# Select only the columns get_top_amounts reads, so apply() does not operate
# on the grouping column (that behaviour is deprecated in recent pandas).
grouped[['contbr_occupation', 'contb_receipt_amt']].apply(
    get_top_amounts, 'contbr_occupation', 8
).unstack(level=0)
# Number of contributions each candidate received in every amount bucket.
# 'labels' is categorical; observed=False is passed explicitly so every bucket
# stays as a row (even if empty) and the positional [:-2] slices below do not
# depend on the data or on the pandas default for `observed`.
amt_vs = (
    contb_vs.groupby(['cand_nm', 'labels'], observed=False)
    .size()
    .unstack(level=0, fill_value=0)
)
amt_vs
# Plot the counts per bucket, leaving out the last two (highest) buckets.
# NOTE(review): the original comment refers to amounts above 100000; with the
# bins defined earlier in this file the slice also removes the
# (10000, 100000] bucket - confirm the intended cut-off.
amt_vs[:-2].plot(kind='bar')
# Share of each bucket's contribution count that went to each candidate
# (each row is divided by its row total).
amt_vs_percent = amt_vs.div(amt_vs.sum(axis=1), axis=0)
# 100% stacked bar chart of those shares, same buckets as above.
amt_vs_percent[:-2].plot(kind='bar', stacked=True)
# Reference: 七月在线 (July Online)