import numpy as np
import pandas as pd
from pandas import Series,DataFrame
import matplotlib.pyplot as plt
%matplotlib inline
# Upper-case three-letter month abbreviation -> month number (1-12).
months = {
    abbr: number
    for number, abbr in enumerate(
        ['JAN', 'FEB', 'MAR', 'APR', 'MAY', 'JUN',
         'JUL', 'AUG', 'SEP', 'OCT', 'NOV', 'DEC'],
        start=1,
    )
}

# Subset of candidate names, spelled "Last, First" like the keys of `parties`.
of_interest = [
    'Obama, Barack',
    'Romney, Mitt',
    'Santorum, Rick',
    'Paul, Ron',
    'Gingrich, Newt',
]

# Candidate name -> party affiliation.
parties = {
    'Bachmann, Michelle': 'Republican',
    'Romney, Mitt': 'Republican',
    'Obama, Barack': 'Democrat',
    "Roemer, Charles E. 'Buddy' III": 'Reform',
    'Pawlenty, Timothy': 'Republican',
    'Johnson, Gary Earl': 'Libertarian',
    'Paul, Ron': 'Republican',
    'Santorum, Rick': 'Republican',
    'Cain, Herman': 'Republican',
    'Gingrich, Newt': 'Republican',
    'McCotter, Thaddeus G': 'Republican',
    'Huntsman, Jon': 'Republican',
    'Perry, Rick': 'Republican',
}
# low_memory (bool, default True): pandas processes the file internally in
# chunks, which lowers memory use while parsing but can leave a column with
# mixed inferred types.  Passing False, or giving explicit types through the
# `dtype` parameter, avoids the mixed types.
# Either way the whole file is read into a single DataFrame; it is the
# `chunksize` or `iterator` parameter that returns the data in chunks.
# (low_memory is only valid with the C parser.)
ele = pd.read_csv('./usa_election.csv', low_memory=False)
ele.shape
Out: (536041, 16)
ele.head()
# Add a 'party' column by looking up each row's candidate name in `parties`.
ele['party'] = ele['cand_nm'].map(parties)
ele.head()
# Four distinct parties appear in the data.
ele['party'].unique()
Out:
array(['Republican', 'Democrat', 'Reform', 'Libertarian'], dtype=object)
# More than 530,000 donation records; the number of records differs from party to party
ele['party'].value_counts()
Out:
Democrat 292400
Republican 237575
Reform 5364
Libertarian 702
Name: party, dtype: int64
ele.columns
Out:
Index(['cmte_id', 'cand_id', 'cand_nm', 'contbr_nm', 'contbr_city',
'contbr_st', 'contbr_zip', 'contbr_employer', 'contbr_occupation',
'contb_receipt_amt', 'contb_receipt_dt', 'receipt_desc', 'memo_cd',
'memo_text', 'form_tp', 'file_num', 'party'],
dtype='object')
ele.groupby(['party'])['contb_receipt_amt'].sum()
Out:
party
Democrat 8.105758e+07
Libertarian 4.132769e+05
Reform 3.390338e+05
Republican 1.192255e+08
Name: contb_receipt_amt, dtype: float64
# Number of distinct receipt dates in the data set.
ele['contb_receipt_dt'].unique().size
Out: 376
# Total amount received by each party on each receipt date.  Uses the
# built-in groupby reduction, like the other aggregations in this file,
# instead of pushing Python's builtin `sum` through `.apply`.
ele.groupby(['party', 'contb_receipt_dt'])['contb_receipt_amt'].sum()
ele.dtypes
Out:
cmte_id object
cand_id object
cand_nm object
contbr_nm object
contbr_city object
contbr_st object
contbr_zip object
contbr_employer object
contbr_occupation object
contb_receipt_amt float64
contb_receipt_dt object
receipt_desc object
memo_cd object
memo_text object
form_tp object
file_num int64
party object
dtype: object
months
Out:
{'APR': 4,
'AUG': 8,
'DEC': 12,
'FEB': 2,
'JAN': 1,
'JUL': 7,
'JUN': 6,
'MAR': 3,
'MAY': 5,
'NOV': 11,
'OCT': 10,
'SEP': 9}
# Reformat a raw receipt date string into year-month-day order.
def convert(x):
    """Convert a date such as ``'01-JAN-12'`` to ``'2012-01-01'``.

    Args:
        x: Date string in ``DD-MON-YY`` form, where ``MON`` is a three-letter
            month abbreviation that is a key of the module-level ``months``
            table (matched case-insensitively).

    Returns:
        The date as ``'20YY-MM-DD'``.  The century is hard-coded to ``20``
        and the month number is zero-padded to two digits so the result is a
        well-formed ISO 8601 date.

    Raises:
        ValueError: If ``x`` does not split into exactly three
            ``-``-separated fields.
        KeyError: If the month abbreviation is not a key of ``months``.
    """
    # e.g. '01-JAN-12' -> ('01', 'JAN', '12')
    day, m, year = x.strip().split('-')
    # The keys of `months` are upper-case, so normalise before the lookup.
    month = months[m.upper()]
    # e.g. '2012-01-01'
    return '20{}-{:02d}-{}'.format(year, month, day)
# Rewrite each raw receipt date with convert(), then parse the resulting
# strings into datetime values with pd.to_datetime.
ele['contb_receipt_dt'] = pd.to_datetime(ele['contb_receipt_dt'].map(convert))
# Check that the column's dtype changed.
ele.dtypes
Out: [only the date column is shown] contb_receipt_dt datetime64[ns]
ele.head()
# Total contribution amount per (party, receipt date); the result is a Series
# indexed by both keys.
ele2 = ele.groupby(['party', 'contb_receipt_dt'])['contb_receipt_amt'].sum()
ele2
# With no arguments, unstack() pivots the innermost index level
# (the receipt date) into columns.
ele2.unstack()
# Pivot level 0 (party) into columns instead, so each row is a date and each
# column a party; party/date pairs with no contributions are filled with 0.
ele3 = ele2.unstack(level=0, fill_value=0)
ele3
# Plot each party's running total down the date rows, then enlarge the figure.
plot = ele3.cumsum().plot()
fig = plot.get_figure()
fig.set_size_inches(12, 9)
# level=-1 names the innermost level explicitly: the same pivot as the bare
# unstack() call above.
ele2.unstack(level=-1)
# Total contribution amount per (candidate, contributor occupation).
ele5 = ele.groupby(['cand_nm', 'contbr_occupation'])['contb_receipt_amt'].sum()
ele5
# Every candidate name present in the data.
ele['cand_nm'].unique()
Out:
array(['Bachmann, Michelle', 'Romney, Mitt', 'Obama, Barack',
"Roemer, Charles E. 'Buddy' III", 'Pawlenty, Timothy',
'Johnson, Gary Earl', 'Paul, Ron', 'Santorum, Rick',
'Cain, Herman', 'Gingrich, Newt', 'McCotter, Thaddeus G',
'Huntsman, Jon', 'Perry, Rick'], dtype=object)
ele5['Obama, Barack']
Out:
ele5.index
Out:
MultiIndex(levels=[['Bachmann, Michelle', 'Cain, Herman', 'Gingrich, Newt', 'Huntsman, Jon', 'Johnson, Gary Earl', 'McCotter, Thaddeus G', 'Obama, Barack', 'Paul, Ron', 'Pawlenty, Timothy', 'Perry, Rick', 'Roemer, Charles E. 'Buddy' III', 'Romney, Mitt', 'Santorum, Rick'], [' MIXED-MEDIA ARTIST / STORYTELLER', ' AREA VICE PRESIDENT', ' RESEARCH ASSOCIATE', ' TEACHER', ' THERAPIST', ………………略
ele5.loc[:,'DISABLED VETERAN'] == ele5[:,'DISABLED VETERAN']
Out:
cand_nm
Cain, Herman 300.00
Obama, Barack 4205.00
Paul, Ron 2425.49
Santorum, Rick 250.00
Name: contb_receipt_amt, dtype: float64
ele5[:,'LAWYER']
Out:
cand_nm
Bachmann, Michelle 8318.00
Cain, Herman 3850.00
Gingrich, Newt 47005.00
Huntsman, Jon 49263.00
McCotter, Thaddeus G 500.00
Obama, Barack 1974727.92
Paul, Ron 56209.87
Pawlenty, Timothy 58025.00
Perry, Rick 86505.00
Romney, Mitt 7225.00
Santorum, Rick 14207.00
Name: contb_receipt_amt, dtype: float64
ele5[:,'ATTORNEY']
Out:
cand_nm
Bachmann, Michelle 46214.00
Cain, Herman 76472.87
Gingrich, Newt 205577.00
Huntsman, Jon 143532.50
Johnson, Gary Earl 9425.00
McCotter, Thaddeus G 500.00
Obama, Barack 7112343.35
Paul, Ron 195712.11
Pawlenty, Timothy 238331.10
Perry, Rick 768778.80
Roemer, Charles E. 'Buddy' III 14186.00
Romney, Mitt 3662610.21
Santorum, Rick 107130.00
Name: contb_receipt_amt, dtype: float64
ele5.reset_index()
ele.groupby(['cand_nm'])['contb_receipt_amt'].max()
Out:
cand_nm
Bachmann, Michelle 3022.00
Cain, Herman 10000.00
Gingrich, Newt 5100.00
Huntsman, Jon 5000.00
Johnson, Gary Earl 2500.00
McCotter, Thaddeus G 4000.00
Obama, Barack 1944042.43
Paul, Ron 5000.00
Pawlenty, Timothy 10000.00
Perry, Rick 10000.00
Roemer, Charles E. 'Buddy' III 200.00
Romney, Mitt 12700.00
Santorum, Rick 5000.00
Name: contb_receipt_amt, dtype: float64
ele.query("cand_nm =='Obama, Barack' and contb_receipt_amt == 1944042.43")