参考:https://wwa.lanzoul.com/io4Lo00kvkob
字段含义:
import pandas as pd
# Load the raw contribution records. Lines that cannot be parsed are skipped
# (on_bad_lines='skip') instead of aborting the whole read.
df = pd.read_csv('./usa_election.txt', on_bad_lines='skip')

# Keep only the columns used by the analysis; every other column is dropped.
kept_columns = [
    'cand_nm',
    'contbr_nm',
    'contbr_st',
    'contbr_employer',
    'contbr_occupation',
    'contb_receipt_amt',
    'contb_receipt_dt',
]
df = df[kept_columns]

# Preview the first rows, then the per-column non-null counts and dtypes.
# NOTE: DataFrame.info() prints its report itself and returns None, so the
# surrounding print() additionally emits a trailing "None" line.
print(df.head())
print(df.info())
输出:
cand_nm contbr_nm ... contb_receipt_amt contb_receipt_dt
0 Bachmann, Michelle HARVEY, WILLIAM ... 250.0 20-JUN-11
1 Bachmann, Michelle HARVEY, WILLIAM ... 50.0 23-JUN-11
2 Bachmann, Michelle SMITH, LANIER ... 250.0 05-JUL-11
3 Bachmann, Michelle BLEVINS, DARONDA ... 250.0 01-AUG-11
4 Bachmann, Michelle WARDENBURG, HAROLD ... 300.0 20-JUN-11
[5 rows x 7 columns]
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 536041 entries, 0 to 536040
Data columns (total 7 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 cand_nm 536041 non-null object
1 contbr_nm 536041 non-null object
2 contbr_st 536040 non-null object
3 contbr_employer 525088 non-null object
4 contbr_occupation 530520 non-null object
5 contb_receipt_amt 536041 non-null float64
6 contb_receipt_dt 536041 non-null object
dtypes: float64(1), object(6)
memory usage: 28.6+ MB
None
# Fill every missing value with the placeholder string 'NOT PROVIDE'.
df.fillna(value='NOT PROVIDE', inplace=True)
# Re-check the non-null counts to confirm no column has gaps any more.
# (info() returns None, so print() also emits a trailing "None".)
print(df.info())

# Outlier handling: discard donations whose amount is zero or negative.
non_positive = df['contb_receipt_amt'] <= 0
df = df.loc[~non_positive]
# Equivalent in-place form:
# df.drop(labels=df.loc[df['contb_receipt_amt'] <= 0].index, axis=0, inplace=True)
print(df.info())
输出:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 536041 entries, 0 to 536040
Data columns (total 7 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 cand_nm 536041 non-null object
1 contbr_nm 536041 non-null object
2 contbr_st 536041 non-null object
3 contbr_employer 536041 non-null object
4 contbr_occupation 536041 non-null object
5 contb_receipt_amt 536041 non-null float64
6 contb_receipt_dt 536041 non-null object
dtypes: float64(1), object(6)
memory usage: 28.6+ MB
None
<class 'pandas.core.frame.DataFrame'>
Int64Index: 530314 entries, 0 to 536040
Data columns (total 7 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 cand_nm 530314 non-null object
1 contbr_nm 530314 non-null object
2 contbr_st 530314 non-null object
3 contbr_employer 530314 non-null object
4 contbr_occupation 530314 non-null object
5 contb_receipt_amt 530314 non-null float64
6 contb_receipt_dt 530314 non-null object
dtypes: float64(1), object(6)
memory usage: 32.4+ MB
None
# Lookup table: candidate name (exactly as it appears in the 'cand_nm'
# column) -> political party.
parties = {
    "Bachmann, Michelle": "Republican",
    "Romney, Mitt": "Republican",
    "Obama, Barack": "Democrat",
    "Roemer, Charles E. 'Buddy' III": "Reform",
    "Pawlenty, Timothy": "Republican",
    "Johnson, Gary Earl": "Libertarian",
    "Paul, Ron": "Republican",
    "Santorum, Rick": "Republican",
    "Cain, Herman": "Republican",
    "Gingrich, Newt": "Republican",
    "McCotter, Thaddeus G": "Republican",
    "Huntsman, Jon": "Republican",
    "Perry, Rick": "Republican",
}
# Distinct candidate names (returns the values themselves).
num1 = df['cand_nm'].unique()
print(num1)
print("*" * 100)

# Number of distinct candidates (returns only the count).
num2 = df['cand_nm'].nunique()
print(num2)
print("*" * 100)

# Add a 'party' column by mapping each candidate name through `parties`.
candidate_names = df['cand_nm']
df['party'] = candidate_names.map(parties)
print(df.head())
print("*" * 100)

# Which distinct values occur in the new 'party' column.
party_column = df['party']
print(party_column.unique())
print("*" * 100)

# How many rows (donations) each party value has.
print(party_column.value_counts())
print("*" * 100)

# Total contb_receipt_amt received per party.
df_sum = df.groupby('party')['contb_receipt_amt'].sum()
print(df_sum)
print("*" * 100)

# Total contb_receipt_amt received per party for each receipt date.
# The date column still holds the raw 'DD-MON-YY' text at this point, so the
# groups are keyed (and ordered) by that text.
df_sum2 = df.groupby(['contb_receipt_dt', 'party'])['contb_receipt_amt'].sum()
print(df_sum2)
print("*" * 100)
# Convert the table's date text to the 'yyyy-mm-dd' format.
# Month abbreviation used in the raw 'DD-MON-YY' dates -> month number.
months = {"JAN": 1, "FEB": 2, "MAR": 3, "APR": 4, "MAY": 5, "JUN": 6,
          "JUL": 7, "AUG": 8, "SEP": 9, "OCT": 10, "NOV": 11, "DEC": 12}


def transform_date(d):
    """Convert a 'DD-MON-YY' date string to 'yyyy-mm-dd'.

    Args:
        d: Date text made of day, upper-case English month abbreviation and
            two-digit year joined by '-', e.g. '20-JUN-11'.

    Returns:
        The date as 'yyyy-mm-dd', e.g. '2011-06-20'. Month and day are
        zero-padded to two digits so the result really matches the
        'yyyy-mm-dd' layout and sorts chronologically as plain text.
        The century is always taken to be '20'.

    Raises:
        ValueError: If ``d`` does not split into exactly three '-' parts.
        KeyError: If the month part is not a key of ``months``.
    """
    day, month, year = d.split("-")
    month_number = months[month]
    # Pad both month and day; without padding June became '6', not '06'.
    return f"20{year}-{month_number:02d}-{day.zfill(2)}"
# Rewrite every receipt date in place using transform_date.
df['contb_receipt_dt'] = df['contb_receipt_dt'].map(transform_date)
print(df.head())
print("*" * 100)

# Which candidates do donors whose occupation is 'DISABLED VETERAN' support?
# Select those donors' rows, then total their donations per candidate.
is_disabled_veteran = df['contbr_occupation'] == 'DISABLED VETERAN'
disabled_veteran_df = df.loc[is_disabled_veteran]
disabled_veteran_support = (
    disabled_veteran_df.groupby('cand_nm')['contb_receipt_amt'].sum()
)
print(disabled_veteran_support)
print("*" * 100)
输出:
['Bachmann, Michelle' 'Romney, Mitt' 'Obama, Barack'
"Roemer, Charles E. 'Buddy' III" 'Pawlenty, Timothy' 'Johnson, Gary Earl'
'Paul, Ron' 'Santorum, Rick' 'Cain, Herman' 'Gingrich, Newt'
'McCotter, Thaddeus G' 'Huntsman, Jon' 'Perry, Rick']
****************************************************************************************************
13
****************************************************************************************************
cand_nm contbr_nm ... contb_receipt_dt party
0 Bachmann, Michelle HARVEY, WILLIAM ... 20-JUN-11 Republican
1 Bachmann, Michelle HARVEY, WILLIAM ... 23-JUN-11 Republican
2 Bachmann, Michelle SMITH, LANIER ... 05-JUL-11 Republican
3 Bachmann, Michelle BLEVINS, DARONDA ... 01-AUG-11 Republican
4 Bachmann, Michelle WARDENBURG, HAROLD ... 20-JUN-11 Republican
[5 rows x 8 columns]
****************************************************************************************************
['Republican' 'Democrat' 'Reform' 'Libertarian']
****************************************************************************************************
Democrat 289999
Republican 234300
Reform 5313
Libertarian 702
Name: party, dtype: int64
****************************************************************************************************
party
Democrat 8.259441e+07
Libertarian 4.132769e+05
Reform 3.429658e+05
Republican 1.251181e+08
Name: contb_receipt_amt, dtype: float64
****************************************************************************************************
contb_receipt_dt party
01-APR-11 Reform 50.00
Republican 12635.00
01-AUG-11 Democrat 182198.00
Libertarian 1000.00
Reform 1847.00
...
31-MAY-11 Republican 313839.80
31-OCT-11 Democrat 216971.87
Libertarian 4250.00
Reform 3205.00
Republican 751542.36
Name: contb_receipt_amt, Length: 1183, dtype: float64
****************************************************************************************************
cand_nm contbr_nm ... contb_receipt_dt party
0 Bachmann, Michelle HARVEY, WILLIAM ... 2011-6-20 Republican
1 Bachmann, Michelle HARVEY, WILLIAM ... 2011-6-23 Republican
2 Bachmann, Michelle SMITH, LANIER ... 2011-7-05 Republican
3 Bachmann, Michelle BLEVINS, DARONDA ... 2011-8-01 Republican
4 Bachmann, Michelle WARDENBURG, HAROLD ... 2011-6-20 Republican
[5 rows x 8 columns]
****************************************************************************************************
cand_nm
Cain, Herman 300.00
Obama, Barack 4205.00
Paul, Ron 2425.49
Santorum, Rick 250.00
Name: contb_receipt_amt, dtype: float64
****************************************************************************************************