#2.1.6 ★Challenge: Summarizing Data.md

1.Introduction to the Data

import pandas as pd
# Load both data sets; the relative paths are resolved against the current working directory.
all_ages = pd.read_csv('all-ages.csv')
recent_grads = pd.read_csv('recent-grads.csv')

2.Summarizing Major Categories

input
# 1. Create the (initially empty) dictionary variables
aa_cat_counts = {}  # per-category totals for the all-ages data set
rg_cat_counts = {}  # per-category totals for the recent-graduates data set

# 2. Helper function that returns the per-category totals as a dictionary
def calculate_major_cat_counts(df):
    """Return the summed ``Total`` for every distinct ``Major_category``.

    Args:
        df: DataFrame that has a 'Major_category' column and a numeric
            'Total' column.

    Returns:
        dict mapping each category to the sum of 'Total' over the rows in
        that category. Keys appear in the order the categories first occur
        in ``df``. Rows whose category is missing (NaN) are left out.

    Raises:
        KeyError: if either column is absent from ``df``.
    """
    # One grouped pass replaces filtering the whole frame once per category;
    # sort=False keeps the first-appearance order that unique() would give.
    totals = df.groupby('Major_category', sort=False)['Total'].sum()
    return totals.to_dict()

# 3. Fill both dictionaries using the helper function
aa_cat_counts = calculate_major_cat_counts(all_ages)
rg_cat_counts = calculate_major_cat_counts(recent_grads)

# Print the two dictionaries with a dashed separator line between them.
print(aa_cat_counts, '---------------------------', rg_cat_counts, sep='\n')
output
{'Psychology & Social Work': 1987278, 'Biology & Life Science': 1338186, 'Arts': 1805865, 'Education': 4700118, 'Physical Sciences': 1025318, 'Agriculture & Natural Resources': 632437, 'Computers & Mathematics': 1781378, 'Social Science': 2654125, 'Law & Public Policy': 902926, 'Health': 2950859, 'Interdisciplinary': 45199, 'Humanities & Liberal Arts': 3738335, 'Engineering': 3576013, 'Business': 9858741, 'Communications & Journalism': 1803822, 'Industrial Arts & Consumer Services': 1033798} 
---------------------------
 {'Interdisciplinary': 12296, 'Psychology & Social Work': 481007, 'Biology & Life Science': 453862, 'Arts': 357130, 'Education': 559129, 'Agriculture & Natural Resources': 79981, 'Physical Sciences': 185479, 'Industrial Arts & Consumer Services': 229792, 'Law & Public Policy': 179107, 'Health': 463230, 'Computers & Mathematics': 299008, 'Humanities & Liberal Arts': 713468, 'Engineering': 537583, 'Business': 1302376, 'Communications & Journalism': 392601, 'Social Science': 529966}

3.Low-Wage Job Rates

input
low_wage_percent = 0.0  # placeholder; rebound below to the formatted percentage string
low_wage_jobs_sum = recent_grads['Low_wage_jobs'].sum()
recent_grads_sum = recent_grads['Total'].sum()

# Low-wage jobs as a percentage of the summed 'Total' column, rounded to two decimals.
low_wage_share = low_wage_jobs_sum / recent_grads_sum * 100
low_wage_percent = str(round(low_wage_share, 2)) + '%'
print(low_wage_percent)
output
9.85%

4.Comparing Data Sets

input
# Every distinct major in recent_grads; each one is expected to appear in all_ages as well.
majors = recent_grads['Major'].unique()
rg_lower_count = 0  # number of majors whose recent-grad unemployment rate is below the all-ages rate

for major in majors:
    # Rows for this major in each table (both are still DataFrames at this point).
    recent_grads_row = recent_grads.loc[recent_grads['Major'] == major]
    all_ages_row = all_ages.loc[all_ages['Major'] == major]

    # Use the first matching row of each table; .iloc[0] raises IndexError when there is no match.
    rg_unemp_rate = recent_grads_row['Unemployment_rate'].iloc[0]
    aa_unemp_rate = all_ages_row['Unemployment_rate'].iloc[0]

    # Adds 1 when the recent-grad rate is strictly lower, otherwise 0.
    rg_lower_count += int(rg_unemp_rate < aa_unemp_rate)

print(rg_lower_count)
output
43

5.iloc的type

input
# Boolean-mask selection yields a DataFrame, even when only one row matches.
is_chem_eng = recent_grads['Major'] == 'CHEMICAL ENGINEERING'
recent_grads_row = recent_grads.loc[is_chem_eng]
print(type(recent_grads_row))
print('---------------------')
# .iloc[0] on that DataFrame picks its first row, which comes back as a Series.
# NOTE: despite its name, rg_unemp_rate holds the whole row here, not just the rate.
rg_unemp_rate = recent_grads_row.iloc[0]
print(type(rg_unemp_rate))
output

<class 'pandas.core.frame.DataFrame'>
---------------------
<class 'pandas.core.series.Series'>

你可能感兴趣的:(#2.1.6 ★Challenge: Summarizing Data.md)