参考:https://github.com/jakevdp/data-USstates
import pandas as pd
# Load the three raw CSV tables.
# state-abbrevs.csv: `state` is the full state name, `abbreviation` its short code.
abb = pd.read_csv("./state-abbrevs.csv")
# state-areas.csv: `state` is the full state name, plus the area of each state.
area = pd.read_csv("./state-areas.csv")
# state-population.csv: `state/region` is the short code, with `ages`, `year`
# and `population` columns.
pop = pd.read_csv("./state-population.csv")

# Attach the full state names to the population rows. An outer join is used so
# that rows without a matching key on either side are kept rather than dropped.
abb_pop = abb.merge(
    pop,
    how="outer",
    left_on="abbreviation",
    right_on="state/region",
)
print(abb_pop.head())

# After the join, `abbreviation` carries the same code as `state/region`
# wherever the key matched, so the redundant column is removed.
abb_pop = abb_pop.drop(columns="abbreviation")
print(abb_pop.head())
输出:
state abbreviation state/region ages year population
0 Alabama AL AL under18 2012 1117489.0
1 Alabama AL AL total 2012 4817528.0
2 Alabama AL AL under18 2010 1130966.0
3 Alabama AL AL total 2010 4785570.0
4 Alabama AL AL under18 2011 1125763.0
state state/region ages year population
0 Alabama AL under18 2012 1117489.0
1 Alabama AL total 2012 4817528.0
2 Alabama AL under18 2010 1130966.0
3 Alabama AL total 2010 4785570.0
4 Alabama AL under18 2011 1125763.0
# Report, per column, whether it contains any missing value.
print(abb_pop.isna().any())
# DataFrame.info() writes its summary to stdout itself and returns None, so
# wrapping it in print() also emits a trailing "None" line.
print(abb_pop.info())

# Region codes of the rows whose full state name is missing.
state_null_series = abb_pop.loc[abb_pop["state"].isna(), "state/region"]
# Show the distinct region codes that have no full name.
print(state_null_series.unique())

# Fill in the full name by hand for every row coded "USA".
usa_mask = abb_pop["state/region"] == "USA"
usa_indexes = abb_pop.index[usa_mask]
abb_pop.loc[usa_indexes, "state"] = "United States"
print(abb_pop[usa_mask].head())

# Fill in the full name by hand for every row coded "PR".
pr_mask = abb_pop["state/region"] == "PR"
pr_indexes = abb_pop.index[pr_mask]
abb_pop.loc[pr_indexes, "state"] = "Puerto Rico"
print(abb_pop[pr_mask].head())
输出:
state True
state/region False
ages False
year False
population True
dtype: bool
<class 'pandas.core.frame.DataFrame'>
Int64Index: 2544 entries, 0 to 2543
Data columns (total 5 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 state 2448 non-null object
1 state/region 2544 non-null object
2 ages 2544 non-null object
3 year 2544 non-null int64
4 population 2524 non-null float64
dtypes: float64(1), int64(1), object(3)
memory usage: 119.2+ KB
None
['PR' 'USA']
state state/region ages year population
2496 United States USA under18 1990 64218512.0
2497 United States USA total 1990 249622814.0
2498 United States USA total 1991 252980942.0
2499 United States USA under18 1991 65313018.0
2500 United States USA under18 1992 66509177.0
state state/region ages year population
2448 Puerto Rico PR under18 1990 NaN
2449 Puerto Rico PR total 1990 NaN
2450 Puerto Rico PR total 1991 NaN
2451 Puerto Rico PR under18 1991 NaN
2452 Puerto Rico PR total 1993 NaN
# Merge the per-state area table into the population table on the full state
# name. The outer join keeps rows that have no match on either side.
abb_pop_area = pd.merge(abb_pop, area, on="state", how="outer")

# Drop the rows that have no area value.
abb_pop_area_null_indexes = abb_pop_area[abb_pop_area["area (sq. mi)"].isnull()].index
# These are index *labels*, so they must be looked up with .loc. Using .iloc
# would treat them as positions, which only happens to work while the frame
# still has its default 0..n-1 index.
print(abb_pop_area.loc[abb_pop_area_null_indexes].head(5))
abb_pop_area.drop(labels=abb_pop_area_null_indexes, axis=0, inplace=True)

# Show the all-ages ("total") population figures for 2010.
print(abb_pop_area.query('ages == "total" & year == 2010').head(5))

# Population density = population / area, computed for every remaining row
# (all years and both age groups).
abb_pop_area["density"] = abb_pop_area["population"] / abb_pop_area["area (sq. mi)"]
print(abb_pop_area.head(5))

# Sort by density, highest first, and take the state of the top row.
# sort_values places NaN densities last by default, so they never win.
most_density_state_str = abb_pop_area.sort_values(by="density", axis=0, ascending=False).iloc[0]["state"]
print(most_density_state_str)
输出:
state state/region ages year population area (sq. mi)
2496 United States USA under18 1990 64218512.0 NaN
2497 United States USA total 1990 249622814.0 NaN
2498 United States USA total 1991 252980942.0 NaN
2499 United States USA under18 1991 65313018.0 NaN
2500 United States USA under18 1992 66509177.0 NaN
state state/region ages year population area (sq. mi)
3 Alabama AL total 2010 4785570.0 52423.0
91 Alaska AK total 2010 713868.0 656425.0
101 Arizona AZ total 2010 6408790.0 114006.0
189 Arkansas AR total 2010 2922280.0 53182.0
197 California CA total 2010 37333601.0 163707.0
state state/region ages year population area (sq. mi) density
0 Alabama AL under18 2012 1117489.0 52423.0 21.316769
1 Alabama AL total 2012 4817528.0 52423.0 91.897221
2 Alabama AL under18 2010 1130966.0 52423.0 21.573851
3 Alabama AL total 2010 4785570.0 52423.0 91.287603
4 Alabama AL under18 2011 1125763.0 52423.0 21.474601
District of Columbia