import pandas as pd
import numpy as np
from datetime import datetime
# strftime pattern: day of month, abbreviated month name, 4-digit year, then 24-hour HH:MM.
time_format = "%d%b%Y %H:%M"
# Format the current time with that pattern; the resulting string is the cell's displayed value.
datetime.now().strftime(time_format)
'02Apr2020 09:13'
import os
# Print the path of every file under the working directory, descending into
# sub-directories, so the available datasets can be seen at a glance.
for dirname, _, filenames in os.walk("."):
    for filename in filenames:
        print(os.path.join(dirname, filename))
.\covid-19-data-with-sir-model.ipynb
.\Thumbs.db
.\Untitled.ipynb
.\人口金字塔.png
.\.ipynb_checkpoints\covid-19-data-with-sir-model-checkpoint.ipynb
.\.ipynb_checkpoints\Untitled-checkpoint.ipynb
.\COVID-19 containment and mitigation measures\COVID 19 Containment measures data.csv
.\covid19_global_forecasting_location\locations_population.csv
.\Novel Corona Virus 2019 Dataset\COVID19_line_list_data.csv
.\Novel Corona Virus 2019 Dataset\COVID19_open_line_list.csv.zip
.\Novel Corona Virus 2019 Dataset\covid_19_data.csv
.\Novel Corona Virus 2019 Dataset\time_series_covid_19_confirmed.csv
.\Novel Corona Virus 2019 Dataset\time_series_covid_19_deaths.csv
.\Novel Corona Virus 2019 Dataset\time_series_covid_19_recovered.csv
Total population
# Load the per-location population table; the path is relative to the working directory.
population_raw = pd.read_csv("./covid19_global_forecasting_location/locations_population.csv")
# Preview the first rows.
population_raw.head()
|
Province.State |
Country.Region |
Population |
Provenance |
0 |
NaN |
Afghanistan |
35530000 |
NaN |
1 |
NaN |
Albania |
2877000 |
NaN |
2 |
NaN |
Algeria |
41320000 |
NaN |
3 |
NaN |
Andorra |
78000 |
NaN |
4 |
NaN |
Antigua and Barbuda |
102012 |
NaN |
# Print the index range, per-column non-null counts and dtypes, and memory usage.
population_raw.info()
RangeIndex: 284 entries, 0 to 283
Data columns (total 4 columns):
Province.State 130 non-null object
Country.Region 284 non-null object
Population 284 non-null int64
Provenance 8 non-null object
dtypes: int64(1), object(3)
memory usage: 5.6+ KB
# Count the missing values in each column.
population_raw.isnull().sum()
Province.State 154
Country.Region 0
Population 0
Provenance 276
dtype: int64
# Missing-value count per column, laid out as a single-row table.
population_raw.isna().sum().to_frame().T
|
Province.State |
Country.Region |
Population |
Provenance |
0 |
154 |
0 |
0 |
276 |
# Non-missing value count per column, laid out as a single-row table.
population_raw.notna().sum().to_frame().T
|
Province.State |
Country.Region |
Population |
Provenance |
0 |
130 |
284 |
284 |
8 |
# Build a tidy population table with one row per (Country, Province) pair.
# A Province of "-" marks a country-level or aggregate total.
df = population_raw.copy()
df = df.rename({"Province.State": "Province", "Country.Region": "Country"}, axis=1)
cols = ["Country", "Province", "Population"]
df = df.loc[:, cols].fillna("-")
# A province named after its own country is treated as the country-level record.
df.loc[df["Country"] == df["Province"], "Province"] = "-"
# Derive country totals from the province rows. Only Population is summed, so
# the string Province column is never aggregated.
_total_df = df.loc[df["Province"] != "-", :].groupby("Country")["Population"].sum()
_total_df = _total_df.reset_index().assign(Province="-")
df = pd.concat([df, _total_df], axis=0, sort=True)
# keep="first": a country total already present in the raw data takes
# precedence over the derived one appended after it.
df = df.drop_duplicates(subset=["Country", "Province"], keep="first")
global_value = df.loc[df["Province"] == "-", "Population"].sum()
china_value = df.loc[(df["Country"] == "China") & (df["Province"] == "-"), "Population"].sum()
# Aggregate rows are added with pd.concat because DataFrame.append no longer
# exists in pandas 2.0+. "Except China" is the world total minus China; the
# subtraction must be in this order for the population to be positive.
_aggregate_df = pd.DataFrame(
    [
        ["Global", "-", global_value],
        ["Except China", "-", global_value - china_value],
    ],
    columns=cols,
)
df = pd.concat([df, _aggregate_df], axis=0, ignore_index=True, sort=True)
df = df.sort_values("Population", ascending=False).reset_index(drop=True)
# concat(sort=True) orders columns alphabetically; restore the intended order.
df = df.loc[:, cols]
population_df = df.copy()
population_df.head()
|
Country |
Province |
Population |
0 |
Global |
- |
7067093478 |
1 |
China |
- |
1376807262 |
2 |
India |
- |
1339000000 |
3 |
US |
- |
327200000 |
4 |
Indonesia |
- |
264000000 |
# Keep only the rows whose Province is "-" (country-level and aggregate totals).
df = population_df[population_df["Province"] == "-"]
# Lookup table: country name -> total population.
population_dict = df.set_index("Country")["Population"].to_dict()
# Five-year age brackets "0-4" through "95-99", then an open-ended "100+" bracket.
_age_bins = ["{}-{}".format(start, start + 4) for start in range(0, 100, 5)]
_age_bins.append("100+")
# Frame holding the bracket labels in its "Age_bin" column.
_pyramid_df = pd.DataFrame({"Age_bin": _age_bins})
_pyramid_df
|
Age_bin |
0 |
0-4 |
1 |
5-9 |
2 |
10-14 |
3 |
15-19 |
4 |
20-24 |
5 |
25-29 |
6 |
30-34 |
7 |
35-39 |
8 |
40-44 |
9 |
45-49 |
10 |
50-54 |
11 |
55-59 |
12 |
60-64 |
13 |
65-69 |
14 |
70-74 |
15 |
75-79 |
16 |
80-84 |
17 |
85-89 |
18 |
90-94 |
19 |
95-99 |
20 |
100+ |
# Head counts for the "Global" column of _pyramid_df, given separately for males
# and females. Each list has 21 entries, presumably one per Age_bin row in the
# same order (verify against the data source).
_name = "Global"
_male = [
    349432556, 342927576, 331497486, 316642222, 308286775, 306059387,
    309236984, 276447037, 249389688, 241232876, 222609691, 192215395,
    157180267, 128939392, 87185982, 54754941, 33648953, 15756942,
    5327866, 1077791, 124144,
]
_female = [
    328509234, 321511867, 309769906, 295553758, 289100903, 288632766,
    296293748, 268371754, 244399176, 238133281, 223162982, 195633743,
    164961323, 140704320, 101491347, 69026831, 48281201, 26429329,
    11352182, 3055845, 449279,
]
# Force 64-bit integers: NumPy's default integer type is platform-dependent and
# can be 32-bit, which leaves little headroom once these counts are combined.
_pyramid_df[_name] = np.array(_male, dtype=np.int64) + np.array(_female, dtype=np.int64)
_pyramid_df[_name]
0 677941790
1 664439443
2 641267392
3 612195980
4 597387678
5 594692153
6 605530732
7 544818791
8 493788864
9 479366157
10 445772673
11 387849138
12 322141590
13 269643712
14 188677329
15 123781772
16 81930154
17 42186271
18 16680048
19 4133636
20 573423
Name: Global, dtype: int32