2019-nCoV——人口统计

import pandas as pd
import numpy as np
from datetime import datetime
# Timestamp layout: zero-padded day, abbreviated month name, four-digit year,
# then 24-hour "HH:MM".
time_format = "%d%b%Y %H:%M"
_now = datetime.now()
_now.strftime(time_format)
'02Apr2020 09:13'
import os
# Print the path of every file found under the current working directory,
# walking into sub-directories as well.
walker = os.walk(".")
for dirname, _, filenames in walker:
    for filename in filenames:
        full_path = os.path.join(dirname, filename)
        print(full_path)
.\covid-19-data-with-sir-model.ipynb
.\Thumbs.db
.\Untitled.ipynb
.\人口金字塔.png
.\.ipynb_checkpoints\covid-19-data-with-sir-model-checkpoint.ipynb
.\.ipynb_checkpoints\Untitled-checkpoint.ipynb
.\COVID-19 containment and mitigation measures\COVID 19 Containment measures data.csv
.\covid19_global_forecasting_location\locations_population.csv
.\Novel Corona Virus 2019 Dataset\COVID19_line_list_data.csv
.\Novel Corona Virus 2019 Dataset\COVID19_open_line_list.csv.zip
.\Novel Corona Virus 2019 Dataset\covid_19_data.csv
.\Novel Corona Virus 2019 Dataset\time_series_covid_19_confirmed.csv
.\Novel Corona Virus 2019 Dataset\time_series_covid_19_deaths.csv
.\Novel Corona Virus 2019 Dataset\time_series_covid_19_recovered.csv

Total population

# Load the per-location population table and preview its first rows.
_population_csv = "./covid19_global_forecasting_location/locations_population.csv"
population_raw = pd.read_csv(_population_csv)
population_raw.head()
Province.State Country.Region Population Provenance
0 NaN Afghanistan 35530000 NaN
1 NaN Albania 2877000 NaN
2 NaN Algeria 41320000 NaN
3 NaN Andorra 78000 NaN
4 NaN Antigua and Barbuda 102012 NaN
# Print a summary of the raw table: row count, columns, non-null counts and dtypes.
population_raw.info()

RangeIndex: 284 entries, 0 to 283
Data columns (total 4 columns):
Province.State    130 non-null object
Country.Region    284 non-null object
Population        284 non-null int64
Provenance        8 non-null object
dtypes: int64(1), object(3)
memory usage: 5.6+ KB
# Count the missing values in each column of the raw table.
_missing_mask = population_raw.isnull()
_missing_mask.sum()
Province.State    154
Country.Region      0
Population          0
Provenance        276
dtype: int64
# Same per-column missing-value counts, transposed into a single wide row.
_null_counts = population_raw.isnull().sum()
pd.DataFrame(_null_counts).T
Province.State Country.Region Population Provenance
0 154 0 0 276
# Per-column counts of present (non-missing) values, as a single wide row.
_present_counts = population_raw.notnull().sum()
pd.DataFrame(_present_counts).T
Province.State Country.Region Population Provenance
0 130 284 284 8
# Build `population_df`: one row per (Country, Province) with its population,
# plus country-level totals and two synthetic rows ("Global", "Except China").
# A Province of "-" marks a country-level (or synthetic) row.
df = population_raw.copy()  # work on a copy so the raw table stays untouched
df = df.rename({"Province.State": "Province", "Country.Region": "Country"}, axis=1)  # shorter column names
cols = ["Country", "Province", "Population"]
df = df.loc[:, cols].fillna("-")  # keep only the needed columns and fill missing values with "-"
df.loc[df["Country"] == df["Province"], "Province"] = "-"  # rows such as "Denmark / Denmark" are country-level too

# Add total records
# Sum only the numeric Population column: Province holds strings and is
# overwritten with "-" right after, so it must not take part in the sum.
_total_df = df.loc[df["Province"] != "-", :].groupby("Country")[["Population"]].sum()
# groupby moved Country into the index; reset_index turns it back into a column.
_total_df = _total_df.reset_index().assign(Province="-")
df = pd.concat([df, _total_df], axis=0, sort=True)
# The original rows come first in the concat, so an existing country-level row
# wins over the total computed from that country's provinces.
df = df.drop_duplicates(subset=["Country", "Province"], keep="first")

# Global
global_value = df.loc[df["Province"] == "-", "Population"].sum()
# DataFrame.append was removed in pandas 2.0; concatenate a one-row frame instead.
_global_row = pd.DataFrame(
    {"Country": ["Global"], "Province": ["-"], "Population": [global_value]}
)
df = pd.concat([df, _global_row], ignore_index=True)

# Global except China
china_value = df.loc[(df["Country"] == "China") & (df["Province"] == "-"), "Population"].sum()
# World minus China. The operands used to be swapped, which produced a negative
# population. NOTE: with the correct sign this row now sorts near the top of
# the table instead of at the very bottom.
_except_china_row = pd.DataFrame(
    {"Country": ["Except China"], "Province": ["-"], "Population": [global_value - china_value]}
)
df = pd.concat([df, _except_china_row], ignore_index=True)

# Sorting
df = df.sort_values("Population", ascending=False).reset_index(drop=True)
df = df.loc[:, cols]  # restore the Country / Province / Population column order
population_df = df.copy()
population_df.head()
Country Province Population
0 Global - 7067093478
1 China - 1376807262
2 India - 1339000000
3 US - 327200000
4 Indonesia - 264000000
# Keep only the rows whose Province is "-" and turn them into a
# {Country: Population} lookup dictionary.
_is_country_level = population_df["Province"] == "-"
df = population_df.loc[_is_country_level, :]
population_dict = df.set_index("Country")["Population"].to_dict()
# Age-bracket labels for the population pyramid: five-year brackets from
# "0-4" up to "95-99", followed by a final open-ended "100+" bracket.
_age_bins = ["{}-{}".format(start, start + 4) for start in range(0, 100, 5)]
_age_bins.append("100+")

# One row per age bracket.
_pyramid_df = pd.DataFrame({"Age_bin": _age_bins})
_pyramid_df
Age_bin
0 0-4
1 5-9
2 10-14
3 15-19
4 20-24
5 25-29
6 30-34
7 35-39
8 40-44
9 45-49
10 50-54
11 55-59
12 60-64
13 65-69
14 70-74
15 75-79
16 80-84
17 85-89
18 90-94
19 95-99
20 100+
# Global (WORLD)
# Male and female counts, one entry per row of `_pyramid_df` (21 values each);
# they are added element-wise and stored as the "Global" column.
_name = "Global"
_male = [
    349432556,
    342927576,
    331497486,
    316642222,
    308286775,
    306059387,
    309236984,
    276447037,
    249389688,
    241232876,
    222609691,
    192215395,
    157180267,
    128939392,
    87185982,
    54754941,
    33648953,
    15756942,
    5327866,
    1077791,
    124144
]
_female = [
    328509234,
    321511867,
    309769906,
    295553758,
    289100903,
    288632766,
    296293748,
    268371754,
    244399176,
    238133281,
    223162982,
    195633743,
    164961323,
    140704320,
    101491347,
    69026831,
    48281201,
    26429329,
    11352182,
    3055845,
    449279
]
# Force 64-bit integers. Without an explicit dtype NumPy uses the platform's
# default integer, which can be 32-bit (e.g. on Windows); the per-bracket
# values fit in 32 bits, but totals across brackets are in the billions and
# would not.
_pyramid_df[_name] = np.array(_male, dtype=np.int64) + np.array(_female, dtype=np.int64)
_pyramid_df[_name]
0     677941790
1     664439443
2     641267392
3     612195980
4     597387678
5     594692153
6     605530732
7     544818791
8     493788864
9     479366157
10    445772673
11    387849138
12    322141590
13    269643712
14    188677329
15    123781772
16     81930154
17     42186271
18     16680048
19      4133636
20       573423
Name: Global, dtype: int32

你可能感兴趣的:(机器学习)