import pandas as pd
import numpy as np

Load the data from the Johns Hopkins COVID-19 dataset to see which countries we need:

df_hopkins = pd.read_csv('data/confirmed.csv')

Load the population data from the UN/World Bank.
We must skip the non-CSV metadata in the first 4 rows:

df_world_bank = pd.read_csv('data/wpp_population.csv', skiprows=4)
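
If you are curious about what gets skipped, you can peek at the raw header lines (a small sketch; the exact content depends on the World Bank export, typically a data source line and a last-updated date):

# print the four metadata lines that skiprows=4 throws away
with open('data/wpp_population.csv') as f:
    for _ in range(4):
        print(f.readline().rstrip())
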
# take a look at the data
df_world_bank.head()
Country Name Country Code Indicator Name Indicator Code 1960 1961 1962 1963 1964 1965 ... 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019
0 Aruba ABW Population, total SP.POP.TOTL 54211.0 55438.0 56225.0 56695.0 57032.0 57360.0 ... 101669.0 102046.0 102560.0 103159.0 103774.0 104341.0 104872.0 105366.0 105845.0 NaN
1 Afghanistan AFG Population, total SP.POP.TOTL 8996973.0 9169410.0 9351441.0 9543205.0 9744781.0 9956320.0 ... 29185507.0 30117413.0 31161376.0 32269589.0 33370794.0 34413603.0 35383128.0 36296400.0 37172386.0 NaN
2 Angola AGO Population, total SP.POP.TOTL 5454933.0 5531472.0 5608539.0 5679458.0 5735044.0 5770570.0 ... 23356246.0 24220661.0 25107931.0 26015780.0 26941779.0 27884381.0 28842484.0 29816748.0 30809762.0 NaN
3 Albania ALB Population, total SP.POP.TOTL 1608800.0 1659800.0 1711319.0 1762621.0 1814135.0 1864791.0 ... 2913021.0 2905195.0 2900401.0 2895092.0 2889104.0 2880703.0 2876101.0 2873457.0 2866376.0 NaN
4 Andorra AND Population, total SP.POP.TOTL 13411.0 14375.0 15370.0 16412.0 17469.0 18549.0 ... 84449.0 83747.0 82427.0 80774.0 79213.0 78011.0 77297.0 77001.0 77006.0 NaN

5 rows × 64 columns

# the columns which contain the population data (one per year)
df_world_bank_pop_columns = df_world_bank.columns[4:]
df_world_bank_pop_columns
Index(['1960', '1961', '1962', '1963', '1964', '1965', '1966', '1967', '1968',
       '1969', '1970', '1971', '1972', '1973', '1974', '1975', '1976', '1977',
       '1978', '1979', '1980', '1981', '1982', '1983', '1984', '1985', '1986',
       '1987', '1988', '1989', '1990', '1991', '1992', '1993', '1994', '1995',
       '1996', '1997', '1998', '1999', '2000', '2001', '2002', '2003', '2004',
       '2005', '2006', '2007', '2008', '2009', '2010', '2011', '2012', '2013',
       '2014', '2015', '2016', '2017', '2018', '2019'],
      dtype='object')
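
Note that the year columns are strings, not integers, and judging from the head() output above the most recent year ('2019') may be entirely empty in this export. A quick check (just a sketch; other downloads may differ):

# True if the '2019' column contains no values at all in this export
df_world_bank['2019'].isna().all()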

We are interested in unique countries only:

hk_countries = set(df_hopkins['Country/Region'].unique())
wb_countries = set(df_world_bank['Country Name'].unique())

With Python sets we can subtract one from the other to find the countries from the Hopkins data that are missing in the World Bank data:

hk_countries - wb_countries
{'Bahamas',
 'Brunei',
 'Burma',
 'Congo (Brazzaville)',
 'Congo (Kinshasa)',
 'Czechia',
 'Diamond Princess',
 'Egypt',
 'Gambia',
 'Holy See',
 'Iran',
 'Korea, South',
 'Kyrgyzstan',
 'Laos',
 'MS Zaandam',
 'Russia',
 'Saint Kitts and Nevis',
 'Saint Lucia',
 'Saint Vincent and the Grenadines',
 'Slovakia',
 'Syria',
 'Taiwan*',
 'US',
 'Venezuela',
 'Western Sahara',
 'Yemen'}
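
The reverse difference is worth a quick look as well: besides differently spelled country names it contains the World Bank's aggregate rows (such as 'OECD members', which we will meet again in df_pop.head() below). A sketch:

# World Bank entries with no counterpart in the Hopkins data:
# renamed countries plus aggregates like regions and income groups
sorted(wb_countries - hk_countries)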

Get the missing figures from Wikipedia for the entries which are not in the World Bank dataframe (not recognized as countries):

pop_other_sources = { 'Diamond Princess' : 3600, 
                        'Holy See' : 825, 
                        'Taiwan*' : 23_780_000 ,
                        'Western Sahara' : 595_060,
                        'MS Zaandam' :  (1243 + 586) }
pop_other_sources
{'Diamond Princess': 3600,
 'Holy See': 825,
 'Taiwan*': 23780000,
 'Western Sahara': 595060,
 'MS Zaandam': 1829}

For the remaining countries we build a dict that maps the Hopkins names to the World Bank names:

hopkins_2_worldbank = { 'Bahamas': 'Bahamas, The', 
                 'Brunei' :  'Brunei Darussalam',  
                 'Burma' : 'Myanmar' ,  
                 'Congo (Brazzaville)': 'Congo, Rep.',       # Republic of the Congo (capital Brazzaville)
                 'Congo (Kinshasa)': 'Congo, Dem. Rep.',     # Democratic Republic of the Congo (capital Kinshasa)
                 'Czechia' :  'Czech Republic', 
                 'Egypt' : 'Egypt, Arab Rep.',
                 'Gambia' :  'Gambia, The',
                 'Iran' : 'Iran, Islamic Rep.',
                 'Korea, South' :'Korea, Rep.',
                 'Kyrgyzstan' : 'Kyrgyz Republic',
                 'Laos' : 'Lao PDR',
                 'Russia' : 'Russian Federation',
                 'Saint Kitts and Nevis' : 'St. Kitts and Nevis',
                 'Saint Lucia' : 'St. Lucia',
                 'Saint Vincent and the Grenadines' :  'St. Vincent and the Grenadines',
                 'Slovakia': 'Slovak Republic',
                 'Syria' :  'Syrian Arab Republic',
                 'US' : 'United States',
                 'Venezuela' :  'Venezuela, RB',
                 'Yemen' : 'Yemen, Rep.'
                } 
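
As a quick sanity check (a minimal sketch) we can verify that every mapped name really occurs in the World Bank data and that the mapping plus pop_other_sources covers the whole difference computed above:

# every target name must actually appear in the World Bank data
assert set(hopkins_2_worldbank.values()) <= wb_countries
# the mapping and the manual figures together cover all missing entries
assert set(hopkins_2_worldbank) | set(pop_other_sources) == hk_countries - wb_countries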

And create the inverse mapping with a dict comprehension:

worldbank_2_hopkins = {v: k for k, v in hopkins_2_worldbank.items()}
# create a new stripped-down dataframe which only contains the country name and the population
df_pop = pd.DataFrame(columns=['Country/Region', 'Population'])

Now we iterate over all countries in the UN/World Bank data and replace the names with the ones from the Johns Hopkins dataset:

for c in wb_countries:
    # select the year columns for this country and drop the years without data
    pop_list = df_world_bank[df_world_bank['Country Name'] == c][df_world_bank_pop_columns].dropna(axis=1).values
    # pop_list is a 2D array with a single row; pop_list[-1] is that inner row
    if len(pop_list[-1]) > 0:
        # translate the World Bank name to the Hopkins name where necessary
        if c in worldbank_2_hopkins:
            c = worldbank_2_hopkins[c]
        df_pop.loc[c, 'Country/Region'] = c
        # the last remaining column holds the most recent available figure
        df_pop.loc[c, 'Population'] = int(pop_list[-1][-1])
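
As an aside, the same lookup could be done without an explicit loop. A vectorized sketch (an alternative, not what is used above) forward-fills along the year columns and takes the last one:

# forward-fill across the years so the last column holds the most recent
# available figure, then translate to the Hopkins country names
latest_pop = (df_world_bank.set_index('Country Name')[df_world_bank_pop_columns]
              .ffill(axis=1)
              .iloc[:, -1]      # most recent non-NaN value per country
              .dropna()         # drop rows without any population data
              .rename(index=worldbank_2_hopkins))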

Finally, we add the figures for the cruise ships and the other entries not covered by the World Bank data:

for k, v in pop_other_sources.items():
    df_pop.loc[k,'Country/Region'] = k
    df_pop.loc[k,'Population'] = v
    
df_pop.head()
Country/Region Population
Curacao Curacao 159800
Algeria Algeria 42228429
OECD members OECD members 1303529456
Denmark Denmark 5793636
Liechtenstein Liechtenstein 37910
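
Note that df_pop also contains the World Bank aggregates such as 'OECD members', because the loop above keeps every row of the World Bank data. If only the entries that appear as Country/Region in the Hopkins data are wanted, they could be filtered out like this (optional, a sketch):

# keep only the entries that actually occur in the Johns Hopkins data
df_pop_hopkins = df_pop[df_pop['Country/Region'].isin(hk_countries)]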

Write it to a CSV file and read it back to check that the data types are fine:

df_pop.to_csv('data/population.csv')
df = pd.read_csv('data/population.csv', index_col=False)
df.dtypes
Unnamed: 0        object
Country/Region    object
Population         int64
dtype: object
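
The Unnamed: 0 column is simply the DataFrame index that to_csv wrote out; since it duplicates Country/Region (see df_pop.head() above), the file could also be written without it (a sketch):

# the index duplicates 'Country/Region', so it can be dropped on write
df_pop.to_csv('data/population.csv', index=False)
pd.read_csv('data/population.csv').dtypes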